# Data preparation

In [None]:
import pandas as pd
import os
import helpers
import numpy as np
from settings import data_folder,preprocessed_folder


From the RegBL, load the link between each building ID (EGID) and the number of its municipality (BFS number).

In [None]:
# Read the tab-separated building table, keeping three columns that are all
# stored as categoricals.
# NOTE(review): the columns are picked by position (0, 2, 3) — presumably they are
# EGID, GGDENR and GGDENAME as named in `dtype`; confirm against the file header.
bfs_number = pd.read_csv(
    os.path.join(data_folder, "ch", "gebaeude_batiment_edificio.csv"),
    usecols=[0, 2, 3],
    dtype=dict.fromkeys(["EGID", "GGDENR", "GGDENAME"], "category"),
    sep="\t",
)


Also load the mapping between postcode and BFS number. The mapping is not one-to-one, since a postcode can span several municipalities; for each postcode we keep the municipality that is most represented.

In [None]:
# Read the "PLZ4" sheet and align its column names with the ones used for the
# RegBL building table (PLZ, GGDENR, GGDENAME).
# NOTE(review): "KTKZ" is renamed to "GGDENR" while "GDENR" is dropped further
# down — confirm that "KTKZ" really holds the municipality (BFS) number here.
map_plz_bfs = pd.read_excel(
    os.path.join(data_folder, "mapping_plz_bfs.xlsx"), sheet_name="PLZ4"
)
map_plz_bfs = map_plz_bfs.rename(
    columns={"PLZ4": "PLZ", "KTKZ": "GGDENR", "GDENAMK": "GGDENAME"}
)
# GGDENR is cast to str first and only then to category.
map_plz_bfs = map_plz_bfs.astype(
    {"PLZ": "category", "GGDENR": str, "GGDENAME": "category"}
).astype({"GGDENR": "category"})
# Find the most representative municipality: for every postcode keep the row
# with the largest "%_IN_GDE", then discard the helper columns.
map_plz_bfs = map_plz_bfs.loc[
    map_plz_bfs.groupby("PLZ")["%_IN_GDE"].idxmax()
].drop(columns=["%_IN_GDE", "GDENR"])
map_plz_bfs


Load the SFOE building renovation database

In [None]:
# Load the database and consider only payments that are labelled as "FIX";
# the "Status" column is dropped once the filter has been applied.
db = helpers.load_database_init()
db = db.loc[db["Status"] == "FIX"].drop(columns=["Status"])
# These EGID values don't match any number in the RegBL, so they are turned
# into missing values.
to_replace = {
    "EGID": dict.fromkeys(
        ["999", "99999", "9999999", "99999999", "999999999", "-99", "1"],
        np.nan,
    )
}
db = db.replace(to_replace)


In [None]:
# First use the RegBL to obtain the BFS number. The join is a left join on
# EGID, so every database row is kept; rows without a match in `bfs_number`
# get missing GGDENR / GGDENAME.
tmp = db.merge(bfs_number, how="left", on="EGID")


Check which building EGIDs are not present in the RegBL and do not contain ";" (an EGID containing ";" corresponds to several buildings separated by ";").

In [None]:
# Report the number of database rows whose EGID is missing.
print(
    "How many buildings do not have any EGID in the database ?",
    len(db.loc[db["EGID"].isna()]),
)
# Uncomment to list the rows whose EGID is set, is absent from the RegBL
# extract and does not contain ";":
# db_no_missing = db[~db.EGID.isna()]
# db_no_missing[
#     (~db_no_missing.EGID.isin(bfs_number.EGID))
#     & (~db_no_missing.EGID.str.contains(";"))
# ]


For rows whose BFS number is still missing, fill it in using the mapping between the postal code and the BFS number.

In [None]:
# Rows that still have no municipality number after the RegBL join.
mask = tmp["GGDENR"].isna()
# Look the municipality up from the postcode instead. The empty GGDENR /
# GGDENAME columns are removed first so the ones from the mapping take their
# place. The join is an inner join: rows whose PLZ is not in the mapping are
# discarded.
add_postal_code = (
    tmp.loc[mask]
    .drop(columns=["GGDENR", "GGDENAME"])
    .merge(map_plz_bfs, on="PLZ", how="inner")
)
# Put the RegBL-matched rows and the postcode-matched rows back together.
tmp = pd.concat([tmp.loc[~mask], add_postal_code])


Adding the typology of the municipality and whether the municipality is considered to be alpine or not.

In [None]:
# Map BFS number (GGDENR) to urban/rural/midland/alpine regions.
# Both tables are read through `helpers.read_xlsx_from_atlas`; the French labels
# of the classification column are replaced by short English ones.
# NOTE(review): the `nrows` values are hard-coded — presumably the number of
# data rows in each export; confirm whenever the source files are refreshed.

# --- Alpine / non-alpine classification ---
path = os.path.join(data_folder, "alpin.xlsx")
renaming = {
    "Alpine": {
        "Communes hors des régions de montagne": "No",
        "Communes des régions de montagne": "Yes",
    }
}
alpine = helpers.read_xlsx_from_atlas(path, nrows=2212)
alpine = alpine.rename(columns={"Classification des communes": "Alpine"})
alpine = alpine.replace(renaming)

# --- Urban / intermediate / rural typology ---
path = os.path.join(data_folder, "urbain.xlsx")
renaming = {
    "Typology": {
        "Rural (3)": "Rural",
        "Intermédiaire (2)": "Intermediate",
        "Urbain (1)": "Urban",
    }
}
typology = helpers.read_xlsx_from_atlas(path, nrows=2255)
typology = typology.rename(columns={"Catégories": "Typology"})
typology = typology.replace(renaming)


But first we need to handle the fusion of the municipalities....

If an alpine and a non-alpine municipality are merged, the resulting municipality is considered to be alpine.

In [None]:
def merging_alpin(to_, from_):
    """Return the alpine flag of a municipality resulting from a merger.

    A merged municipality is alpine as soon as one of the two municipalities
    involved is alpine. The previous implementation returned "No"
    unconditionally, so an alpine municipality merged with a non-alpine one
    was wrongly recorded as non-alpine.

    Parameters
    ----------
    to_, from_ : mapping-like
        Records of the two municipalities being merged; only their "Alpine"
        entry ("Yes" / "No") is read.

    Returns
    -------
    str
        "Yes" if either municipality is alpine, otherwise "No".
    """
    if from_["Alpine"] == "Yes" or to_["Alpine"] == "Yes":
        return "Yes"
    return "No"

# Aggregate the alpine classification using the new municipalities, passing
# `merging_alpin` as the rule for combining merged municipalities, then drop
# the name column and save the result to the preprocessed folder.
alpine = helpers.mapping_com(alpine, merging_alpin)
alpine = alpine.drop(columns=["Regionsname"])
alpine.to_csv(os.path.join(preprocessed_folder, "alpin_fusion_com.csv"))


Merging the typologies. If an urban and an intermediate municipality are merged, the resulting municipality is considered to be urban. Similarly, if a rural and an intermediate municipality are merged, the resulting municipality is considered to be intermediate.

In [None]:
def merging_typology(to_, from_):
    """Return the typology of a municipality resulting from a merger.

    The most urban of the two typologies wins: "Urban" takes precedence over
    "Intermediate", which takes precedence over "Rural". Two rural
    municipalities stay "Rural" (previously this case raised ``ValueError``).

    Parameters
    ----------
    to_, from_ : mapping-like
        Records of the two municipalities being merged; only their "Typology"
        entry is read.

    Returns
    -------
    str
        "Urban", "Intermediate" or "Rural".

    Raises
    ------
    ValueError
        If the pair of typologies is not covered by the rules above (for
        example a missing or unknown label); the offending pair is attached
        to the exception.
    """
    old_typ = from_["Typology"]
    new_typ = to_["Typology"]
    if old_typ == "Urban" or new_typ == "Urban":
        return "Urban"
    if old_typ == "Intermediate" or new_typ == "Intermediate":
        return "Intermediate"
    if old_typ == "Rural" and new_typ == "Rural":
        return "Rural"
    # Anything else is an unexpected label: fail loudly rather than guess.
    raise ValueError((old_typ, new_typ))


# Apply the municipality mergers to the typology table, passing
# `merging_typology` as the rule for combining merged municipalities, then
# drop the name column and save the result to the preprocessed folder.
typology = helpers.mapping_com(typology, merging_typology)
typology = typology.drop(columns=["Regionsname"])
typology.to_csv(os.path.join(preprocessed_folder, "typology_fusion_com.csv"))


In [None]:
# Join the typology and alpine tables on their index, then turn the index back
# into a regular column and store "Regions-ID" as a categorical.
class_terrain = (
    typology.merge(alpine, left_index=True, right_index=True)
    .reset_index()
    .astype({"Regions-ID": "category"})
)


In [None]:
# Manually tracked municipality mergers: the CSV acts as a dictionary from the
# former GDE code ("From") to the code of the newly merged municipality ("To").
mapping_commune = pd.read_csv(os.path.join(data_folder, "mapping_commune.csv"))
mapping_commune = (
    mapping_commune.astype(dict.fromkeys(["From", "To"], "category"))
    .set_index("From")
    .to_dict()["To"]
)
# Bring the municipality numbers up to date: cast to int, replace the outdated
# codes by the current ones and store the result as a categorical again.
tmp["GGDENR"] = (
    tmp["GGDENR"].astype("int").replace(mapping_commune).astype("category")
)
# Attach the typology / alpine class of each municipality. No `how` is given,
# so pandas' default (inner) join applies and rows without a matching
# "Regions-ID" are not kept.
db_with_terrain_class = tmp.merge(
    class_terrain, left_on="GGDENR", right_on="Regions-ID"
)


In [None]:
# Save the enriched database to the preprocessed folder; the row index is not
# written.
db_with_terrain_class.to_csv(
    os.path.join(preprocessed_folder, "db_with_terrain_class.csv"),
    index=False,
)


Updating the socio-economic features to take into account fusion of municipalities

In [None]:
# Load the socio-economic table, remove the "CH" row, cast the municipality
# number to int and replace outdated numbers with the newest ones.
socio_features = (
    pd.read_csv(os.path.join(data_folder, "combined.csv"))
    .loc[lambda frame: frame["BFS_NUMMER"] != "CH"]
    .astype({"BFS_NUMMER": int})
    .replace({"BFS_NUMMER": mapping_commune})
)


In [None]:
# Merge by BFS_NUMMER: rows sharing a municipality number are combined by
# `helpers.combined_rows_db`.
socio_features_commune_updated = helpers.combined_rows_db(
    socio_features, col="BFS_NUMMER"
)
# Remove the columns that are not kept in the exported table.
socio_features_commune_updated = socio_features_commune_updated.drop(
    columns=[
        "Revenu_nb_contribuable",
        "Revenu_nb_habitant",
        "hab_old",
        "surf_hab_old",
        "surf_agr_old",
    ]
)
socio_features_commune_updated


In [None]:
# Save the updated socio-economic table to the preprocessed folder; the row
# index is not written.
socio_features_commune_updated.to_csv(
    os.path.join(preprocessed_folder, "socio_economic.csv"),
    index=False,
)


## Preparation RegBL

In [None]:
# Code translation table, used further down to rename the codes of each column.
name = helpers.get_code_translation_regbl()
regbl = helpers.prepare_regbl()
# Remove destroyed buildings: keep the rows without a destruction year, after
# which the "Annee_destr" column is no longer needed.
regbl = regbl.loc[regbl["Annee_destr"].isna()].drop(columns=["Annee_destr"])


Rename codes in each column

In [None]:
dtype = {"EGID": int, "WSTWK": "category", "WSTAT": "category"}
# Load the tab-separated dwelling table, restricted to the columns used below.
hab = pd.read_csv(
    os.path.join(data_folder, "ch", "wohnung_logement_abitazione.csv"),
    usecols=["EGID", "WSTWK", "WMEHRG", "WSTAT", "WAREA", "WAZIM", "WKCHE"],
    dtype=dtype,
    sep="\t",
)
# Replace codes: for each column that has an entry at level 1 of the
# translation table, map its values through the CODTXTKF text.
for x in hab.columns:
    try:
        rename_codes = name.xs(x, level=1, drop_level=True).CODTXTKF.to_dict()
    except KeyError:
        # No translation available for this column: leave it untouched.
        pass
    else:
        hab = hab.replace({x: rename_codes})
# Keep only the rows whose (translated) status is "existant".
hab = hab.loc[hab.WSTAT == "existant"].copy()


In [None]:
# Get surface and number of rooms per EGID: WAREA and WAZIM are summed over
# the dwellings attached to each building.
nb_rooms = (
    regbl.merge(hab, how="left", on="EGID")
    .groupby("EGID")
    .agg(dict.fromkeys(["WAREA", "WAZIM"], "sum"))
    .reset_index()
)
regbl = regbl.merge(nb_rooms, how="left", on="EGID")
# A total of exactly 0 is stored as a missing value instead.
regbl.loc[regbl["WAREA"].eq(0), "WAREA"] = np.nan
regbl.loc[regbl["WAZIM"].eq(0), "WAZIM"] = np.nan
# Save file (the file name is spelled "rebgl.pickle"; it is kept unchanged so
# that existing readers of the file keep working).
regbl.reset_index(drop=True).to_pickle(
    os.path.join(preprocessed_folder, "rebgl.pickle")
)
