In [None]:
import pickle
from difflib import SequenceMatcher
from math import sqrt
from pathlib import Path
from random import random, gauss, uniform

import contextily as ctx
import geopandas as gpd
import maskmypy
import numpy as np
import pandarallel
import pandas as pd
import recordlinkage as rl
from geopandas import GeoDataFrame, sjoin
from maskmypy import Donut
from maskmypy import Donut_MaxK
from maskmypy import Street
from recordlinkage.preprocessing import clean, phonetic
from shapely.affinity import translate
from shapely.geometry import MultiPolygon, Point, Polygon
# Root folder for all input and output data, relative to the notebook's working directory.
data_folder = Path('../Data/')

In [None]:
# Toy table used to illustrate the merge logic further down.
# The columns are built directly (one list per column) instead of going through a
# mixed-type numpy array, which would coerce every value - IDs and coordinates
# included - to a string. Columns keep the positional labels 0..4.
sim_df = pd.DataFrame({
    0: [523, 523, 523, 982, 982, 982],
    1: ['LAMAL', 'LAMAL', 'LAMAL', 'LCA', 'LCA', 'LCA'],
    2: [1, 2, 3, 1, 2, 3],
    3: [46.1, 45.1, 44.1, 46.11, 45.11, 44.11],
    4: [6.1, 6.2, 6.3, 6.11, 6.12, 6.13],
})

In [None]:
# Name the five positional columns of the toy table.
# NOTE(review): the 46.x values end up labelled 'lon' and the 6.x values 'lat';
# for Swiss WGS84 coordinates that looks swapped - confirm.
sim_df.columns = ['ID','category','address_id','lon','lat']

In [None]:
# Split the toy table by source. Explicit copies are taken because a new column
# is assigned to each subset afterwards; assigning to a bare slice triggers
# pandas' SettingWithCopyWarning.
sim_df_LCA = sim_df[sim_df.category == 'LCA'].copy()
sim_df_LAMAL = sim_df[sim_df.category == 'LAMAL'].copy()

In [None]:
# Give both toy subsets the same constant key so that they can be merged on it.
sim_df_LCA['matching_ID'] = '523-982'
sim_df_LAMAL['matching_ID'] = '523-982'

In [None]:
# Merge on the constant key only: every LAMAL row is paired with every LCA row.
pd.merge(sim_df_LAMAL,sim_df_LCA, on = 'matching_ID')

In [None]:
# Adding address_id to the key keeps only the pairs that share the same address_id.
pd.merge(sim_df_LAMAL,sim_df_LCA, on = ['matching_ID','address_id'])

In [None]:
def make_gdf(df,crs,x,y):
    """Build a point GeoDataFrame from two coordinate columns of ``df``.

    Parameters
    ----------
    df : DataFrame holding the coordinate columns.
    crs : EPSG code (number or string); it is formatted into ``'epsg:<crs>'``.
    x, y : names of the columns passed as first and second point coordinate.

    Returns
    -------
    GeoDataFrame wrapping ``df`` with one Point per row as geometry.
    """
    points = []
    for x_value, y_value in zip(df[x], df[y]):
        points.append(Point((x_value, y_value)))
    epsg_string = 'epsg:{}'.format(crs)
    return gpd.GeoDataFrame(df, crs=epsg_string, geometry=points)

In [None]:
# Country outline, reprojected to EPSG:2056.
# The file name must not start with '/': an absolute path segment makes pathlib
# discard everything to its left, so the file would be looked up at the
# filesystem root instead of inside data_folder/'raw'.
country_geo = gpd.read_file(data_folder/'raw'/"g2l15.shp").to_crs(2056)

In [None]:
# Load the commune boundaries and reproject them to EPSG:2056.
communes = (
    gpd.read_file(data_folder/'raw/swissBOUNDARIES3D_1_3_TLM_HOHEITSGEBIET.shp')
    .to_crs(2056)
)
# Drop rows without geometry and make sure the geometry column is called 'geometry'.
communes = communes[communes.geometry.notnull()]
communes = communes.rename(columns={'geom': 'geometry'})
# Remove the three lake entries, which are listed alongside the communes.
communes = communes[~communes.NAME.isin(['Lac Léman (VD)', 'Lac de Neuchâtel (VD)', 'Lac de Morat (VD)'])]
communes = communes.reset_index(drop=True)
# Rebuild the GeoDataFrame so geometry column and CRS are set explicitly.
communes = gpd.GeoDataFrame(communes, crs = 2056,geometry=communes['geometry'])

In [None]:
def convert_3D_2D(geometry):
    '''
    Strip the Z coordinate from a GeoSeries of Multi/Polygons.

    Parameters
    ----------
    geometry : iterable of shapely geometries (e.g. a GeoSeries)

    Returns
    -------
    list
        Exactly one entry per input element, in input order, so the result can
        be assigned back as a column. 3D Polygons / MultiPolygons are rebuilt
        from their (x, y) coordinates, interior rings (holes) included.
        Missing geometries, geometries without Z and other geometry types are
        passed through unchanged.
    '''
    def _flatten_polygon(poly):
        # Rebuild the shell and every hole from the first two ordinates only.
        shell = [xy[:2] for xy in poly.exterior.coords]
        holes = [[xy[:2] for xy in ring.coords] for ring in poly.interiors]
        return Polygon(shell, holes)

    new_geo = []
    for p in geometry:
        if p is None or not p.has_z:
            # Keep the element so the output stays aligned with the input.
            new_geo.append(p)
        elif p.geom_type == 'Polygon':
            new_geo.append(_flatten_polygon(p))
        elif p.geom_type == 'MultiPolygon':
            # Iterate the parts through .geoms; iterating the multipolygon
            # itself is not supported by Shapely 2.
            new_geo.append(MultiPolygon([_flatten_polygon(ap) for ap in p.geoms]))
        else:
            new_geo.append(p)
    return new_geo

# Replace the commune geometries with the output of convert_3D_2D.
communes['geometry'] = convert_3D_2D(communes['geometry'])

# Re-declare the CRS after the geometry column was replaced by a plain list.
communes.crs = 2056

In [None]:
# Load the address table and turn it into a point GeoDataFrame (EPSG:2056).
regbl_address = pd.read_feather(data_folder/'Clean_data/regbl_address.feather')
# The two coordinate columns are cast to float before points are built from them.
regbl_address[['gkode','gkodn']] = regbl_address[['gkode','gkodn']].astype(float)
regbl_address = make_gdf(regbl_address,'2056','gkode','gkodn')
# Drop the last three characters of every address string.
# NOTE(review): what those three characters contain is not visible here - confirm.
regbl_address['address'] = regbl_address['address'].str[:-3]

In [None]:
# First 100000 addresses after sorting by gdekt, gdename and strname.
sample_address = (
    regbl_address
    .sort_values(['gdekt','gdename','strname'])
    .head(100000)
    .reset_index(drop = True)
)

In [None]:
# STATPOP table (semicolon separated).
statpop = pd.read_csv(data_folder/'OFS/ag-b-00.03-vz2020statpop/STATPOP2020.csv',sep = ';')
# Independent copy, taken before any geometry is attached; it becomes the polygon layer.
statpop_ha = statpop.copy()
# One point per row at (E_KOORD, N_KOORD).
geometry = [Point(e_coord, n_coord) for e_coord, n_coord in zip(statpop['E_KOORD'], statpop['N_KOORD'])]
statpop_point = gpd.GeoDataFrame(statpop, crs=2056, geometry=geometry)

In [None]:
# One 100 x 100 square per row (coordinate units of EPSG:2056), with
# (E_KOORD, N_KOORD) as the corner of minimum E and minimum N.
geometry = [
    Polygon([
        (e_coord, n_coord),
        (e_coord, n_coord + 100),
        (e_coord + 100, n_coord + 100),
        (e_coord + 100, n_coord),
    ])
    for e_coord, n_coord in zip(statpop_ha.E_KOORD, statpop_ha.N_KOORD)
]
statpop_ha = gpd.GeoDataFrame(statpop_ha, crs=2056, geometry=geometry)

In [None]:
# Keep only the B20BTOT column and the cell geometry.
# NOTE(review): presumably B20BTOT is the total population per cell - confirm against the STATPOP documentation.
population_gdf = statpop_ha[['B20BTOT','geometry']]

In [None]:
# River and lake layers, both reprojected to EPSG:2056.
df_rivers = gpd.read_file(data_folder/'raw'/'Lacs'/'Typisierung_LV95'/'typisierung.gpkg').to_crs(2056)
df_lakes = gpd.read_file(data_folder/'raw'/'Lacs'/"g2s15.shp").to_crs(2056)

In [None]:
# Turn the river geometries into polygons by buffering them.
df_rivers_polygons = df_rivers.copy()
# Rows whose GROSSERFLUSS is not the string 'NA' get a buffer of 30, all others a buffer of 5.
# Each right-hand side buffers every row; .loc then assigns only to the selected
# rows, which still hold their original geometry at that point.
df_rivers_polygons.loc[df_rivers_polygons.GROSSERFLUSS != 'NA','geometry'] = df_rivers_polygons['geometry'].buffer(30)
df_rivers_polygons.loc[df_rivers_polygons.GROSSERFLUSS == 'NA','geometry'] = df_rivers_polygons['geometry'].buffer(5)

In [None]:
# Stack the buffered rivers and the lakes into a single geometry-only table.
df_lakes_and_rivers = pd.concat([df_rivers_polygons[['geometry']], df_lakes[['geometry']]])
# Dissolve all water geometries into one geometry.
df_lakes_and_rivers_union = df_lakes_and_rivers['geometry'].unary_union
# Subtract the water geometry from the country outline.
country_geo_wo_rivers_lakes = country_geo.difference(df_lakes_and_rivers_union)
# Wrap the result in a GeoDataFrame with a single 'geometry' column.
country_geo_wo_rivers_lakes = gpd.GeoDataFrame(country_geo_wo_rivers_lakes, columns = ['geometry'])
# Visual check of the water layer.
df_lakes_and_rivers.plot(figsize = (12,12))

## Geomasking - LCA

In [None]:
# LCA records, read from the notebook's working directory.
# NOTE(review): unlike the other inputs this path is not under data_folder - confirm.
df_lca = pd.read_csv('df_lca.csv')
# Keep only rows whose comment contains 'Ok'. na=False treats a missing comment
# as "no match"; without it a single NaN makes the boolean mask unusable for indexing.
# NOTE(review): read_csv returns a plain DataFrame while the masking step expects
# a geodataframe - confirm where the geometry is built.
df_lca_nonull = df_lca[df_lca.comment.str.contains('Ok', na=False)]

In [None]:
# Donut geomasking of the LCA points.
# No containment geodataframe is passed to the masker.
donutmask = Donut_MaxK(
    # Sensitive geodataframe to be masked
    df_lca_nonull,
    # Census geodataframe: communes that have a population value
    population_gdf=communes[communes.EINWOHNERZ.notnull()],
    # Column holding the population field
    population_column='EINWOHNERZ',
    # Maximum possible k-anonymity value
    max_k_anonymity=100,
    # Ratio used to define the minimum possible k-anonymity value
    donut_ratio=0.05,
    # Displacement distribution; other options include 'gaussian' and 'areal'
    # ('areal' makes points more likely to be displaced further within the range)
    distribution='uniform',
)

donutmask.execute()

masked_gdf_lca = donutmask.masked

In [None]:
# Largest displacement radius among the masked LCA points.
# Uses masked_gdf_lca: no variable called masked_gdf is defined in this notebook.
masked_gdf_lca.radius_max.max()

In [None]:
# Reproject the masked LCA points to EPSG:4326 once, then store x and y.
lca_geometry_4326 = masked_gdf_lca['geometry'].to_crs(4326)
masked_gdf_lca['lon_masked'] = lca_geometry_4326.x
masked_gdf_lca['lat_masked'] = lca_geometry_4326.y

In [None]:
# Use new_address where it is present, otherwise full_address.
# The mask is taken from masked_gdf_lca itself: no variable called masked_gdf
# is defined in this notebook.
masked_gdf_lca['address'] = masked_gdf_lca['full_address']
masked_gdf_lca.loc[masked_gdf_lca.new_address.notnull(), 'address'] = masked_gdf_lca.new_address
# Encode each distinct address as an integer code (missing addresses get -1).
masked_gdf_lca['address'] = pd.Categorical(masked_gdf_lca['address'])
masked_gdf_lca['address_id'] = masked_gdf_lca['address'].cat.codes.astype(int)
# Back to a string column now that the codes have been extracted.
masked_gdf_lca['address'] = masked_gdf_lca['address'].astype('string')

In [None]:
# Export the masked LCA table.
# Uses masked_gdf_lca: no variable called masked_gdf is defined in this notebook.
masked_gdf_lca.to_csv(data_folder/'Clean_data'/'masked_lca_nonull.csv',index = False)

In [None]:
# Resolve conflicting NATION values per ID_LCA: 'CH' if any record says 'CH',
# otherwise the maximum value.
# The test is done on the values with (x == 'CH').any(); `'CH' in x` on a pandas
# Series checks the index labels, not the values, so it would never find 'CH'.
agg_nation = df_lca.groupby('ID_LCA')['NATION'].apply(lambda x: 'CH' if (x == 'CH').any() else max(x))

df_lca['NATION_NODUP'] = df_lca['ID_LCA'].map(agg_nation.to_dict())

# Resolve conflicting SEXE values per ID_LCA by taking the maximum value.
agg_sexe = df_lca.groupby('ID_LCA')['SEXE'].apply(lambda x: max(x))

df_lca['SEXE_NODUP'] = df_lca['ID_LCA'].map(agg_sexe.to_dict())

In [None]:
# One row per LCA person and masked address: person attributes from df_lca,
# masked location from masked_gdf_lca. (The names `lca` and `masked_gdf` are
# not defined anywhere in this notebook.)
lca_mask_link = pd.merge(
    df_lca[['ID_LCA','ANNEE_NAISSANCE','MOIS_NAISSANCE','SEXE_NODUP','NATION_NODUP']].drop_duplicates(),
    masked_gdf_lca[['ID_LCA','lon_masked','lat_masked','zipcode','address_id']],
    on = 'ID_LCA',
)

In [None]:
# Keep one row per (ID_LCA, address_id): the one with the smallest lon_masked.
lca_mask_link = (
    lca_mask_link
    .sort_values('lon_masked')
    .drop_duplicates(subset= ['ID_LCA','address_id'],keep = 'first')
)

## Geomasking - LAMAL

In [None]:
# LAMAL records, read from the notebook's working directory.
# NOTE(review): unlike the other inputs this path is not under data_folder - confirm.
df_lamal = pd.read_csv('df_lamal_80.csv')
# Keep only rows whose comment contains 'Ok'. na=False treats a missing comment
# as "no match"; without it a single NaN makes the boolean mask unusable for indexing.
df_lamal_nonull = df_lamal[df_lamal.comment.str.contains('Ok', na=False)]

In [None]:
# Donut geomasking of the LAMAL points.
# No containment geodataframe is passed to the masker.
donutmask = Donut_MaxK(
    # Sensitive geodataframe to be masked
    df_lamal_nonull,
    # Census geodataframe: communes that have a population value
    population_gdf=communes[communes.EINWOHNERZ.notnull()],
    # Column holding the population field
    population_column='EINWOHNERZ',
    # Maximum possible k-anonymity value
    max_k_anonymity=100,
    # Ratio used to define the minimum possible k-anonymity value
    donut_ratio=0.05,
    # Displacement distribution; other options include 'gaussian' and 'areal'
    # ('areal' makes points more likely to be displaced further within the range)
    distribution='uniform',
)

donutmask.execute()

masked_gdf_lamal = donutmask.masked

In [None]:
# Reproject the masked LAMAL points to EPSG:4326 once, then store x and y.
lamal_geometry_4326 = masked_gdf_lamal['geometry'].to_crs(4326)
masked_gdf_lamal['lon_masked'] = lamal_geometry_4326.x
masked_gdf_lamal['lat_masked'] = lamal_geometry_4326.y

In [None]:
# Build the address -> address_id lookup from the LCA table before it is used,
# so that this cell runs on a fresh kernel in top-to-bottom order.
key_address_id = masked_gdf_lca[['address','address_id']].set_index('address').to_dict()['address_id']

# Use new_address where it is present, otherwise full_address.
masked_gdf_lamal['address'] = masked_gdf_lamal['full_address']
masked_gdf_lamal.loc[masked_gdf_lamal.new_address.isnull()==False, 'address'] = masked_gdf_lamal.new_address
# Reuse the LCA address codes; addresses that do not occur in the LCA table map to NaN.
masked_gdf_lamal['address_id'] = masked_gdf_lamal['address'].map(key_address_id)

In [None]:
# Lookup from address string to the integer address_id assigned in the LCA table.
key_address_id = masked_gdf_lca[['address','address_id']].set_index('address').to_dict()['address_id']

In [None]:
# Cast NATION to str so that every value is comparable as a string.
# Note that missing values become the literal string 'nan'.
df_lamal['NATION'] = df_lamal['NATION'].astype(str)

In [None]:
# Resolve conflicting NATION values per ID_LAMAL: 'CH' if any record says 'CH',
# otherwise the maximum value.
# The test is done on the values with (x == 'CH').any(); `'CH' in x` on a pandas
# Series checks the index labels, not the values, so it would never find 'CH'.
agg_nation = df_lamal.groupby('ID_LAMAL')['NATION'].apply(lambda x: 'CH' if (x == 'CH').any() else max(x))
df_lamal['NATION_NODUP'] = df_lamal['ID_LAMAL'].map(agg_nation.to_dict())
# Resolve conflicting SEXE values per ID_LAMAL by taking the maximum value.
agg_sexe = df_lamal.groupby('ID_LAMAL')['SEXE'].apply(lambda x: max(x))
df_lamal['SEXE_NODUP'] = df_lamal['ID_LAMAL'].map(agg_sexe.to_dict())

In [None]:
# One row per LAMAL person and masked address: person attributes from df_lamal,
# masked location from masked_gdf_lamal.
lamal_mask_link = (
    df_lamal[['ID_LAMAL','ANNEE_NAISSANCE','MOIS_NAISSANCE','SEXE_NODUP','NATION_NODUP']]
    .drop_duplicates()
    .merge(
        masked_gdf_lamal[['ID_LAMAL','lon_masked','lat_masked','zipcode','address_id']],
        on = 'ID_LAMAL',
    )
)

In [None]:
# Keep one row per (ID_LAMAL, address_id): the one with the smallest lon_masked.
lamal_mask_link = (
    lamal_mask_link
    .sort_values('lon_masked')
    .drop_duplicates(subset= ['ID_LAMAL','address_id'],keep = 'first')
)

In [None]:
# Number of distinct LAMAL IDs in the raw table.
df_lamal.ID_LAMAL.nunique()

In [None]:
# Number of distinct LCA IDs that made it into the linkage table.
lca_mask_link.ID_LCA.nunique()

In [None]:
# Number of distinct LAMAL IDs that made it into the linkage table.
lamal_mask_link.ID_LAMAL.nunique()

In [None]:
# Save the LCA linkage table (row index not written).
lca_mask_link.to_csv(data_folder/'Clean_data'/'lca_masked_for_linkage.csv',index = False)

In [None]:
# Save the LAMAL linkage table (row index not written).
lamal_mask_link.to_csv(data_folder/'Clean_data'/'lamal_masked_for_linkage.csv',index = False)

## Record linkage

In [None]:
# Candidate-pair generation for the record linkage.
indexer = rl.Index()
#champ_dict = {'NOANNEE':'NOANNEE','ID_LCA':'ID_LCA','ANNEE_NAISSANCE':'ANNEE_NAISSANCE','mois_mod2':'MOIS_NAISSANCE','CDPHYSSEXE':'SEXE','CDPHYSNATIONALITE':'NATION','TXCOMPLEMENTDESTLEGALE':'COMP_DEST_LEGAL','TXRUELEGALE':'street','TXRUENUMEROLEGALE':'adr_num','TXNPALEGALE':'zipcode','TXLOCALITELEGALE':'city'}
# Blocking: a pair is only a candidate if both records agree on all five keys.
indexer.block(['ANNEE_NAISSANCE','MOIS_NAISSANCE','SEXE_NODUP','NATION_NODUP','zipcode'])
# LCA is passed first and LAMAL second.
pairs = indexer.index(lca_mask_link, lamal_mask_link)

In [None]:
# Sizes of the LCA table, the LAMAL table and the candidate-pair set.
print (len(lca_mask_link), len(lamal_mask_link), len(pairs))

In [None]:
# Number of parallel jobs handed to the comparer.
n_cpu = 10 #Set number of CPUs

In [None]:
# Comparison object; the individual comparisons are registered in the next cell.
comparer = rl.Compare(n_jobs=n_cpu)

In [None]:
#250m (max geomasking) - 25min (min) / 2 (0.5 decay of the linear fct)
#Because it is the min distance of geomasking
# NOTE(review): the two lines above appear to justify the offset/scale of the
# geo comparison below, but the derivation is not clear from here - confirm.
# comparer.exact('given_name', 'given_name', label='given_name')
# comparer.string('surname', 'surname', method='jarowinkler', threshold=0.85, label='surname')
# comparer.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
# NOTE(review): the candidate pairs were blocked on ANNEE_NAISSANCE, MOIS_NAISSANCE,
# SEXE_NODUP, NATION_NODUP and zipcode, so the five comparisons on those columns
# should be 1 for every pair and only 'distance' can vary - confirm this is intended.
comparer.string('NATION_NODUP', 'NATION_NODUP',method='jarowinkler', threshold=0.85, label='NATION')
comparer.exact('ANNEE_NAISSANCE', 'ANNEE_NAISSANCE', label='ANNEE_NAISSANCE')
comparer.exact('MOIS_NAISSANCE', 'MOIS_NAISSANCE', label='MOIS_NAISSANCE')
comparer.exact('SEXE_NODUP', 'SEXE_NODUP', label='SEXE')
comparer.exact('zipcode', 'zipcode', label='zipcode')
# Similarity between the two masked locations, linear method with offset 0.5 and scale 0.04.
# NOTE(review): presumably both are expressed in kilometres - confirm against the recordlinkage docs.
comparer.geo(left_on_lat = 'lat_masked',left_on_lng = 'lon_masked',right_on_lat = 'lat_masked',right_on_lng = 'lon_masked',scale = 0.04, offset = 0.5, method = 'linear',label = 'distance')
# Evaluate all registered comparisons on the candidate pairs (LCA left, LAMAL right).
features = comparer.compute(pairs, lca_mask_link, lamal_mask_link)

In [None]:
# Summary statistics of the comparison scores.
features.describe()

In [None]:
# Replace the features computed above with a table read from disk.
# NOTE(review): no cell in this notebook writes features_w_zipcode.csv - confirm
# where the file comes from and that it matches the current linkage tables.
features = pd.read_csv(data_folder/'features_w_zipcode.csv')

In [None]:
# Index the reloaded features by the pair identifiers (ID_LAMAL first, ID_LCA second).
features = features.set_index(['ID_LAMAL','ID_LCA'])

In [None]:
# Sum the comparison results per pair and count how often each total occurs,
# highest total first.
features.sum(axis=1).value_counts().sort_index(ascending=False)

In [None]:
# Pairs whose distance score is exactly 0.
features[features.distance == 0]

In [None]:
# Kernel density estimate of the distance score.
features.distance.plot.kde()

In [None]:
# Interactive help lookup removed (`?obj` is IPython-only syntax and not valid
# Python); see the recordlinkage documentation for ECMClassifier.

In [None]:
# Unsupervised ECM classifier, fitted on the comparison scores.
# binarize = 0.5 is the cut-off used to turn the scores into 0/1 before fitting.
cl = rl.ECMClassifier(binarize = 0.5)
cl.fit(features)

In [None]:
# Print the parameters that are trained (m, u and p).
# NOTE(review): the quality of these estimates has not been checked in this notebook.
print("p probability P(Match):", cl.p)
print("m probabilities P(x_i=1|Match):", cl.m_probs)
print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
print("log weights of features:", cl.log_weights)
print("weights of features:", cl.weights)

# Predict links on the same pairs the model was fitted on and count them.
links_pred = cl.predict(features)
print("Predicted number of links:", len(links_pred))

In [None]:
# Predict the match probability for each pair in the dataset.
# The result keeps the pair index of `features`.
probs = cl.prob(features)
print(probs)

In [None]:
# Move the pair identifiers (ID_LAMAL, ID_LCA) from the index back into columns.
features = features.reset_index()

In [None]:
# Number of distinct ID_LCA values that appear in at least one candidate pair.
features.ID_LCA.nunique()

In [None]:
# Pairs whose comparison scores add up to more than 5.999.
# Only the score columns are summed: after reset_index() the frame also carries
# the ID_LAMAL / ID_LCA columns, and adding identifiers to the score would push
# almost every row over the threshold. errors='ignore' keeps the cell working
# when the identifiers are still in the index.
matches = features[features.drop(columns=['ID_LAMAL','ID_LCA'], errors='ignore').sum(axis=1) > 5.999].reset_index()

In [None]:
import statsmodels.api as sm

In [None]:
# Flatten the match probabilities into one row per candidate pair.
# NOTE(review): `probs` keeps the index of `features`, which was set to
# (ID_LAMAL, ID_LCA); the columns are renamed here as (id_lca, id_lamal) -
# confirm that the two identifiers are not swapped.
df_probs = pd.DataFrame(probs).reset_index()
df_probs.columns = ['id_lca','id_lamal','prob']

# Lookups from the row label of each linkage table to the person ID.
key_id_index_lamal = lamal_mask_link['ID_LAMAL'].astype(int).to_dict()
key_id_index_lca = lca_mask_link['ID_LCA'].astype(int).to_dict()

# Translate the pair identifiers through those lookups.
# NOTE(review): this assumes the pair identifiers are row labels of the linkage tables - confirm.
df_probs['id_lca'] = df_probs['id_lca'].astype(int).map(key_id_index_lca)
df_probs['id_lamal'] = df_probs['id_lamal'].astype(int).map(key_id_index_lamal)

# Composite '<id_lca>-<id_lamal>' key for each pair.
df_probs['id'] = df_probs['id_lca'].astype(str) + '-'+ df_probs['id_lamal'].astype(str)

# Probability by composite key (if a key occurs more than once the last row wins).
key_probs = df_probs.set_index('id')['prob'].to_dict()

# For every id_lca, the id_lamal of its highest-probability pair.
max_probs = df_probs.set_index('id_lamal').groupby(['id_lca'])['prob'].idxmax()

max_probs_df = pd.DataFrame(max_probs).reset_index()
max_probs_df.columns = ['id_lca','id_lamal']

# Rebuild the composite key to look the winning probability back up.
max_probs_df['id'] = max_probs_df['id_lca'].astype(str) + '-'+ max_probs_df['id_lamal'].astype(str)

max_probs_df['prob'] = max_probs_df['id'].map(key_probs)

# max_probs_df[['id_lca']].to_csv(data_folder/'Clean_data'/'lca_list_pour_christophe.csv')
# max_probs_df[['id_lamal']].to_csv(data_folder/'Clean_data'/'lamal_list_pour_christophe.csv')

## Export end file

In [None]:
# Save the best match per id_lca; the row index is written as the first column.
max_probs_df.to_csv(data_folder/'Clean_data'/'max_probs_w_zipcode.csv')

In [None]:
# Replace max_probs_df with a table read from disk.
# NOTE(review): this is a different file (and folder) from the one written in the
# previous cell, and no cell in this notebook creates it - confirm its origin.
max_probs_df = pd.read_csv(data_folder/'max_probs_w_zipcode_pour_david.csv')