# Preparation for days at GM

In [None]:
from difflib import SequenceMatcher

import recordlinkage as rl
import pandas as pd
import geopandas as gpd
import numpy as np
# import maskmypy
from recordlinkage.preprocessing import clean, phonetic
# from geofeather import to_geofeather, from_geofeather
from shapely.geometry import Point, Polygon
from pathlib import Path
from difflib import SequenceMatcher
from maskmypy import Donut
import pickle
# import contextily as ctx
from geopandas import GeoDataFrame, sjoin

# Root folder for every input/output file used below, relative to the
# notebook's working directory.
data_folder = Path('../Data/')

In [None]:

class Base():
    ''' Base class for masking methods.

    Keeps a copy of the sensitive points together with optional population,
    container and address layers, and offers displacement-distance and
    k-anonymity helpers that work on `self.masked` once a subclass has
    produced it. An empty string is used as the "layer not loaded" sentinel
    for every optional layer (and for `self.masked` before masking).
    '''
    def __init__(self,
                sensitive_gdf,
                population_gdf='',
                population_column='pop',
                container_gdf='',
                max_tries=1000,
                address_points_gdf=''):
        self.sensitive = sensitive_gdf.copy()
        self.crs = self.sensitive.crs
        # Same sentinel as the optional layers, so the "Data has not yet been
        # masked" assertions fire instead of an AttributeError.
        self.masked = ''
        self._load_population(population_gdf, population_column)
        self._load_container(container_gdf)
        self._load_addresses(address_points_gdf)
        self.max_tries = max_tries


    def _load_population(self, population_gdf='', population_column='pop'):
        '''Loads a geodataframe of population data for donut masking
        and/or k-anonymity estimation.

        Returns True when a GeoDataFrame was supplied (it is cropped to the
        sensitive points and reduced to geometry + population column),
        False otherwise (`self.population` is then set to '').
        Raises AssertionError when the CRS differs from the sensitive points.
        '''
        if isinstance(population_gdf, GeoDataFrame):
            assert population_gdf.crs == self.crs, "Population CRS does "\
                "not match points CRS"
            self.population = self._crop_gdf(
                population_gdf, self.sensitive)
            self.pop_column = population_column
            self.population = self.population.loc[:,['geometry', self.pop_column]]
            return True
        else:
            self.population = ''
            return False


    def _load_container(self, container_gdf):
        '''Loads a geodataframe of polygons to contain points while donut masking.

        Returns True when a GeoDataFrame was supplied, False otherwise
        (`self.container` is then set to '').
        Raises AssertionError when the CRS differs from the sensitive points.
        '''
        if isinstance(container_gdf, GeoDataFrame):
            assert container_gdf.crs == self.crs, "Container CRS does "\
                "not match points CRS"
            self.container = self._crop_gdf(
                container_gdf, self.sensitive)
            self.container = self.container.loc[:,['geometry']]
            # Working copy that _containment() progressively crops.
            self.container_filtered = self.container.copy()
            return True
        else:
            self.container = ''
            return False


    def _load_addresses(self, address_points_gdf):
        '''Loads geodataframe containing address data for k-anonymity calculation.

        Returns True when a GeoDataFrame was supplied, False otherwise
        (`self.addresses` is then set to '').
        Raises AssertionError when the CRS differs from the sensitive points.
        '''
        if isinstance(address_points_gdf, GeoDataFrame):
            assert address_points_gdf.crs == self.crs, "Address points "\
                "CRS does not match points CRS"
            self.addresses = self._crop_gdf(
                address_points_gdf, self.sensitive)
            self.addresses = self.addresses.loc[:,['geometry']]
            return True
        else:
            self.addresses = ''
            return False


    def _crop_gdf(self, target_gdf, reference_gdf):
        '''Reduces an input (target) geodataframe to the part that falls in
        the bounding box of a reference geodataframe, enlarged by 10% of its
        width/height on every side.'''
        bb = reference_gdf.total_bounds
        x = ((bb[2] - bb[0]) / 10)
        y = ((bb[3] - bb[1]) / 10)
        bb[0] = (bb[0] - x)
        bb[1] = (bb[1] - y)
        bb[2] = (bb[2] + x)
        bb[3] = (bb[3] + y)
        target_gdf = target_gdf.cx[bb[0]:bb[2], bb[1]:bb[3]]
        return target_gdf


    def displacement_distance(self):
        '''Calculate displacement distance for each point after masking.

        Adds/overwrites the 'distance' column of `self.masked` with the
        distance between each masked point and the sensitive point carrying
        the same index label, and returns `self.masked`.
        '''
        assert isinstance(self.masked, GeoDataFrame), "Data has not yet been masked"
        for index, row in self.masked.iterrows():
            old_coords = self.sensitive.at[index,'geometry']
            distance = row.geometry.distance(old_coords)
            self.masked.at[index,'distance'] = distance
        return self.masked


    def k_anonymity_estimate(self, population_gdf='', population_column='pop'):
        '''Estimates k-anonymity based on population data.

        Each masked point is buffered by its displacement distance; the
        population of every polygon the buffer touches is apportioned by
        intersected area, summed, and reduced by one. The result is stored
        in the 'k_est' column of `self.masked`, which is returned.
        '''
        if not isinstance(self.population, GeoDataFrame):
            self._load_population(population_gdf, population_column)

        assert isinstance(self.sensitive, GeoDataFrame), "Sensitive points geodataframe is missing"
        assert isinstance(self.masked, GeoDataFrame), "Data has not yet been masked"
        assert isinstance(self.population, GeoDataFrame), "Population geodataframe is missing"

        self.population['pop_area'] = self.population.area

        if 'distance' not in self.masked.columns:
            self.displacement_distance()

        masked_temp = self.masked.copy()

        masked_temp['geometry'] = masked_temp.apply(
            lambda x: x.geometry.buffer(x['distance']), axis=1)

        masked_temp = self._disaggregate_population(masked_temp)

        # 'index_2' holds the index labels of self.masked, so iterate over the
        # labels themselves rather than over 0..n-1 positions (the two only
        # coincide for a default RangeIndex).
        for label in self.masked.index:
            self.masked.at[label,'k_est'] = int(
                masked_temp.loc[masked_temp['index_2'] == label, 'pop_adjusted'].sum() - 1)

        return self.masked


    def k_anonymity_actual(self, address_points_gdf=''):
        '''Calculates k-anonymity as the number of address points that fall
        inside the circle centred on the masked point whose radius is the
        displacement distance. Stored in the 'k_actual' column of
        `self.masked`, which is returned.'''
        if not isinstance(self.addresses, GeoDataFrame):
            self._load_addresses(address_points_gdf)

        assert isinstance(self.sensitive, GeoDataFrame), "Sensitive points geodataframe is missing"
        assert isinstance(self.masked, GeoDataFrame), "Data has not yet been masked"
        assert isinstance(self.addresses, GeoDataFrame), "Address points geodataframe is missing"

        # Kept in addition to the assert above: asserts are stripped under
        # `python -O`, this guard is not.
        if not isinstance(self.addresses, GeoDataFrame):
            raise Exception("Error: missing address point geodataframe.")

        if 'distance' not in self.masked.columns:
            self.displacement_distance()

        masked_temp = self.masked.copy()

        masked_temp['geometry'] = masked_temp.apply(
            lambda x: x.geometry.buffer(x['distance']), axis=1)

        join = sjoin(self.addresses, masked_temp, how='left')

        # 'index_right' holds index labels of self.masked; match on labels so
        # a filtered / non-contiguous index is handled correctly.
        for label in self.masked.index:
            subset = join.loc[join['index_right'] == label,:]
            self.masked.at[label,'k_actual'] = len(subset)

        return self.masked


    def _disaggregate_population(self, target_gdf):
        '''Used for estimating k-anonymity. Disaggregates population within
        buffers based on population polygon data.

        Returns one row per (buffer, population polygon) pair with
        'index_2' (index label of the buffer in `target_gdf`),
        'intersected_area' and 'pop_adjusted' (polygon population scaled by
        the share of the polygon area covered by the buffer).
        '''
        target = target_gdf.copy()
        target = sjoin(
            target,
            self.population,
            how='left')

        target['index_2'] = target.index

        target.index = range(len(target.index))

        # NOTE(review): a buffer that intersects no population polygon has a
        # missing 'index_right' after the left join and makes this lookup
        # fail - confirm whether that can happen with the data in use.
        target['geometry'] = target.apply(
            lambda x: x['geometry'].intersection(
                self.population.at[x['index_right'],'geometry']),
                axis=1)

        target['intersected_area'] = target['geometry'].area

        # Column-wise equivalent of the former per-row loop; it no longer
        # skips rows whose original label lies outside 0..n-1.
        target['pop_adjusted'] = target[self.pop_column] * (
            target['intersected_area'] / target['pop_area'])

        return target


    def _containment(self, uncontained):
        '''If a container geodataframe is loaded, checks whether or not masked
        points are within the same containment polygon as their original locations.

        Points of `uncontained` that fall in (one of) the polygon(s) of their
        original location get contain = 1 in `self.masked`. After more than
        `max_tries` calls, the points that are still uncontained are flagged
        with contain = 999. Always returns True.
        '''
        if 'index_right' not in self.sensitive.columns:
            self.sensitive = sjoin(self.sensitive, self.container, how='left')
            self.tries = 0

        self.container_filtered = self._crop_gdf(self.container_filtered, uncontained)

        uncontained = sjoin(uncontained, self.container_filtered, how='left')

        for index, row in uncontained.iterrows():
            # Look the home polygon up by index label and column name instead
            # of by row position / last column; `.loc[[index]]` also copes
            # with a point that sits in several container polygons.
            home_polygons = self.sensitive.loc[[index], 'index_right']
            if (home_polygons == row['index_right']).any():
                self.masked.at[index,'contain'] = 1

        self.tries +=1

        if self.tries > self.max_tries:
            # Only flag the points that are still uncontained; points that
            # were contained by this very call keep their contain = 1.
            failed = (self.masked.index.isin(uncontained.index)
                      & (self.masked['contain'] == 0))
            self.masked.loc[failed,'contain'] = 999

            print(str(int(failed.sum())) + " points were masked but could not be " \
                "contained. Uncontained points are listed as 999 in the 'contain' field")

        return True

In [None]:
from geopandas import GeoDataFrame, sjoin
from random import random, gauss, uniform
from shapely.affinity import translate
from math import sqrt

class Donut(Base):
    '''Donut geomask: every sensitive point is displaced by a random offset
    whose length lies between `max_distance * donut_ratio` and `max_distance`.

    `distribution` selects how the offset length is drawn: 'uniform',
    'gaussian' or 'areal'. When a container layer is loaded, points are
    re-displaced until they fall in the same container polygon as their
    original location (or until `max_tries` is exceeded).
    '''

    def __init__(
                self,
                sensitive_gdf,
                population_gdf='',
                population_column='pop',
                max_distance=250,
                donut_ratio=0.1,
                distribution='uniform',
                container_gdf='',
                address_points_gdf='',
                max_tries=1000):

        super().__init__(
            sensitive_gdf = sensitive_gdf,
            population_gdf = population_gdf,
            population_column = population_column,
            container_gdf = container_gdf,
            max_tries = max_tries,
            address_points_gdf = address_points_gdf)

        self.max = max_distance
        self.distribution = distribution
        self.donut_ratio = donut_ratio


    def _random_xy(self, min, max):
        '''Return a random (x, y) offset whose length is drawn between `min`
        and `max` according to `self.distribution`.

        Raises Exception("Unknown distribution") for any other setting.
        '''
        if self.distribution == 'uniform':
            hypotenuse = uniform(min, max)
            x = uniform(0,hypotenuse)

        elif self.distribution == 'gaussian':
            # Mean at the middle of the ring, sigma = half-width / 2.5.
            # The draw is not clamped, so it can fall outside [min, max].
            mean = (((max - min) / 2) + min)
            sigma = (((max - min) / 2) / 2.5)
            hypotenuse = gauss(mean, sigma)
            x = uniform(0, hypotenuse)

        elif self.distribution == 'areal':
            # Keep the first of two uniform draws only when it is the larger
            # one, which favours longer displacements.
            hypotenuse = 0
            while hypotenuse == 0:
                r1 = uniform(min, max)
                r2 = uniform(min, max)
                if r1 > r2:
                    hypotenuse = r1
            x = uniform(0, hypotenuse)

        else:
            raise Exception("Unknown distribution")

        y = sqrt(hypotenuse**2 - x**2)

        # Pick one of the four quadrants with equal probability.
        direction = random()

        if direction < 0.25:
            x = x * -1

        elif direction < 0.5:
            y = y * -1

        elif direction < 0.75:
            x = x * -1
            y = y * -1

        return (x, y)


    def _find_radii(self):
        '''Give every point the same radius_min / radius_max.'''
        self.masked.loc[:,'radius_min'] = self.max * self.donut_ratio
        self.masked.loc[:,'radius_max'] = self.max


    def _mask_within_container(self):
        '''Re-displace points until each one lies in the container polygon of
        its original location, or `_containment` gives up (contain = 999).'''
        self.masked.loc[:,'contain'] = 0

        # Original locations, one per index label (a previous spatial join on
        # self.sensitive may have duplicated rows of the same point).
        origins = self.sensitive['geometry']
        origins = origins[~origins.index.duplicated()]

        while (self.masked['contain'] == 0).any():

            pending = self.masked.index[self.masked['contain'] == 0]

            for index in pending:
                x,y = self._random_xy(
                    self.masked.at[index, 'radius_min'],
                    self.masked.at[index, 'radius_max'])

                # Always displace from the ORIGINAL location. Displacing the
                # previously masked point again would accumulate offsets and
                # could exceed radius_max.
                self.masked.at[index, 'geometry'] = translate(
                    origins.at[index], xoff=x, yoff=y)

            # Hand over the freshly displaced rows, so that the position that
            # is checked is the position that is stored.
            self._containment(self.masked.loc[pending, :])

        return True


    def execute(self):
        '''Mask the sensitive points and return the masked geodataframe
        (also stored as `self.masked`).'''
        self.masked = self.sensitive.copy()

        self._find_radii()

        self.masked['offset'] = self.masked.apply(
            lambda x: self._random_xy(
                x['radius_min'], x['radius_max']), axis=1)

        self.masked['geometry'] = self.masked.apply(
            lambda x: translate(
                x['geometry'], xoff=x['offset'][0], yoff=x['offset'][1]), axis=1)

        if isinstance(self.container, GeoDataFrame):
            self._mask_within_container()

        self.masked = self.masked.drop(['offset'], axis=1)

        return self.masked



class Donut_MaxK(Donut):
    '''Donut mask whose radii are derived per point from the population
    density of the polygon the point falls in, so that the outer circle
    covers an area holding `max_k_anonymity` people (and the inner circle
    `max_k_anonymity * donut_ratio` people).'''

    def __init__(
                self,
                sensitive_gdf,
                population_gdf='',
                population_column='pop',
                max_k_anonymity=0,
                donut_ratio=0.1,
                distribution='uniform',
                container_gdf='',
                address_points_gdf='',
                max_tries=1000):

        super().__init__(
            sensitive_gdf = sensitive_gdf,
            population_gdf = population_gdf,
            population_column = population_column,
            container_gdf = container_gdf,
            max_tries = max_tries,
            address_points_gdf = address_points_gdf,
            donut_ratio = donut_ratio,
            distribution = distribution)

        self.target_k = max_k_anonymity


    def _find_radii(self):
        '''Set radius_min / radius_max of every masked point from the
        population polygon it is joined to.'''
        self.population['pop_area'] = self.population.area

        located = sjoin(self.masked, self.population, how='left')

        def area_holding(people):
            # Area that holds `people` inhabitants at the polygon's density.
            def per_row(row):
                return people * row['pop_area'] / row[self.pop_column]
            return per_row

        def radius_of(area_column):
            # Radius of the circle whose area is stored in `area_column`.
            def per_row(row):
                return sqrt(row[area_column] / 3.141592654)
            return per_row

        located['max_area'] = located.apply(
            area_holding(self.target_k), axis=1)
        located['min_area'] = located.apply(
            area_holding(self.target_k * self.donut_ratio), axis=1)

        located['max_radius'] = located.apply(radius_of('max_area'), axis=1)
        located['min_radius'] = located.apply(radius_of('min_area'), axis=1)

        self.masked['radius_min'] = located.apply(
            lambda row: row['min_radius'], axis=1)
        self.masked['radius_max'] = located.apply(
            lambda row: row['max_radius'], axis=1)



class Donut_Multiply(Donut):
    '''Donut mask whose maximum radius grows as the population of the
    polygon containing the point shrinks: the most populated polygon keeps
    `max_distance`, the least populated one gets
    `max_distance * population_multiplier`.'''

    def __init__(
                self,
                sensitive_gdf,
                max_distance=250,
                population_gdf='',
                population_column='pop',
                population_multiplier = 0,
                donut_ratio=0.1,
                distribution='uniform',
                container_gdf='',
                address_points_gdf='',
                max_tries=1000):

        super().__init__(
            sensitive_gdf = sensitive_gdf,
            max_distance = max_distance,
            population_gdf = population_gdf,
            population_column = population_column,
            container_gdf = container_gdf,
            max_tries = max_tries,
            address_points_gdf = address_points_gdf,
            donut_ratio = donut_ratio,
            distribution = distribution,
                )

        # Stored minus one because _find_radii adds the base distance back.
        self.pop_multiplier = population_multiplier - 1


    def _find_radii(self):
        '''Set radius_min / radius_max of every masked point from the
        population of the polygon it is joined to.'''
        self.population['pop_area'] = self.population.area

        join = sjoin(self.masked, self.population, how='left')

        populations = join[self.pop_column]

        # Series.min()/max() ignore the NaN of points that matched no
        # polygon; the builtin min()/max() give order-dependent results
        # as soon as a NaN is present.
        pop_min = populations.min()
        pop_max = populations.max()
        pop_range = pop_max - pop_min

        if pop_range > 0:
            join['pop_score'] = (
                1 - (populations - pop_min) / pop_range) * self.pop_multiplier
        else:
            # All matched polygons have the same population (or there is a
            # single point): the normalisation would divide by zero, so no
            # extra distance is added. Unmatched points keep a NaN score.
            join['pop_score'] = populations * 0.0

        self.masked['radius_max'] = (join['pop_score'] * self.max) + self.max

        self.masked['radius_min'] = self.masked['radius_max'] * self.donut_ratio

## Import data

In [None]:
# nationalities = pd.read_csv('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/CH_Nationality_List_20171130_v1.csv')
# # patients_delta = pd.read_feather('/Users/david/Dropbox/PhD/GitHub/deltagiraph/data/preprocessed/patient_01.01.2019-31.12.2019.feather') 
# geom_delta = from_geofeather('/Users/david/Dropbox/PhD/GitHub/deltagiraph/data/preprocessed/geometries_01.01.2019-31.12.2019.feather')   
# patients_delta = from_geofeather('/Users/david/Dropbox/PhD/GitHub/deltagiraph/data/preprocessed/patient_01.01.2019-31.12.2019.feather')    
# patients_delta['nationality'] = np.random.randint(1, 225, patients_delta.shape[0])
# patients_delta['nationality'] = patients_delta['nationality'].apply(lambda x: nationalities.loc[x])
# patient_gm = patients_delta[patients_delta.insurance == 'Groupe Mutuel']

In [None]:
# lca = pd.read_csv(data_folder/'SMG_RES_LCA_FOR_LINK_ok.csv', sep = ';')

In [None]:
# Load the LAMal input table (semicolon-separated CSV) from the data folder.
lamal = pd.read_csv(data_folder/'SMG_RECORD_LINKAGE_LAMal_OK.csv', sep = ';')

In [None]:
lamal.head()

In [None]:
# dummy_patient_gm = patient_gm[['lon','lat','sex','address_id','age','nationality','geometry']][patient_gm.lon.isnull()==False].sample(1000).reset_index()
# dummy_patient_gm = dummy_patient_gm.to_crs(2056)

In [None]:
# dummy_patient_gm = pd.merge(dummy_patient_gm,geom_delta[['address','address_id']],on = 'address_id',how = 'left').drop_duplicates()

In [None]:
# dummy_patient_gm = dummy_patient_gm.reset_index(drop = True)

## Prepare geocoding

In [None]:
# champs_address = ['TXRUELEGALE', 'TXRUENUMEROLEGALE', 'TXNPALEGALE', 'TXLOCALITELEGALE', 'TXCOMPLEMENTDESTLEGALE']

In [None]:
def make_gdf(df,crs,x,y):
    '''Build a point GeoDataFrame from a dataframe.

    df  : dataframe holding the coordinate columns
    crs : EPSG code (number or string) of the coordinates
    x, y: names of the columns holding the x and y coordinates
    '''
    points = []
    for x_coord, y_coord in zip(df[x], df[y]):
        points.append(Point((x_coord, y_coord)))
    return gpd.GeoDataFrame(df, crs=f'epsg:{crs}', geometry=points)

In [None]:
# Source column name -> column name used by the cleaning / geocoding code.
# champ_dict is applied to the `lca` table, champ_dict_lamal to `lamal`;
# they differ only in the id and birth-year source columns.
champ_dict = {'NOANNEE':'NOANNEE','ID_LCA':'ID_LCA','ANNEE_NAISSANCE':'ANNEE_NAISSANCE','mois_mod2':'MOIS_NAISSANCE','CDPHYSSEXE':'SEXE','CDPHYSNATIONALITE':'NATION','TXCOMPLEMENTDESTLEGALE':'COMP_DEST_LEGAL','TXRUELEGALE':'street','TXRUENUMEROLEGALE':'adr_num','TXNPALEGALE':'zipcode','TXLOCALITELEGALE':'city'}
champ_dict_lamal = {'NOANNEE':'NOANNEE','ID_LAMal':'ID_LAMAL','Annee_naiss':'ANNEE_NAISSANCE','mois_mod2':'MOIS_NAISSANCE','CDPHYSSEXE':'SEXE','CDPHYSNATIONALITE':'NATION','TXCOMPLEMENTDESTLEGALE':'COMP_DEST_LEGAL','TXRUELEGALE':'street','TXRUENUMEROLEGALE':'adr_num','TXNPALEGALE':'zipcode','TXLOCALITELEGALE':'city'}

In [None]:
# Rename the LCA columns to the names used by the cleaning / geocoding code.
# rename() leaves columns that are absent from champ_dict untouched, whereas
# Index.map(dict) would turn every unmapped column name into NaN.
# NOTE(review): the only cell that loads `lca` is commented out further up,
# so this raises NameError unless `lca` is defined elsewhere.
lca = lca.rename(columns=champ_dict)

In [None]:
# Rename the LAMal columns to the names used by the cleaning / geocoding code.
# rename() leaves columns that are absent from champ_dict_lamal untouched,
# whereas Index.map(dict) would turn every unmapped column name into NaN.
lamal = lamal.rename(columns=champ_dict_lamal)

In [None]:
# statpop = pd.read_csv('/Users/david/Dropbox/PhD/Data/Databases/OFS/ag-b-00.03-vz2019statpop/STATPOP2019.csv')
# geometry = [Polygon(zip([xy[0],xy[0],xy[0]+100,xy[0]+100],[xy[1],xy[1]+100,xy[1]+100,xy[1]])) for xy in zip(statpop.E_KOORD, statpop.N_KOORD)]
# statpop_gdf = gpd.GeoDataFrame(statpop, crs=2056, geometry=geometry) 
# Reference address table used for geocoding.
# NOTE: read_pickle must only be used on files from a trusted source.
regbl_address = pd.read_pickle(data_folder/'2020_regbl_address.pkl')
# Coordinates are cast to float before the point geometries are built.
regbl_address[['gkode','gkodn']] = regbl_address[['gkode','gkodn']].astype(float)
regbl_address = make_gdf(regbl_address,'2056','gkode','gkodn')
# Drop the last three characters of every address string.
# NOTE(review): presumably a trailing suffix that the cleaned input
# addresses do not carry - confirm against the pickle's content.
regbl_address['address'] = regbl_address['address'].str[:-3]

In [None]:
# Keep only the reference addresses whose geometry is valid.
regbl_address = regbl_address[regbl_address.is_valid]

### Caching

In [None]:
def filter_text(text):
    '''Normalise a (lower-cased) address string before matching.

    - removes line breaks, '.' and '/'
    - repairs two mis-encoded characters ('ã®' -> 'î', 'ã»' -> 'û')
    - blanks out the care-of markers 'c/o', 'c.f.' and 'chez'
    - expands the street-type abbreviations ch / rte / av / bd / bvd

    Markers and abbreviations are only recognised as whole space-separated
    words, and every occurrence is handled. The spacing of the input is
    preserved. Returns None unchanged.
    '''
    if text is None:
        return text

    for ch in ('\r', '\n'):
        text = text.replace(ch, '')

    text = text.replace('ã®', 'î').replace('ã»', 'û')

    # Care-of markers must be looked for BEFORE '.' and '/' are stripped,
    # otherwise 'c/o' has already become 'co' and is never recognised.
    # As before, a marker is replaced by a single space.
    care_of_markers = {'c/o', 'c.f.', 'chez'}
    words = text.split(' ')
    text = ' '.join(' ' if word in care_of_markers else word for word in words)

    for ch in ('.', '/'):
        text = text.replace(ch, '')

    # Whole-word replacement: 'ch' in 'zurich' or 'av' in 'gustav' must not
    # be expanded. 'ch.' / 'rte.' / 'av.' / 'bd.' have lost their dot above.
    abbreviations = {
        'ch': 'chemin',
        'rte': 'route',
        'av': 'avenue',
        'bd': 'boulevard',
        'bvd': 'boulevard',
    }
    # split(' ') / join(' ') round-trips the original spacing exactly.
    return ' '.join(abbreviations.get(word, word) for word in text.split(' '))

In [None]:
# Persistent geocoding cache: a dict that is reloaded from disk when the
# pickle file exists, so that addresses geocoded in a previous run are not
# processed again.
# NOTE: pickle.load can execute arbitrary code - only load a cache file
# that this notebook wrote itself.
cache_file =  data_folder/ "geocoding_cache.pkl"
if cache_file.exists():
    with open(cache_file, 'rb') as file:
        cache = pickle.load(file)
else:
    cache = {}

In [None]:
import os
import sys
import math
from numbers import Number

def _cache_key(row):
    adr_num = row['adr_num']
    if isinstance(adr_num, Number) and math.isnan(adr_num):
        adr_num = None
    return row['zipcode'], adr_num, row['full_address']
def geocoding_cache(row):
    '''Geocode a row, reusing the result stored in the module-level `cache`
    when the same (zipcode, number, address) was already processed.'''
    key = _cache_key(row)
    if key not in cache:
        cache[key] = geocoding(row)
    return cache[key]

def run_geocoding_cache(df):
    '''Geocode every row of `df` and write the cache back to `cache_file`.

    The cache is saved even when geocoding fails part-way, so the work done
    so far is not lost.
    '''
    try:
        geocoded = run_geocoding(df)
    finally:
        with open(cache_file, 'wb') as handle:
            pickle.dump(cache, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return geocoded

def run_geocoding(df):
    '''Geocode every row of `df` (through the cache) and return a dataframe
    with the columns full_address, similarity, new_address, E, N, comment.

    Progress is printed for every 500th item, for every power of ten and
    for the last item.
    '''
    total = len(df)
    column_names = ('full_address', 'similarity', 'new_address', 'E', 'N', 'comment')
    collected = {name: [] for name in column_names}

    for position, row in df.reset_index(drop = True).iterrows():
        item_no = position + 1
        report = (item_no % 500 == 0
                  or math.remainder(math.log10(item_no), 1) == 0
                  or item_no == total)
        if report:
            print("geocoding item",item_no, "of ", total)

        similarity, new_address, east, north, comment = geocoding_cache(row)
        values = (row.full_address, similarity, new_address, east, north, comment)
        for name, value in zip(column_names, values):
            collected[name].append(value)

    return pd.DataFrame(collected)
def geocoding(row):
    '''Geocode one row; returns (similarity, new_address, E, N, comment).

    Only the quick matcher is used. The fallback to geocoding_slow (for
    rows without coordinates or with a similarity below 90) is disabled.
    '''
    similarity, new_address, east, north, comment = geocoding_quick(row)
    return similarity, new_address, east, north, comment
def similar(a, b):
    '''Return the similarity ratio (0.0 - 1.0) of two strings, as computed
    by difflib.SequenceMatcher.'''
    return SequenceMatcher(None, a, b).ratio()


def _best_match(full_address, candidates):
    '''Return (similarity, address, E, N, comment) of the candidate row whose
    'address' is most similar to `full_address`.

    The similarity is a percentage. The first candidate wins a tie. When
    there is no candidate with a similarity above 0 the result is
    (0, None, None, None, 'No match').
    '''
    best_score = 0
    best = (None, None, None, 'No match')
    for _, candidate in candidates.iterrows():
        address = candidate['address']
        if not isinstance(address, str):
            # A missing reference address cannot be compared.
            continue
        score = similar(full_address, address) * 100
        if score > best_score:
            best_score = score
            best = (address, candidate['gkode'], candidate['gkodn'], '')
    return (best_score, *best)


def geocoding_quick(row, addresses=None):
    '''Match a row against the reference addresses that share its zipcode
    and street number.

    row       : needs 'zipcode', 'adr_num' and 'full_address'
    addresses : reference table with 'plz4', 'deinr', 'address', 'gkode',
                'gkodn'; defaults to the module-level `regbl_address`
    Returns (similarity, new_address, E, N, comment).
    '''
    reference = regbl_address if addresses is None else addresses
    same_zipcode = reference['plz4'] == row['zipcode']
    same_number = reference['deinr'] == row['adr_num']
    return _best_match(row['full_address'], reference[same_zipcode & same_number])


def geocoding_slow(row, addresses=None):
    '''Match a row against the reference addresses of its canton.

    First tries the addresses of the canton with the same street number;
    when there are none, falls back to the addresses of the canton with the
    same zipcode.

    row       : needs 'canton', 'zipcode', 'adr_num' and 'full_address'
    addresses : reference table with 'gdekt', 'plz4', 'deinr', 'address',
                'gkode', 'gkodn'; defaults to the module-level `regbl_address`
    Returns (similarity, new_address, E, N, comment).
    '''
    reference = regbl_address if addresses is None else addresses
    in_canton = reference['gdekt'] == row['canton']
    same_number = reference['deinr'].astype(str) == str(row['adr_num'])

    # Sorted so that ties are always resolved in the same order.
    candidates = reference[same_number & in_canton].sort_values('address')
    if candidates.empty:
        # No address of the canton carries this street number, so only the
        # zipcode can still narrow the search.
        canton_addresses = reference[in_canton].sort_values('address')
        candidates = canton_addresses[canton_addresses['plz4'] == row['zipcode']]

    return _best_match(row['full_address'], candidates)

In [None]:
# if regbl_address_df is None:
#     regbl_address = pd.read_pickle(data_folder/'regbl_address.pkl')
# regbl_address = regbl_address_df

In [None]:
def get_adr_num(street_name):
    """Extract the street number from a street name.

    Returns the last whitespace-separated word of `street_name` when it
    contains a digit (e.g. '12' or '12b'), np.nan when `street_name` is NaN,
    and None otherwise (no number, empty string or not a string).
    """
    # Any float NaN counts as missing, not only the np.nan singleton.
    if isinstance(street_name, float) and np.isnan(street_name):
        return np.nan

    try:
        last_word = street_name.split()[-1]
    except (AttributeError, IndexError):
        # Not a string, or a string without any word.
        print(f"invalid street name: {street_name}")
        return None

    if any(ch.isdigit() for ch in last_word):
        return last_word
    return None

In [None]:
def clean_data(db,no_rue_col,rue_col,ville_col,cp_col,suffix):
    '''Clean the address fields of `db` and build the address string used
    for geocoding. `db` itself is not modified.

    db         : input dataframe
    no_rue_col : street-number column (read and written as no_rue_col + suffix)
    rue_col    : street column
    ville_col  : city column
    cp_col     : zipcode column
    suffix     : appended to the names of the columns created here
                 ('comment', 'new_address', 'full_address')

    Rows whose zipcode contains a letter, a whitespace or a '-' and rows
    without a street are dropped. Returns the cleaned copy, with the extra
    columns 'zipcode', 'comment', 'new_address' and 'full_address'.
    '''
    df = db.copy()
    df = df.replace(r'^\s*$', np.nan, regex=True)
    comment_col = 'comment'+ suffix
    new_address = 'new_address'+ suffix
    no_rue = no_rue_col+ suffix
    full_address_col = 'full_address'+suffix

    ## Add new columns
    df[comment_col] = ''
    df[new_address] = ''

    ##Zipcodes
    #1 Exclude zipcodes outside of CH (a letter, a whitespace or a dash).
    #  A missing zipcode becomes the string 'nan' here and is dropped too.
    foreign = df[cp_col].astype(str).str.contains(r'[A-Za-z\s-]', regex=True, na=False)
    df = df[~foreign].copy()
    #2 Repair zipcode field : going from 1201.0 to integer 1201 to '1201'
    zipcode = (df[cp_col].fillna(-1) #Replace all NAs by -1
               .astype(int) #Convert to integer
               .astype(str) #Convert to string
               .replace('-1',np.nan)) #Convert '-1' to NAs
    df['zipcode'] = zipcode
    # cp_col gets the repaired value as well. Stripping the '.' from the raw
    # value instead would turn 1201.0 into '12010'.
    df[cp_col] = zipcode.astype(str)

    ##Streets
    # `== False` (not `~`) on purpose: a missing street yields NaN here and
    # must be dropped as well.
    df = df[df[rue_col].str.isspace() == False].copy()

    ##Address numbers
    number = df[no_rue].replace(r'\.','',regex = True)
    # NOTE(review): without regex=True this only blanks cells that are
    # exactly '/', it does not remove a '/' inside a number - confirm intent.
    number = number.replace('/','').str.strip()
    number = number.str.split(' ').str[0] #Keep what precedes the first space
    number = number.str.split('-').str[0] #Keep the first number of a range
    df[no_rue] = number.str.lstrip('0').str.lower()

    ##City
    df[ville_col] = df[ville_col].replace(r'\.','',regex = True)

    ##Whole
    df[rue_col] = df[rue_col].astype(str)
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df.loc[df[no_rue] == 'nan', no_rue] = np.nan #Update fields == 'nan' to NaN
    df[full_address_col] = (
        df[rue_col].fillna('').astype(str).str.lower()
        + ' ' + df[no_rue].fillna('').astype(str).str.lower()
        + ' ' + df[cp_col].fillna('').astype(str)
        + ' ' + df[ville_col].astype(str).str.lower())
    # Series.apply also works on an empty frame, where a row-wise
    # DataFrame.apply does not return a Series.
    df[full_address_col] = df[full_address_col].apply(filter_text)
    print('******Addresses cleaned successfully******')
    return df

In [None]:
def clean_post_georef(db,df_geom,qc_cutoff,no_rue_col,rue_col,ville_col,cp_col,suffix):
    '''Merge the geocoding results into the cleaned table and label the
    quality of every row. `db` itself is not modified.

    db        : cleaned table, already joined to the reference addresses
                (columns 'gkode' / 'gkodn' hold the exact matches)
    df_geom   : output of the geocoder (full_address, similarity,
                new_address, E, N, comment)
    qc_cutoff : similarity below which a match is labelled 'Low quality'
    Remaining arguments: column names, as for clean_data.

    Returns the merged table with the coordinates in 'E' / 'N' and the label
    in the comment column ('Ok', 'Low quality', 'No match',
    'No street number' or 'No address'). The number of rows per label is
    printed.
    '''
    address_key = f'full_address{suffix}'
    status_col = f'comment{suffix}'
    matched_col = f'new_address{suffix}'
    number_col = f'{no_rue_col}{suffix}'

    merged = (
        db.copy()
        .drop(columns=['new_address'])
        .rename(columns={'gkode': 'E', 'gkodn': 'N'})
        .merge(df_geom, on=address_key, how='left')
    )

    merged[status_col] = (merged[f'{status_col}_x'].fillna('')
                          + merged[f'{status_col}_y'].fillna(''))
    merged = merged.replace(r'^\s*$', np.nan, regex=True)

    # A coordinate comes either from the exact match (_x) or from the
    # geocoder (_y); the missing side counts as 0 in the sum.
    for coord in ('E', 'N'):
        exact = merged[f'{coord}_x'].fillna(0).astype(float)
        geocoded = merged[f'{coord}_y'].fillna(0).astype(float)
        merged[coord] = exact + geocoded
    merged = merged.drop(columns=['E_x', 'E_y', 'N_x', 'N_y',
                                  f'{status_col}_x', f'{status_col}_y'])
    for coord in ('E', 'N'):
        merged.loc[merged[coord] == 0, coord] = np.nan

    # Labels, from the most general to the most specific; later ones win.
    merged.loc[merged[status_col] == 'nan', status_col] = np.nan
    merged.loc[merged[status_col].isnull(), status_col] = 'Ok'
    merged.loc[(merged['similarity'] < qc_cutoff) & merged[status_col].notnull(),
               status_col] = 'Low quality'
    merged.loc[(merged['similarity'] == 0) & merged[status_col].notnull(),
               status_col] = 'No match'
    merged.loc[merged[number_col].isnull(), status_col] = 'No street number'
    merged.loc[merged[ville_col].isnull(), status_col] = 'No address'

    summary_columns = [rue_col, number_col, ville_col, cp_col,
                       address_key, matched_col, status_col]
    quality_distrib = merged[summary_columns].groupby(status_col)[rue_col].count()
    print(quality_distrib)
    return merged

## Geocoding LCA

In [None]:
# 1. Clean the LCA addresses and build their 'full_address' string.
df_geom = clean_data(lca,'adr_num','street','city','zipcode','')

# Running 1-based row identifier.
df_geom['geom_id'] = df_geom.index+1

# 2. Exact match: attach the reference coordinates (gkode / gkodn) of every
#    row whose full address equals a reference address.
df_geom = pd.merge(df_geom[['ID_LCA','geom_id','street','adr_num','city','zipcode','full_address','new_address','comment']],regbl_address[['deinr','plz4','gdekt','address','gkode','gkodn']],how = 'left', right_on = 'address',left_on = 'full_address').drop('address',axis = 1)

# 3. The distinct addresses without an exact match go to the fuzzy geocoder.
# NOTE(review): clean_data stores a missing street number as NaN, not as the
# string 'nan', so the `!= 'nan'` test appears to let those rows through.
to_geocode = df_geom[(df_geom.adr_num != 'nan')&(df_geom.gkode.isnull())][['full_address','zipcode','adr_num']].drop_duplicates()

geocoded = run_geocoding_cache(to_geocode)

# 4. Merge both sources of coordinates and label the match quality
#    (similarity below 85 -> 'Low quality').
processed_df = clean_post_georef(df_geom,geocoded,85,'adr_num','street','city','zipcode','')

# 5. Build point geometries from E / N in EPSG:2056.
processed_df = make_gdf(processed_df,2056,'E','N')

In [None]:
# Export every LCA row, geocoded or not.
processed_df.to_csv(data_folder/'Clean_data'/'geocoded_lca_all.csv',index = False)

# Rows that have both coordinates; these are the ones masked further down.
processed_df_nonull = processed_df[(processed_df.E.isnull()==False) & (processed_df.N.isnull()==False)]

In [None]:
processed_df_nonull.ID_LCA.nunique()

## Geocoding LAMAL

In [None]:
# Clean the LAMal addresses and build their 'full_address' string.
df_geom_lamal = clean_data(lamal,'adr_num','street','city','zipcode','')

In [None]:
# Keep only the LAMal rows whose full address also occurs in the LCA table.
df_geom_lamal_in_lca = df_geom_lamal[df_geom_lamal.full_address.isin(df_geom.full_address)]

In [None]:
# Renumber the rows 0..n-1 after the filter above.
df_geom_lamal_in_lca = df_geom_lamal_in_lca.reset_index(drop = True)

In [None]:
# Running 1-based row identifier.
df_geom_lamal_in_lca['geom_id'] = df_geom_lamal_in_lca.index+1

In [None]:
# Exact match: attach the reference coordinates (gkode / gkodn) of every row
# whose full address equals a reference address.
df_geom_lamal_in_lca = pd.merge(df_geom_lamal_in_lca[['ID_LAMAL','geom_id','street','adr_num','city','zipcode','full_address','new_address','comment']],regbl_address[['deinr','plz4','gdekt','address','gkode','gkodn']],how = 'left', right_on = 'address',left_on = 'full_address').drop('address',axis = 1)

In [None]:
# The distinct addresses without an exact match go to the fuzzy geocoder.
# NOTE(review): clean_data stores a missing street number as NaN, not as the
# string 'nan', so the `!= 'nan'` test appears to let those rows through.
to_geocode_lamal = df_geom_lamal_in_lca[(df_geom_lamal_in_lca.adr_num != 'nan')&(df_geom_lamal_in_lca.gkode.isnull())][['full_address','zipcode','adr_num']].drop_duplicates()

In [None]:
# Fuzzy geocoding; addresses already present in the cache are not recomputed.
geocoded_lamal = run_geocoding_cache(to_geocode_lamal)

In [None]:
# Merge both sources of coordinates and label the match quality
# (similarity below 85 -> 'Low quality').
processed_df_lamal = clean_post_georef(df_geom_lamal_in_lca,geocoded_lamal,85,'adr_num','street','city','zipcode','')

In [None]:
# Build point geometries from E / N in EPSG:2056.
processed_df_lamal = make_gdf(processed_df_lamal,2056,'E','N')

In [None]:
# Export every LAMal row, geocoded or not.
processed_df_lamal.to_csv(data_folder/'Clean_data'/'geocoded_lamal_all.csv',index = False)
# Rows that have both coordinates ...
processed_df_lamal_nonull = processed_df_lamal[(processed_df_lamal.E.isnull()==False) & (processed_df_lamal.N.isnull()==False)]
# ... and a valid geometry.
processed_df_lamal_nonull = processed_df_lamal_nonull[processed_df_lamal_nonull.is_valid]

In [None]:
processed_df_lamal_nonull.ID_LAMAL.nunique()

## Prepare geomasking

### Population-based donut geomasking

In [None]:
# Boundary polygons, passed to the donut mask below as population layer
# (population column 'EINWOHNERZ').
communes = gpd.read_file(data_folder/'SHAPEFILE_LV95_LN02'/'swissBOUNDARIES3D_1_3_TLM_HOHEITSGEBIET.shp')
# Drop the features without geometry.
communes = communes[~communes.geometry.isnull()]
# Only has an effect when the geometry column is called 'geom'.
communes = communes.rename(columns={'geom': 'geometry'})
# Drop the three features named after lakes.
communes = communes[communes.NAME != 'Lac Léman (VD)']
communes = communes[communes.NAME != 'Lac de Neuchâtel (VD)']
communes = communes[communes.NAME != 'Lac de Morat (VD)']
communes = communes.reset_index(drop=True)
# Rebuild the GeoDataFrame with an explicit CRS (EPSG:2056).
communes = gpd.GeoDataFrame(communes, crs = 2056,geometry=communes['geometry'])

In [None]:
def convert_3D_2D(geometry):
    '''
    Takes a GeoSeries of 3D Multi/Polygons (has_z) and returns a list of 2D Multi/Polygons

    The result has exactly one entry per input geometry, in input order, so it
    can be assigned back to the column it was read from. Geometries that are
    already 2D, missing (None) or of another type are passed through unchanged
    instead of being dropped. Interior rings (holes) are preserved.
    '''
    def _ring_2d(ring):
        # Keep x and y of every vertex, discard z
        return [xy[:2] for xy in ring.coords]

    def _polygon_2d(polygon):
        # Rebuild the polygon from its 2D shell and 2D holes
        return Polygon(_ring_2d(polygon.exterior),
                       [_ring_2d(hole) for hole in polygon.interiors])

    new_geo = []
    for p in geometry:
        if p is None or not p.has_z:
            # Nothing to strip: keep the entry so positions stay aligned
            new_geo.append(p)
        elif p.geom_type == 'Polygon':
            new_geo.append(_polygon_2d(p))
        elif p.geom_type == 'MultiPolygon':
            # MultiPolygon is not imported at the top of the notebook
            from shapely.geometry import MultiPolygon
            # .geoms works on Shapely 1.x and 2.x; iterating the MultiPolygon
            # itself is not supported in Shapely 2
            new_geo.append(MultiPolygon([_polygon_2d(part) for part in p.geoms]))
        else:
            # Other 3D geometry types are left untouched
            new_geo.append(p)
    return new_geo

# Replace the commune geometries by the output of convert_3D_2D (z coordinate removed).
communes['geometry'] = convert_3D_2D(communes['geometry'])

# Declare the coordinate reference system again after replacing the geometries (EPSG code 2056).
communes.crs = 2056

## Geomasking - LCA

In [None]:
# geom_id is unique per row; remove it so that otherwise identical rows collapse.
processed_df_nonull = processed_df_nonull.drop(columns='geom_id').drop_duplicates()

In [None]:
# Population-based donut masking of the geocoded LCA points.
donutmask = Donut(
    sensitive_gdf=processed_df_nonull, # Name of the sensitive geodataframe
    population_gdf=communes, # Name of the census geodataframe
    population_column = 'EINWOHNERZ', # Column of communes that holds the population count
    max_distance=250, # The maximum possible distance that points are displaced
    donut_ratio=0.1, # The ratio used to define the minimum distance points are displaced
    distribution='uniform' # The distribution to use when displacing points. Other options include 'gaussian' and 'areal'. 'Areal' distribution means points are more likely to be displaced further within the range.
) # No container_gdf is passed, so points are not forced to stay inside a particular area.

# Run the masking.
donutmask.execute()

# NOTE(review): the return value is used as the masked GeoDataFrame; presumably it is
# the masked points with the displacement distance added - confirm.
masked_gdf = donutmask.displacement_distance()

In [None]:
# donutmask.k_anonymity_actual(address_points_gdf=regbl_address) # Name of the geodataframe including address points. 

In [None]:
# Longitude / latitude (EPSG:4326) of the masked points.
# Reproject once and read both coordinates from the same result instead of
# running the identical reprojection twice.
masked_points_wgs84 = masked_gdf['geometry'].to_crs(4326)
masked_gdf['lon_masked'] = masked_points_wgs84.x
masked_gdf['lat_masked'] = masked_points_wgs84.y

In [None]:
# Address used for linkage: new_address where one exists, otherwise full_address.
masked_gdf['address'] = masked_gdf['full_address']
masked_gdf.loc[masked_gdf['new_address'].notnull(), 'address'] = masked_gdf['new_address']
# Integer identifier per distinct address, taken from the categorical codes.
masked_gdf['address'] = pd.Categorical(masked_gdf['address'])
masked_gdf['address_id'] = masked_gdf['address'].cat.codes.astype(int)
# Store the address itself as text again.
masked_gdf['address'] = masked_gdf['address'].astype('string')

In [None]:
# Export the masked LCA table (all columns of masked_gdf, without the row index).
masked_gdf.to_csv(data_folder/'Clean_data'/'masked_lca_nonull.csv',index = False)

In [None]:
# Collapse conflicting NATION values to one value per ID_LCA:
# 'CH' if any record of the person says 'CH', otherwise the largest value.
# The membership test must look at the values: `'CH' in x` on a pandas Series
# tests the index labels, not the values, so the 'CH' rule was never applied.
agg_nation = lca.groupby('ID_LCA')['NATION'].apply(lambda x: 'CH' if (x == 'CH').any() else max(x))

lca['NATION_NODUP'] = lca['ID_LCA'].map(agg_nation.to_dict())

# Collapse conflicting SEXE values to the largest value per ID_LCA.
agg_sexe = lca.groupby('ID_LCA')['SEXE'].apply(lambda x: max(x))

lca['SEXE_NODUP'] = lca['ID_LCA'].map(agg_sexe.to_dict())

In [None]:
# One row per person (de-duplicated demographic attributes) and masked address.
# Inner join on ID_LCA, so people without a masked location are dropped.
lca_mask_link = lca[
    ['ID_LCA', 'ANNEE_NAISSANCE', 'MOIS_NAISSANCE', 'SEXE_NODUP', 'NATION_NODUP']
].drop_duplicates().merge(
    masked_gdf[['ID_LCA', 'lon_masked', 'lat_masked', 'zipcode', 'address_id']],
    on='ID_LCA',
)

In [None]:
# Keep a single row per (ID_LCA, address_id): after the ascending sort, the row
# with the smallest lon_masked is the one retained.
lca_mask_link= lca_mask_link.sort_values('lon_masked').drop_duplicates(subset= ['ID_LCA','address_id'],keep = 'first')

## Geomasking - LAMAL

In [None]:
# geom_id is unique per row; remove it so that otherwise identical rows collapse.
processed_df_lamal_nonull = processed_df_lamal_nonull.drop(columns='geom_id').drop_duplicates()

In [None]:
import pandas as pd
import os
import sys
import numpy as np
from pathlib import Path

import pandas as pd
import geopandas as gpd
from difflib import SequenceMatcher
from pathlib import Path
from shapely.geometry import Point
import numpy as np
from numbers import Number
import math
import pickle
import uuid
# Import
from pandarallel import pandarallel

In [None]:
# Population-based donut masking of the geocoded LAMAL points (same settings as for LCA).
donutmask = Donut(
    sensitive_gdf=processed_df_lamal_nonull, # Name of the sensitive geodataframe
    population_gdf=communes, # Name of the census geodataframe
    population_column = 'EINWOHNERZ', # Column of communes that holds the population count
    max_distance=250, # The maximum possible distance that points are displaced
    donut_ratio=0.1, # The ratio used to define the minimum distance points are displaced
    distribution='uniform' # The distribution to use when displacing points. Other options include 'gaussian' and 'areal'. 'Areal' distribution means points are more likely to be displaced further within the range.
) # No container_gdf is passed, so points are not forced to stay inside a particular area.

# Run the masking.
donutmask.execute()

# NOTE(review): the return value is used as the masked GeoDataFrame; presumably it is
# the masked points with the displacement distance added - confirm.
masked_gdf_lamal = donutmask.displacement_distance()

In [None]:
# donutmask.k_anonymity_actual(address_points_gdf=regbl_address) # Name of the geodataframe including address points. 

In [None]:
# Longitude / latitude (EPSG:4326) of the masked LAMAL points.
# Reproject once and read both coordinates from the same result instead of
# running the identical reprojection twice.
masked_lamal_points_wgs84 = masked_gdf_lamal['geometry'].to_crs(4326)
masked_gdf_lamal['lon_masked'] = masked_lamal_points_wgs84.x
masked_gdf_lamal['lat_masked'] = masked_lamal_points_wgs84.y

In [None]:
# Export the masked LAMAL table (all columns of masked_gdf_lamal, without the row index).
masked_gdf_lamal.to_csv(data_folder/'Clean_data'/'masked_lamal_nonull.csv',index = False)

In [None]:
# Look-up table address -> address_id, built from the LCA table so that both
# data sets share the same address identifiers.
key_address_id = masked_gdf.set_index('address')['address_id'].to_dict()

In [None]:
key_address_id

In [None]:
# Address used for linkage: new_address where one exists, otherwise full_address.
masked_gdf_lamal['address'] = masked_gdf_lamal['full_address']
masked_gdf_lamal.loc[masked_gdf_lamal['new_address'].notnull(), 'address'] = masked_gdf_lamal['new_address']
# Reuse the LCA address identifiers; addresses absent from key_address_id get NaN.
masked_gdf_lamal['address_id'] = masked_gdf_lamal['address'].map(key_address_id)

In [None]:
# Cast NATION to str (presumably so that max() in the aggregation compares strings only).
# NOTE(review): astype(str) turns missing values into the literal 'nan', which sorts
# after upper-case codes and would therefore win a max() - confirm this is intended.
lamal['NATION'] = lamal['NATION'].astype(str)

In [None]:
# Collapse conflicting NATION values to one value per ID_LAMAL:
# 'CH' if any record of the person says 'CH', otherwise the largest value.
# The membership test must look at the values: `'CH' in x` on a pandas Series
# tests the index labels, not the values, so the 'CH' rule was never applied.
agg_nation = lamal.groupby('ID_LAMAL')['NATION'].apply(lambda x: 'CH' if (x == 'CH').any() else max(x))
lamal['NATION_NODUP'] = lamal['ID_LAMAL'].map(agg_nation.to_dict())
# Collapse conflicting SEXE values to the largest value per ID_LAMAL.
agg_sexe = lamal.groupby('ID_LAMAL')['SEXE'].apply(lambda x: max(x))
lamal['SEXE_NODUP'] = lamal['ID_LAMAL'].map(agg_sexe.to_dict())

In [None]:
# One row per person (de-duplicated demographic attributes) and masked address.
# Inner join on ID_LAMAL, so people without a masked location are dropped.
lamal_mask_link = lamal[
    ['ID_LAMAL', 'ANNEE_NAISSANCE', 'MOIS_NAISSANCE', 'SEXE_NODUP', 'NATION_NODUP']
].drop_duplicates().merge(
    masked_gdf_lamal[['ID_LAMAL', 'lon_masked', 'lat_masked', 'zipcode', 'address_id']],
    on='ID_LAMAL',
)

In [None]:
# Keep a single row per (ID_LAMAL, address_id): after the ascending sort, the row
# with the smallest lon_masked is the one retained.
lamal_mask_link = lamal_mask_link.sort_values('lon_masked').drop_duplicates(subset= ['ID_LAMAL','address_id'],keep = 'first')

In [None]:
lamal.ID_LAMAL.nunique()

In [None]:
lca_mask_link.ID_LCA.nunique()

In [None]:
# Save the LCA linkage table (without the row index).
lca_mask_link.to_csv(data_folder/'Clean_data'/'lca_masked_for_linkage.csv',index = False)

In [None]:
# Save the LAMAL linkage table (without the row index).
lamal_mask_link.to_csv(data_folder/'Clean_data'/'lamal_masked_for_linkage.csv',index = False)

In [None]:
lamal_mask_link.ID_LAMAL.nunique()

### Street-based geomasking

Not implemented: street-based geomasking is much too slow for our purpose (~100,000 addresses).

## Prepare record linkage

In [None]:
# Candidate pair generation: only records that agree exactly on all five
# blocking columns are paired.
indexer = rl.Index()
#champ_dict = {'NOANNEE':'NOANNEE','ID_LCA':'ID_LCA','ANNEE_NAISSANCE':'ANNEE_NAISSANCE','mois_mod2':'MOIS_NAISSANCE','CDPHYSSEXE':'SEXE','CDPHYSNATIONALITE':'NATION','TXCOMPLEMENTDESTLEGALE':'COMP_DEST_LEGAL','TXRUELEGALE':'street','TXRUENUMEROLEGALE':'adr_num','TXNPALEGALE':'zipcode','TXLOCALITELEGALE':'city'}
indexer.block(['ANNEE_NAISSANCE','MOIS_NAISSANCE','SEXE_NODUP','NATION_NODUP','zipcode'])
# pairs refers to rows through the index labels of the two linkage tables (LCA first, LAMAL second).
pairs = indexer.index(lca_mask_link, lamal_mask_link)

In [None]:
print (len(lca_mask_link), len(lamal_mask_link), len(pairs))

In [None]:
n_cpu = 6 # Number of parallel jobs passed to the comparison step; adjust to the machine

In [None]:
# Comparison object; the feature computation runs on n_cpu parallel jobs.
comparer = rl.Compare(n_jobs=n_cpu)

In [None]:
# Comparison features computed for every candidate pair.
# Original author's note on the distance parameters: 250 m (maximum geomasking
# displacement) - 25 m (minimum displacement, 0.1 * 250) / 2 (0.5 decay).
# NOTE(review): the note mentions a linear function but the call below uses
# method='exp'; also confirm the unit of offset/scale (presumably km).
# comparer.exact('given_name', 'given_name', label='given_name')
# comparer.string('surname', 'surname', method='jarowinkler', threshold=0.85, label='surname')
# comparer.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
# NOTE(review): the five attribute columns below are the same ones used for blocking,
# so these comparisons agree for every candidate pair; only 'distance' varies.
comparer.string('NATION_NODUP', 'NATION_NODUP',method='jarowinkler', threshold=0.85, label='NATION')
comparer.exact('ANNEE_NAISSANCE', 'ANNEE_NAISSANCE', label='ANNEE_NAISSANCE')
comparer.exact('MOIS_NAISSANCE', 'MOIS_NAISSANCE', label='MOIS_NAISSANCE')
comparer.exact('SEXE_NODUP', 'SEXE_NODUP', label='SEXE')
comparer.exact('zipcode', 'zipcode', label='zipcode')
# Similarity derived from the distance between the two masked locations.
comparer.geo(left_on_lat = 'lat_masked',left_on_lng = 'lon_masked',right_on_lat = 'lat_masked',right_on_lng = 'lon_masked',scale = 0.1, offset = 0.5, method = 'exp',label = 'distance')
features = comparer.compute(pairs, lca_mask_link, lamal_mask_link)

In [None]:
features.describe()

In [None]:
# Sum the comparison results.
features.sum(axis=1).value_counts().sort_index(ascending=False)

In [None]:
features[features.distance.between(0.01,0.999)]

In [None]:
# features[features.distance.between(0.5,0.999)]

In [None]:
# Fit recordlinkage's ECM classifier on the comparison features (no labelled pairs are supplied).
# binarize=0.5 is the threshold passed for turning feature values into 0/1.
cl = rl.ECMClassifier(binarize=0.5)
cl.fit(features)

In [None]:
# Print the parameters that are trained (m, u and p), their logarithms and the
# resulting feature weights.
print("p probability P(Match):", cl.p)
print("m probabilities P(x_i=1|Match):", cl.m_probs)
print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)
print("log weights of features:", cl.log_weights)
print("weights of features:", cl.weights)

# Classify the candidate pairs and report how many are predicted to be links.
links_pred = cl.predict(features)
print("Predicted number of links:", len(links_pred))

In [None]:
# Predict the match probability for each pair in the dataset.
probs = cl.prob(features)
print(probs)

In [None]:
# Save the comparison features; the pair index is written as well (no index=False).
features.to_csv(data_folder/'Clean_data'/'features_w_zipcode.csv')

In [None]:
# Save the match probabilities; the pair index is written as well (no index=False).
probs.to_csv(data_folder/'Clean_data'/'probs_w_zipcode.csv')

In [None]:
# Flat table: the two levels of the pair index become ordinary columns next to the probability.
df_probs = pd.DataFrame(probs).reset_index()

In [None]:
# At this point id_lca / id_lamal hold row index labels of the linkage tables, not person IDs.
df_probs.columns = ['id_lca','id_lamal','prob']

In [None]:
# Look-up tables: row index label of the linkage table -> person ID (as int).
key_id_index_lamal = lamal_mask_link['ID_LAMAL'].astype(int).to_dict()
key_id_index_lca = lca_mask_link['ID_LCA'].astype(int).to_dict()

In [None]:
# Replace the row index labels by the corresponding person IDs.
df_probs['id_lca'] = df_probs['id_lca'].astype(int).map(key_id_index_lca)
df_probs['id_lamal'] = df_probs['id_lamal'].astype(int).map(key_id_index_lamal)

In [None]:
# Pair key of the form "<id_lca>-<id_lamal>".
df_probs['id'] = df_probs['id_lca'].astype(str) + '-'+ df_probs['id_lamal'].astype(str)

In [None]:
# Look-up table "<id_lca>-<id_lamal>" -> match probability.
# A person with several addresses has several rows in the linkage tables, so the
# same ID pair can occur more than once in df_probs. Keep the highest probability
# per pair: a plain to_dict() silently keeps whichever duplicate comes last, which
# need not be the maximum that the idxmax() selection of the best match is based on.
key_probs = df_probs.groupby('id')['prob'].max().to_dict()

In [None]:
# For every id_lca, the id_lamal of the candidate with the highest match probability.
max_probs = df_probs.set_index('id_lamal').groupby(['id_lca'])['prob'].idxmax()

In [None]:
# Table of best matches: one row per id_lca with its selected id_lamal.
max_probs_df = pd.DataFrame(max_probs).reset_index()
max_probs_df.columns = ['id_lca','id_lamal']

In [None]:
# Same "<id_lca>-<id_lamal>" pair key as in df_probs.
max_probs_df['id'] = max_probs_df['id_lca'].astype(str) + '-'+ max_probs_df['id_lamal'].astype(str)

In [None]:
# Attach the match probability of each selected pair via the pair key.
max_probs_df['prob'] = max_probs_df['id'].map(key_probs)

In [None]:
# Export the matched ID lists separately; the row index is written too (no index=False).
max_probs_df[['id_lca']].to_csv(data_folder/'Clean_data'/'lca_list_pour_christophe.csv')
max_probs_df[['id_lamal']].to_csv(data_folder/'Clean_data'/'lamal_list_pour_christophe.csv')

## Export final file

In [None]:
# Final output: best LAMAL match and its probability for every LCA ID; the row index is written too.
max_probs_df.to_csv(data_folder/'Clean_data'/'max_probs_w_zipcode.csv')