In [19]:
import numpy as np
import pandas as pd
import os
import re
from pathlib import Path
import difflib
from collections import Counter
#from metaphone import doublemetaphone
from recordlinkage.preprocessing import clean

# Project root that contains the `data/` directory read below.
# Set the DATABASE_RECORDS_LINKAGE_ROOT environment variable to run the notebook
# on another machine; the original author's location is kept as the fallback so
# existing setups behave exactly as before.
ROOT = os.environ.get('DATABASE_RECORDS_LINKAGE_ROOT', '/home/thabib/database_records_linkage')

In [26]:
# Load both constrained source tables from <ROOT>/data in one pass:
# source 1 has already-split address columns, source 2 a free-text `address`.
s1_cstr, s2_cstr = (
    pd.read_csv(os.path.join(ROOT, 'data', csv_name))
    for csv_name in ('source1_cstr.csv', 'source2_cstr.csv')
)

In [28]:
# Render both source tables (rich, truncated view) one after the other.
display(s1_cstr, s2_cstr)

Unnamed: 0,id,name,street_number,street_type,street_name,address_line2,postal_code,city
0,542012000015,societe des sucreries du marquenterre,,rue,de la fontaine,,80120,rue
1,542012000031,societe des sucreries du marquenterre,,rte,nationale,,62140,marconnelle
2,545011900028,le grand cafe et gassendi,,,saint christophe,route de marseille,4000,digne les bains
3,552017600016,hernas cartonnage,50.0,rue,pasteur,,80210,feuquieres en vimeu
4,552017600032,hernas cartonnage,,,zone des hayettes,,80210,chepy
...,...,...,...,...,...,...,...,...
1826164,90771119600010,bionormande,,rue,de la grande flandre,lot 1 7 a,76270,neufchatel en bray
1826165,90771120400012,ahela bat,18.0,rue,saint clement,,93200,saint denis
1826166,90771121200015,deco batiment,7.0,rue,auguste blanqui,,93200,saint denis
1826167,90760906900014,chateaux merveilles,15.0,rue,de milan,,75009,paris 9


Unnamed: 0,address,website,id,name
0,226 r menthon 01380 saint cyr sur menthon,,,sandrine berny
1,50 rte bage 01380 saint cyr sur menthon,,8.412263e+13,sarl landrix freres
2,120 r menthon 01380 saint cyr sur menthon,,7.913458e+13,marion kwasniewski
3,moulin 28 chem ruillat 01380 saint cyr sur men...,,,claude fondation
4,350 rte loeze 01380 saint cyr sur menthon,,4.341618e+13,manigand stephane
...,...,...,...,...
3530297,,,8.497810e+13,jacoby o
3530298,quart guardere 83150 bandol,http://agence.carrefourlocation.fr,,carrefour location
3530299,5 r jardins 83150 bandol,http://aquabike-en-mer.com,7.951850e+13,calvet richard
3530300,54 traverse laboureur 83150 bandol,http://dolcefarnientebandol.com,5.392586e+13,dolce far niente


# Using Recordlinkage

See the recordlinkage documentation (https://recordlinkage.readthedocs.io/en/latest/about.html) and this record-linking tutorial (https://pbpython.com/record-linking.html).

### Parsing 'address' in source2 into separate columns

In [31]:
from postal.parser import parse_address

# 'city' – city
# 'postcode' – postal_code
# 'house_number' – street_number
# 'road' – street_name
# 'house' OR anything else – address_line2

In [30]:
# Sorted array of the distinct street-type abbreviations present in source 1
# (missing values removed first); used later to recognise the leading token of
# a parsed road.
unique_street_types = np.unique(s1_cstr['street_type'].dropna().to_numpy())
unique_street_types

array(['all', 'av', 'bd', 'cami', 'car', 'che', 'chem', 'chs', 'cite',
       'clos', 'cor', 'cote', 'cour', 'crs', 'dom', 'dsc', 'eca', 'esp',
       'fg', 'gr', 'ham', 'hle', 'imp', 'ld', 'lot', 'mar', 'mte', 'parc',
       'pas', 'pl', 'plan', 'pln', 'plt', 'pont', 'port', 'pro', 'prv',
       'qua', 'quai', 'res', 'rle', 'roc', 'rpt', 'rte', 'rue', 'sen',
       'sq', 'tpl', 'tra', 'vla', 'vlge', 'voie', 'za', 'zac', 'zi',
       'zone'], dtype=object)

In [85]:
# Example: the 'Bis' suffix stays inside 'house_number' ('22 bis'), so that
# value cannot be passed to int() as-is; the street type arrives as the first
# token of 'road' ('r nationale').
parse_address('22 Bis r Nationale, 45190 BEAUGENCY')

[('22 bis', 'house_number'),
 ('r nationale', 'road'),
 ('45190', 'postcode'),
 ('beaugency', 'city')]

In [87]:
# Example: free text before the number is labelled 'house'; the street-type
# abbreviation ('av') is again the first token of 'road'.
parse_address('Parking Station De Lavage 505 av Georges Pompidou, 73200 GILLY SUR ISÈRE')

[('parking station de lavage', 'house'),
 ('505', 'house_number'),
 ('av georges pompidou', 'road'),
 ('73200', 'postcode'),
 ('gilly sur isère', 'city')]

In [4]:
# Example: upper-case input with no space after the comma; the returned
# components are lower-cased and postcode/city are still separated.
parse_address('4 BD DE LA REPUBLIQUE,80100 ABBEVILLE')

[('4', 'house_number'),
 ('bd de la republique', 'road'),
 ('80100', 'postcode'),
 ('abbeville', 'city')]

In [5]:
# Example: no house number; the zone name ('zac de cadréan') is labelled
# 'house' while 'bd cadréan' is returned as the road.
parse_address('Zac De Cadréan bd Cadréan, 44550 MONTOIR DE BRETAGNE')

[('zac de cadréan', 'house'),
 ('bd cadréan', 'road'),
 ('44550', 'postcode'),
 ('montoir de bretagne', 'city')]

In [32]:
# Example: no house number and no 'house' component; the 'zi' prefix is
# returned as the first token of 'road'.
parse_address('zi Louis Delaporte, 76370 ROUXMESNIL BOUTEILLES')

[('zi louis delaporte', 'road'),
 ('76370', 'postcode'),
 ('rouxmesnil bouteilles', 'city')]

In [33]:
def _parse_street_number(raw_number):
    """Return the street number found in ``raw_number`` as an ``int``, or NaN if it has no digits."""
    try:
        return int(raw_number)
    except ValueError:
        pass
    # Prefer a standalone numeric token ('22 bis' -> 22) over digits glued to
    # letters ('12b' -> 12).
    for token in raw_number.split():
        if token.isdecimal():
            return int(token)
    embedded = re.search(r'\d+', raw_number)
    return int(embedded.group()) if embedded else np.nan


def _parse_address_fields(address, known_street_types):
    """Split one free-text address into the six structured fields (as a tuple)."""
    street_number = street_type = street_name = np.nan
    address_line2 = postal_code = city = np.nan

    # Missing addresses show up as float NaN (or None); there is nothing to parse.
    if address is None or isinstance(address, float):
        return (street_number, street_type, street_name, address_line2, postal_code, city)

    leftovers = []
    for value, label in parse_address(address):
        if label == 'city':
            city = value
        elif label == 'postcode':
            postal_code = value
        elif label == 'road':
            tokens = value.split()
            if not tokens:
                # Blank road component: nothing usable, and tokens[0] would raise.
                continue
            head, rest = tokens[0], ' '.join(tokens[1:])
            if head == 'r':
                # 'r' is the abbreviation used for 'rue' in the free-text addresses.
                street_type, street_name = 'rue', rest
            elif head in known_street_types:
                street_type, street_name = head, rest
            else:
                # No recognisable street type: keep the whole road as the name.
                street_name = value
        elif label == 'house_number':
            street_number = _parse_street_number(value)
        else:
            # 'house' and any other label are collected into address_line2.
            leftovers.append(value)

    joined = ' '.join(leftovers).strip()
    if joined:
        address_line2 = joined
    return (street_number, street_type, street_name, address_line2, postal_code, city)


def create_address_fields_dataframe(full_address_series=None):
    """Parse free-text addresses into the structured address columns.

    Each address is run through ``parse_address`` and its labelled components
    are mapped to columns: 'city' -> ``city``, 'postcode' -> ``postal_code``,
    'house_number' -> ``street_number`` (first integer found), 'road' ->
    ``street_type`` + ``street_name`` (the leading token is treated as the type
    when it is 'r' or listed in the module-level ``unique_street_types``), and
    every other component is space-joined into ``address_line2``.

    Parameters
    ----------
    full_address_series : iterable of str / NaN, optional
        Addresses to parse. ``None`` is treated as an empty input.

    Returns
    -------
    pandas.DataFrame
        One row per input address, indexed 0..n-1, object-dtype columns
        ``street_number, street_type, street_name, address_line2, postal_code,
        city``. Fields that could not be extracted are NaN.

    Notes
    -----
    Rows are accumulated in a list and the frame is built once at the end;
    appending with ``df.loc[idx] = ...`` re-allocates the frame on every row
    and becomes prohibitively slow for millions of addresses.
    """
    columns = ['street_number', 'street_type', 'street_name', 'address_line2',
               'postal_code', 'city']
    if full_address_series is None:
        full_address_series = ()

    # Hoisted out of the loop: set membership instead of scanning the array per row.
    known_street_types = set(unique_street_types)

    rows = []
    for idx, address in enumerate(full_address_series):
        if idx % 10000 == 0:
            print('{} entries completed...'.format(idx))
        rows.append(_parse_address_fields(address, known_street_types))

    # dtype=object keeps parsed integers as ints next to NaN placeholders.
    return pd.DataFrame(rows, columns=columns, dtype=object)

In [None]:
# Parse every free-text address of source 2 into structured address columns.
# NOTE(review): `s2_nond_cstr` is not defined in any cell shown here (only
# `s2_cstr` is loaded) — presumably a de-duplicated copy of `s2_cstr`; confirm
# it is created before this cell so the notebook runs on a fresh kernel.
df_address_fields = create_address_fields_dataframe(s2_nond_cstr['address'])
df_address_fields

0 entries completed...
10000 entries completed...
20000 entries completed...
30000 entries completed...
40000 entries completed...
50000 entries completed...
60000 entries completed...
70000 entries completed...
80000 entries completed...
90000 entries completed...
100000 entries completed...
110000 entries completed...
120000 entries completed...
130000 entries completed...
140000 entries completed...
150000 entries completed...
160000 entries completed...
170000 entries completed...
180000 entries completed...
190000 entries completed...
200000 entries completed...
210000 entries completed...
220000 entries completed...
230000 entries completed...
240000 entries completed...


### Matching rows using recordlinkage