In [1]:
import pandas as pd
import recordlinkage
df_with_truth = pd.read_csv('restaurant.csv', skip_blank_lines=True)
df_with_truth.head(9)

Unnamed: 0,name,addr,city,phone,type,cluster
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,310/246-1501,american,0
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,310-246-1501,steakhouses,0
2,arnie morton,435 s. la cienega boulevard,los angeles,310-246-1501,steakhouses,0
3,art's delicatessen,12224 ventura blvd.,studio city,818/762-1221,american,1
4,art's deli,12224 ventura blvd.,studio city,818-762-1221,delis,1
5,art's deli,12224 ventura blvd.,los angeles,818-762-1221,delis,1
6,hotel bel-air,701 stone canyon rd.,bel air,310/472-1211,californian,2
7,bel-air hotel,701 stone canyon rd.,bel air,310-472-1211,californian,2
8,bel-air,701 stone canyon road,bel air,(310) 472-1211,american,2


In [2]:
df = df_with_truth.drop(columns=['cluster', 'phone','type'])
df.head(2)

Unnamed: 0,name,addr,city
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles


In [3]:
df.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
name    881 non-null object
addr    881 non-null object
city    881 non-null object
dtypes: object(3)
memory usage: 20.7+ KB


In [4]:
df = df.fillna('')

In [5]:
import re
print("Remove irrelevant separators:")
irrelevant_regex = re.compile(r'[^a-z0-9\s]')
print("Remove multi-spaces:")
multispace_regex = re.compile(r'\s\s+')
def assign_no_symbols_name(df):
    return df.assign(
        name=df['name']
             .str.replace(irrelevant_regex, ' ')
             .str.replace(multispace_regex, ' '))

df = assign_no_symbols_name(df)
df.head(9)


Remove irrelevant separators:
Remove multi-spaces:


Unnamed: 0,name,addr,city
0,arnie morton s of chicago,435 s. la cienega blv.,los angeles
1,arnie morton s of chicago,435 s. la cienega blvd.,los angeles
2,arnie morton,435 s. la cienega boulevard,los angeles
3,art s delicatessen,12224 ventura blvd.,studio city
4,art s deli,12224 ventura blvd.,studio city
5,art s deli,12224 ventura blvd.,los angeles
6,hotel bel air,701 stone canyon rd.,bel air
7,bel air hotel,701 stone canyon rd.,bel air
8,bel air,701 stone canyon road,bel air


In [6]:
indexer = recordlinkage.FullIndex()
pairs = indexer.index(df)



In [7]:
len(pairs)

387640

In [8]:
# indexer = recordlinkage.BlockIndex(on='type')
# pairs = indexer.index(df)

In [9]:
len(pairs)

387640

In [10]:
pairs.to_frame()[:10].values

array([[1, 0],
       [2, 0],
       [2, 1],
       [3, 0],
       [3, 1],
       [3, 2],
       [4, 0],
       [4, 1],
       [4, 2],
       [4, 3]], dtype=int64)

In [11]:
compare_cl = recordlinkage.Compare()

In [12]:
compare_cl.string('name', 'name', method='jarowinkler',label='name')
compare_cl.string('city', 'city', method='jarowinkler',label='city')
compare_cl.string('addr', 'addr', method='jarowinkler',label='addr')

<Compare>

In [13]:
comparision_vectors = compare_cl.compute(pairs, df)
comparision_vectors.head(5)

Unnamed: 0,Unnamed: 1,name,city,addr
1,0,1.0,1.0,0.991304
2,0,0.896,1.0,0.946465
2,1,0.896,1.0,0.954267
3,0,0.562492,0.40404,0.59362
3,1,0.562492,0.40404,0.62513


In [14]:
mask1=comparision_vectors['name']>=0.9
mask2=comparision_vectors['addr']>=0.9
mask3=comparision_vectors['city']>=0.9
match9=comparision_vectors[mask1 & mask2 & mask3]
match9.head()
# print(match9)


Unnamed: 0,Unnamed: 1,name,city,addr
1,0,1.0,1.0,0.991304
4,3,0.911111,1.0,1.0
8,7,0.907692,1.0,0.970952
10,9,1.0,1.0,1.0
11,9,1.0,1.0,1.0


In [15]:
match_records=set(match9.index)

In [16]:
print(f"Match Records on 0.9: ")
for i in match_records:
    display(df.iloc[list(i)][['name','addr','city']])

Match Records on 0.9: 


Unnamed: 0,name,addr,city
50,pinot bistro,12969 ventura boulevard,studio city
49,pinot bistro,12969 ventura blvd.,studio city


Unnamed: 0,name,addr,city
66,cafe lalo,201 w. 83rd st.,new york city
65,cafe lalo,201 w. 83rd st.,new york


Unnamed: 0,name,addr,city
206,alain rondelli,126 clement st.,san francisco
205,alain rondelli,126 clement st.,san francisco


Unnamed: 0,name,addr,city
224,khan toke thai house,5937 geary blvd.,san francisco
223,khan toke thai house,5937 geary blvd.,san francisco


Unnamed: 0,name,addr,city
90,island spice,402 w. 44th st.,new york city
89,island spice,402 w. 44th st.,new york


Unnamed: 0,name,addr,city
218,fleur de lys,777 sutter st.,san francisco
217,fleur de lys,777 sutter st.,san francisco


Unnamed: 0,name,addr,city
85,gotham bar and grill,12 e 12th st,new york city
84,gotham bar grill,12 e. 12th st.,new york city


Unnamed: 0,name,addr,city
214,campton place,340 stockton st.,san francisco
213,campton place,340 stockton st.,san francisco


Unnamed: 0,name,addr,city
82,four seasons,99 e. 52nd st.,new york city
81,four seasons grill room,99 e. 52nd st.,new york


Unnamed: 0,name,addr,city
98,le bernardin,155 w. 51st st.,new york city
97,le bernardin,155 w. 51st st.,new york


Unnamed: 0,name,addr,city
210,boulevard,1 mission st.,san francisco
209,boulevard,1 mission st.,san francisco


Unnamed: 0,name,addr,city
226,la folie,2316 polk st.,san francisco
225,la folie,2316 polk st.,san francisco


Unnamed: 0,name,addr,city
12,cafe bizou,14016 ventura blvd,sherman oaks
11,cafe bizou,14016 ventura blvd.,sherman oaks


Unnamed: 0,name,addr,city
236,postrio,545 post st.,san francisco
235,postrio,545 post st.,san francisco


Unnamed: 0,name,addr,city
153,coyote cafe las vegas,3799 las vegas blvd. s.,las vegas
152,coyote cafe,3799 las vegas blvd. s,las vegas


Unnamed: 0,name,addr,city
194,pano s paul s,1232 west paces ferry rd nw,atlanta
192,pano s and paul s,1232 w. paces ferry rd.,atlanta


Unnamed: 0,name,addr,city
56,valentino,3115 pico blvd.,santa monica
55,valentino,3115 pico blvd.,santa monica


Unnamed: 0,name,addr,city
157,palace court,3570 las vegas blvd. s.,las vegas
156,palace court,3570 las vegas blvd. s,las vegas


Unnamed: 0,name,addr,city
103,lespinasse,2 e 55th st,new york
101,lespinasse,2 e. 55th st.,new york


Unnamed: 0,name,addr,city
161,steak house the,2880 las vegas blvd. s.,las vegas
160,steak house,2880 las vegas blvd. s,las vegas


Unnamed: 0,name,addr,city
11,cafe bizou,14016 ventura blvd.,sherman oaks
10,cafe bizou,14016 ventura blvd.,sherman oaks


Unnamed: 0,name,addr,city
19,citrus,6703 melrose avenue,los angeles
18,citrus,6703 melrose ave.,los angeles


Unnamed: 0,name,addr,city
220,fringale,570 fourth st.,san francisco
219,fringale,570 4th st.,san francisco


Unnamed: 0,name,addr,city
80,felidia,243 e. 58th st.,new york city
79,felidia,243 e. 58th st.,new york


Unnamed: 0,name,addr,city
10,cafe bizou,14016 ventura blvd.,sherman oaks
9,cafe bizou,14016 ventura blvd.,sherman oaks


Unnamed: 0,name,addr,city
85,gotham bar and grill,12 e 12th st,new york city
83,gotham bar grill,12 e. 12th st.,new york


Unnamed: 0,name,addr,city
123,picholine,35 w. 64th st.,new york city
122,picholine,35 w. 64th st.,new york


Unnamed: 0,name,addr,city
763,palm too,840 second ave.,new york city
762,palm,837 second ave.,new york city


Unnamed: 0,name,addr,city
11,cafe bizou,14016 ventura blvd.,sherman oaks
9,cafe bizou,14016 ventura blvd.,sherman oaks


Unnamed: 0,name,addr,city
19,citrus,6703 melrose avenue,los angeles
17,citrus,6703 melrose ave.,los angeles


Unnamed: 0,name,addr,city
1,arnie morton s of chicago,435 s. la cienega blvd.,los angeles
0,arnie morton s of chicago,435 s. la cienega blv.,los angeles


Unnamed: 0,name,addr,city
127,rainbow room,30 rockefeller plaza,new york city
126,rainbow room,30 rockefeller plaza,new york


Unnamed: 0,name,addr,city
39,matsuhisa,129 n. la cienega blvd.,beverly hills
38,matsuhisa,129 n. la cienega blvd.,beverly hills


Unnamed: 0,name,addr,city
240,rose pistola,532 columbus ave.,san francisco
239,rose pistola,532 columbus ave.,san francisco


Unnamed: 0,name,addr,city
839,ritz carlton cafe atlanta,181 peachtree st.,atlanta
200,ritz carlton restaurant,181 peachtree st.,atlanta


Unnamed: 0,name,addr,city
155,le montrachet bistro,3000 paradise rd.,las vegas
154,le montrachet,3000 w. paradise rd.,las vegas


Unnamed: 0,name,addr,city
163,tillerman the,2245 e. flamingo rd.,las vegas
162,tillerman,2245 e. flamingo rd.,las vegas


Unnamed: 0,name,addr,city
119,park avenue cafe new york city,100 e. 63rd st.,new york city
118,park avenue cafe,100 e. 63rd st.,new york


Unnamed: 0,name,addr,city
62,aquavit,13 w. 54th st.,new york city
61,aquavit,13 w. 54th st.,new york


Unnamed: 0,name,addr,city
78,dawat,210 e. 58th st.,new york city
77,dawat,210 e. 58th st.,new york


Unnamed: 0,name,addr,city
194,pano s paul s,1232 west paces ferry rd nw,atlanta
193,pano s paul s,1232 w. paces ferry rd.,atlanta


Unnamed: 0,name,addr,city
159,second street grill,200 e. fremont st.,las vegas
158,second street grille,200 e. fremont st.,las vegas


Unnamed: 0,name,addr,city
12,cafe bizou,14016 ventura blvd,sherman oaks
9,cafe bizou,14016 ventura blvd.,sherman oaks


Unnamed: 0,name,addr,city
230,masa s,648 bush st.,san francisco
229,masa s,648 bush st.,san francisco


Unnamed: 0,name,addr,city
107,manhattan ocean club,57 w. 58th st.,new york city
106,manhattan ocean club,57 w. 58th st.,new york


Unnamed: 0,name,addr,city
121,petrossian,182 w. 58th st.,new york city
120,petrossian,182 w. 58th st.,new york


Unnamed: 0,name,addr,city
135,seryna,11 e. 53rd st.,new york city
134,seryna,11 e. 53rd st.,new york


Unnamed: 0,name,addr,city
185,indigo coastal grill,1397 n. highland ave.,atlanta
184,indigo coastal grill,1397 n. highland ave.,atlanta


Unnamed: 0,name,addr,city
202,toulouse,293-b peachtree rd.,atlanta
201,toulouse,b peachtree rd.,atlanta


Unnamed: 0,name,addr,city
131,san domenico,240 central park s.,new york city
130,san domenico,240 central park s,new york


Unnamed: 0,name,addr,city
47,philippe the original,1001 north alameda,los angeles
45,philippe s the original,1001 n. alameda st.,los angeles


Unnamed: 0,name,addr,city
189,mary mac s tea room,224 ponce de leon ave.,atlanta
188,mary mac s tea room,224 ponce de leon ave.,atlanta


Unnamed: 0,name,addr,city
18,citrus,6703 melrose ave.,los angeles
17,citrus,6703 melrose ave.,los angeles


Unnamed: 0,name,addr,city
94,la caravelle,33 w. 55th st.,new york city
93,la caravelle,33 w. 55th st.,new york


Unnamed: 0,name,addr,city
222,hawthorne lane,22 hawthorne st.,san francisco
221,hawthorne lane,22 hawthorne st.,san francisco


Unnamed: 0,name,addr,city
8,bel air,701 stone canyon road,bel air
7,bel air hotel,701 stone canyon rd.,bel air


Unnamed: 0,name,addr,city
177,delectables,1 margaret mitchell sq.,atlanta
176,delectables,1 margaret mitchell sq.,atlanta


Unnamed: 0,name,addr,city
25,granita,23725 w. malibu rd.,malibu
24,granita,23725 w. malibu rd.,malibu


Unnamed: 0,name,addr,city
60,21 club,21 w. 52nd st.,new york city
59,21 club,21 w. 52nd st.,new york


Unnamed: 0,name,addr,city
68,cafe des artistes,1 w. 67th st.,new york city
67,cafe des artistes,1 w. 67th st.,new york


Unnamed: 0,name,addr,city
105,lutece,249 e. 50th st.,new york city
104,lutece,249 e. 50th st.,new york


Unnamed: 0,name,addr,city
151,chin s,3200 las vegas blvd. s.,las vegas
150,chin s,3200 las vegas blvd. s,las vegas


Unnamed: 0,name,addr,city
14,campanile,624 s. la brea ave.,los angeles
13,campanile,624 s. la brea ave.,los angeles


Unnamed: 0,name,addr,city
64,aureole,34 e. 61st st.,new york city
63,aureole,34 e. 61st st.,new york


Unnamed: 0,name,addr,city
33,le chardonnay los angeles,8284 melrose ave.,los angeles
32,le chardonnay,8284 melrose ave.,los angeles


Unnamed: 0,name,addr,city
109,march,405 e. 58th st.,new york city
108,march,405 e. 58th st.,new york


Unnamed: 0,name,addr,city
117,oceana,55 e. 54th st.,new york city
116,oceana,55 e. 54th st.,new york


Unnamed: 0,name,addr,city
147,union square cafe,21 e. 16th st.,new york city
146,union square cafe,21 e. 16th st.,new york


Unnamed: 0,name,addr,city
173,buckhead diner,3073 piedmont rd.,atlanta
172,buckhead diner,3073 piedmont road,atlanta


Unnamed: 0,name,addr,city
181,hedgerose heights inn the,490 e. paces ferry rd. ne,atlanta
180,hedgerose heights inn,490 e. paces ferry rd.,atlanta


Unnamed: 0,name,addr,city
16,chinois on main,2709 main st.,santa monica
15,chinois on main,2709 main st.,santa monica


Unnamed: 0,name,addr,city
92,jo jo,160 e. 64th st.,new york city
91,jo jo,160 e. 64th st.,new york


Unnamed: 0,name,addr,city
208,aqua,252 california st.,san francisco
207,aqua,252 california st.,san francisco


Unnamed: 0,name,addr,city
12,cafe bizou,14016 ventura blvd,sherman oaks
10,cafe bizou,14016 ventura blvd.,sherman oaks


Unnamed: 0,name,addr,city
216,chez michel,804 north point st.,san francisco
215,chez michel,804 northpoint,san francisco


Unnamed: 0,name,addr,city
234,plumpjack cafe,3127 fillmore st.,san francisco
233,plumpjack cafe,3201 fillmore st.,san francisco


Unnamed: 0,name,addr,city
204,veni vidi vici,41 14th st.,atlanta
203,veni vidi vici,41 14th st.,atlanta


Unnamed: 0,name,addr,city
44,patina,5955 melrose ave.,los angeles
43,patina,5955 melrose ave.,los angeles


Unnamed: 0,name,addr,city
52,rex il ristorante,617 s. olive st.,los angeles
51,rex il ristorante,617 s. olive st.,los angeles


Unnamed: 0,name,addr,city
76,daniel,20 e. 76th st.,new york city
75,daniel,20 e. 76th st.,new york


Unnamed: 0,name,addr,city
84,gotham bar grill,12 e. 12th st.,new york city
83,gotham bar grill,12 e. 12th st.,new york


Unnamed: 0,name,addr,city
191,nikolai s roof,255 courtland st.,atlanta
190,nikolai s roof,255 courtland st. at harris st.,atlanta


Unnamed: 0,name,addr,city
193,pano s paul s,1232 w. paces ferry rd.,atlanta
192,pano s and paul s,1232 w. paces ferry rd.,atlanta


Unnamed: 0,name,addr,city
212,cafe claude,7 claude ln.,san francisco
211,cafe claude,7 claude la.,san francisco


Unnamed: 0,name,addr,city
4,art s deli,12224 ventura blvd.,studio city
3,art s delicatessen,12224 ventura blvd.,studio city


# Classification 
Threshold Based Classification
A simple way to classify comparision vectors as matches or nonmatches is to compute a weighted average over the vector to get a score:

In [23]:
import numpy as np

In [24]:
scores=np.average(comparision_vectors.values,axis=1)
scored_comparision_vectors=comparision_vectors.assign(score=scores)
scored_comparision_vectors.head()


Unnamed: 0,Unnamed: 1,name,city,addr,score
1,0,1.0,1.0,0.991304,0.997101
2,0,0.896,1.0,0.946465,0.947488
2,1,0.896,1.0,0.954267,0.950089
3,0,0.562492,0.40404,0.59362,0.520051
3,1,0.562492,0.40404,0.62513,0.530554


In [25]:
matches=scored_comparision_vectors[scored_comparision_vectors['score']>=0.9]

In [49]:
ud={}
unique=[]
duplicate=[]
for i in set(matches.index):
    if (list(i)[0]) not in ud:
        unique.append((list(i)[0]))
        ud=list(ud)+unique
                    
    if (list(i)[1]) not in ud:
        duplicate.append((list(i)[1]))
        ud=list(ud)+duplicate
    ud=set(ud)
unique=np.array(unique)
unique.sort()
print(f"Unique Records index:",unique)
duplicate=np.array(duplicate)
duplicate.sort()
print(f"Duplicate Records index:",duplicate)  
ud=list(ud)

Unique Records index: [  2   4   8  12  14  16  19  21  23  25  33  39  44  47  50  52  56  60
  62  64  66  68  70  74  76  78  80  82  85  86  88  90  92  94  96  98
 103 105 107 109 113 115 117 119 121 123 125 127 129 131 135 147 149 151
 153 155 157 159 161 163 165 167 169 171 173 175 177 179 181 183 185 187
 189 191 194 197 198 200 202 204 206 208 210 212 214 216 218 220 222 224
 226 228 230 232 234 236 238 240 415 419 763 794 839]
Duplicate Records index: [  0   1   3   7   9  10  11  13  15  17  18  20  22  24  32  38  43  45
  49  51  55  59  61  63  65  67  69  73  75  77  79  81  83  84  87  89
  91  93  95  97 101 102 104 106 108 112 114 116 118 120 122 124 126 128
 130 134 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176
 178 180 182 184 186 188 190 192 193 195 196 199 201 203 205 207 209 211
 213 215 217 219 221 223 225 227 229 231 233 235 237 239 414 418 762 793]


In [50]:
print(len(unique))
print(len(duplicate))
print(len(ud))

103
108
211


In [78]:
# print(ud)

In [75]:
unique1=set(unique)
unique2=set(df.index)-set(ud)
unique=list(unique1)+list(unique2)

In [79]:
df.iloc[unique]

Unnamed: 0,name,addr,city
2,arnie morton,435 s. la cienega boulevard,los angeles
4,art s deli,12224 ventura blvd.,studio city
8,bel air,701 stone canyon road,bel air
12,cafe bizou,14016 ventura blvd,sherman oaks
14,campanile,624 s. la brea ave.,los angeles
16,chinois on main,2709 main st.,santa monica
19,citrus,6703 melrose avenue,los angeles
21,fenix,8358 sunset blvd.,w. hollywood
23,fenix at the argyle,8358 sunset blvd. west,hollywood
25,granita,23725 w. malibu rd.,malibu


In [81]:
df.iloc[duplicate]

Unnamed: 0,name,addr,city
0,arnie morton s of chicago,435 s. la cienega blv.,los angeles
1,arnie morton s of chicago,435 s. la cienega blvd.,los angeles
3,art s delicatessen,12224 ventura blvd.,studio city
7,bel air hotel,701 stone canyon rd.,bel air
9,cafe bizou,14016 ventura blvd.,sherman oaks
10,cafe bizou,14016 ventura blvd.,sherman oaks
11,cafe bizou,14016 ventura blvd.,sherman oaks
13,campanile,624 s. la brea ave.,los angeles
15,chinois on main,2709 main st.,santa monica
17,citrus,6703 melrose ave.,los angeles


In [82]:
108+773

881