## Fuzzy match 

## autor: Roberto Mendoza 

In [1]:
# !pip install fuzzywuzzy
# !pip install python-Levenshtein

In [2]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re
import numpy as np
import pandas as pd
import swifter
import unidecode # to drop tildes
import itertools

In [3]:
# Compare names
# Case 1: same name, but with different casing plus stray punctuation and a digit

name1 = "Juan Pablo Villanueva Melcochita"
name2 = "juan pablo! villanueva 5 melcochita.."

# `is` tests object identity, `==` tests equality of the text; both print False here
print(name1 is name2)
print(name1 == name2)

False
False


In [4]:
# Similarity score of the two raw strings, before any clean-up
fuzz.ratio(name1,name2)

81

In [5]:
# Pre-processing is needed: lowercase both names and remove everything
# except letters and whitespace from name2 before scoring

fuzz.ratio(name1.lower(), re.sub('[^a-zA-Z\\s]', '',name2).lower() )

98

In [6]:
# Show the two pre-processed names one above the other
print(name1.lower())
print( re.sub('[^a-zA-Z\\s]', '',name2).lower() )

# Compare the text with `==`. An `is` comparison checks object identity, which is
# False for two separately built strings whether or not their characters match.
# The result is still False: deleting the "5" leaves a double space inside name2.
print(name1.lower() == re.sub('[^a-zA-Z\\s]', '', name2).lower())

juan pablo villanueva melcochita
juan pablo villanueva  melcochita
False


In [7]:
# Case 2: missing words (name2 lacks two of the words in name1)

name1 = "Juan Pablo Villanueva Melcochita"
name2 = "Juan melcochita"

In [8]:
# Plain ratio on the strings exactly as written (no lowercasing)
print(fuzz.ratio(name1,name2))

# partial_ratio looks at both strings and picks the shorter one to compare against the other.
# Here it picks name2 and, from there, compares it against name1.
print(fuzz.partial_ratio(name1.lower(),name2.lower()))

60
87


In [9]:
# Case 3: different word order

# The token scorers pre-process the strings: lowercase, remove punctuation (,.?"$")

name1 = "Juan Pablo Villanueva"
name2 = "Villanueva Juan Pablo"

# ratio and partial_ratio are sensitive to the order of the words
print(fuzz.ratio(name1.lower(),name2.lower()))
print(fuzz.partial_ratio(name1.lower(),name2.lower()))

# token_sort_ratio is the cell's displayed result; it is not affected by the reordering
fuzz.token_sort_ratio(name1.lower(),name2.lower())

48
51


100

In [10]:
# Case 4: repeated words

name1 = "Juan Pablo Villanueva"
name2 = "Villanueva Villanueva Juan Pablo PABLO"

print(fuzz.ratio(name1.lower(),name2.lower()))
print(fuzz.partial_ratio(name1.lower(),name2.lower()))
print( fuzz.token_sort_ratio(name1.lower(),name2.lower()) )

# token_set_ratio is the only one of the four scorers that is not penalised by the repeats
print( fuzz.token_set_ratio(name1.lower(),name2.lower()) )

47
67
71
100


In [11]:
# Noisy candidate: typos, punctuation, digits and mixed casing all at once
name1 = "Juan Pablo Villanueva"
name3 = "JuUan Po..%?+ 435, illanuevA$"

# Only the first call lowercases its inputs; the other three receive the raw strings
print(fuzz.ratio(name1.lower(),name3.lower()))
print(fuzz.partial_ratio(name1,name3))
print( fuzz.token_sort_ratio(name1,name3))
print( fuzz.token_set_ratio(name1,  name3 ) )

68
52
47
47


In [12]:
# Compare one name (name1) against a list of candidate names

lista_nombres = ['juan gutierrez', 'Maria flores', 'Paty nuñez', 'Pablo miranda', 'villa juan']

# ranked candidates together with their scores

print( process.extract(name1, lista_nombres) )

# limit = 3 keeps only three (candidate, score) pairs

print(process.extract(name1, lista_nombres, limit =3))

# single best-scoring candidate

process.extractOne(name1, lista_nombres)


[('juan gutierrez', 86), ('Pablo miranda', 86), ('villa juan', 86), ('Paty nuñez', 40), ('Maria flores', 36)]
[('juan gutierrez', 86), ('Pablo miranda', 86), ('villa juan', 86)]


('juan gutierrez', 86)

In [13]:
# Rank the same candidates once per scorer, in this order:
#   1. fuzz ratio
#   2. fuzz partial ratio
#   3. fuzz token sort ratio
#   4. fuzz token set ratio

for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print( process.extract(name1, lista_nombres, scorer = scorer) )

[('Pablo miranda', 59), ('villa juan', 45), ('juan gutierrez', 40), ('Paty nuñez', 39), ('Maria flores', 30)]
[('villa juan', 70), ('Pablo miranda', 69), ('juan gutierrez', 43), ('Paty nuñez', 40), ('Maria flores', 33)]
[('villa juan', 65), ('Pablo miranda', 47), ('Maria flores', 30), ('Paty nuñez', 27), ('juan gutierrez', 23)]
[('villa juan', 65), ('Pablo miranda', 59), ('juan gutierrez', 44), ('Maria flores', 30), ('Paty nuñez', 27)]


In [14]:
# Three best candidates for name1 under fuzz.ratio, as (candidate, score) pairs
matches = process.extract(name1, lista_nombres, scorer = fuzz.ratio , limit = 3 )
matches 

[('Pablo miranda', 59), ('villa juan', 45), ('juan gutierrez', 40)]

In [15]:


def therehold(x, min_score):
    """Blank out a fuzzy-match record whose score is below ``min_score``.

    Parameters
    ----------
    x : indexable
        Match record; ``x[0]`` is the matched name and ``x[1]`` its score.
        Any further items are ignored.
    min_score : number
        Scores strictly below this value are rejected.

    Returns
    -------
    tuple
        ``(np.nan, np.nan)`` when ``x[1] < min_score``, otherwise ``(x[0], x[1])``.
    """
    score = x[1]
    rejected = score < min_score
    # x[0] is only read for records that are kept
    return (np.nan, np.nan) if rejected else (x[0], score)

# Apply a minimum score of 50 to every candidate found above
[therehold(match, 50) for match in matches]

[('Pablo miranda', 59), (nan, nan), (nan, nan)]

In [16]:
def _top_matches(row, column_data2: pd.Series, min_score, scorer, limit = 3):
    """Shared implementation behind the four ``fuzz_*`` helpers.

    Asks ``process.extract`` for the ``limit`` best candidates for ``row`` among
    ``column_data2`` using ``scorer``, passes every candidate through
    ``therehold`` so that candidates scoring below ``min_score`` become
    ``(nan, nan)``, and flattens the pairs.

    Returns a flat list ``[name_1, score_1, name_2, score_2, ...]``.
    """
    output = process.extract(row, column_data2, scorer = scorer, limit = limit)
    output = [therehold(match, min_score) for match in output]
    return list(itertools.chain.from_iterable(output))


# fuzz.ratio

def fuzz_ratio(row, column_data2: pd.Series, min_score, limit = 3):
    """Best ``limit`` candidates for ``row`` scored with ``fuzz.ratio``.

    Returns ``[name_1, score_1, name_2, score_2, ...]``; candidates scoring
    below ``min_score`` appear as ``nan, nan``.
    """
    return _top_matches(row, column_data2, min_score, fuzz.ratio, limit)


# fuzz.partial_ratio

def fuzz_partial_ratio(row, column_data2: pd.Series, min_score, limit = 3):
    """Best ``limit`` candidates for ``row`` scored with ``fuzz.partial_ratio``.

    Returns ``[name_1, score_1, name_2, score_2, ...]``; candidates scoring
    below ``min_score`` appear as ``nan, nan``.
    """
    return _top_matches(row, column_data2, min_score, fuzz.partial_ratio, limit)


# fuzz.token_sort_ratio

def fuzz_token_sort_ratio(row, column_data2: pd.Series, min_score, limit = 3):
    """Best ``limit`` candidates for ``row`` scored with ``fuzz.token_sort_ratio``.

    Returns ``[name_1, score_1, name_2, score_2, ...]``; candidates scoring
    below ``min_score`` appear as ``nan, nan``.
    """
    return _top_matches(row, column_data2, min_score, fuzz.token_sort_ratio, limit)


# fuzz.token_set_ratio

def fuzz_token_set_ratio(row, column_data2: pd.Series, min_score, limit = 3):
    """Best ``limit`` candidates for ``row`` scored with ``fuzz.token_set_ratio``.

    Returns ``[name_1, score_1, name_2, score_2, ...]``; candidates scoring
    below ``min_score`` appear as ``nan, nan``.
    """
    return _top_matches(row, column_data2, min_score, fuzz.token_set_ratio, limit)
 


In [17]:
#  Load datasets: both tables are sheets of the same Excel workbook

data_1 = pd.read_excel(r'../data/Fuzzy/nombres.xlsx', sheet_name = 'Hoja1')


data_2 = pd.read_excel(r'../data/Fuzzy/nombres.xlsx', sheet_name = 'Hoja2')

In [18]:
# Preview of the first sheet (Hoja1)
data_1

Unnamed: 0,Nombre,Dirección,Móvil,Email,Sexo,Salud
0,ESTEFANIA AROCAS PASADAS,"PADRÓ , 109",546212121.0,africa@altecom.es,Mujer,Buena
1,QUERALT VISO GILABERT,"CASA CORDELLAS ,",625215452.0,agata@hotmail.com,Mujer,Regular
2,JOAN AYALA FERRERAS,"DOCTOR FLEMING , 11",649212123.0,,Hombre,Buena
3,JOAN BAEZ TEJADO,"BERTRAND I SERRA , 11, 3R.",,albatros@wandoo.es,Hombre,Buena
4,MARC BASTARDES SOTO,"CARRIÓ , 12, 5È A",,albert@intercom.es,Hombre,Regular
...,...,...,...,...,...,...
117,CARLA BOIX GONZÁLEZ,"DE LA CAÇA , 12, 2N., C",624487554.0,tomasa@hotmail.com,Mujer,Buena
118,ADRIÀ BARALDÉS MONRÓS,"VIC , 119, 2N., 1A.",,,Hombre,Regular
119,MARTA AGUILERA MERINO,"MORAGUES , 1",621145584.0,tremenda@altecom.es,Mujer,Regular
120,MARC BAREA D'HAENE,"TRABUCAIRES , 12",,tripa@intercom.es,Hombre,Fatal


In [19]:
# Preview of the second sheet (Hoja2)
data_2

Unnamed: 0,Nombre,Mtematica,Letras
0,AROCAS PASADAS Melisa34,8.0,5.0
1,ESTEFANIA AROCAS PASADAS,9.0,11.0
2,QUERALT VISO GILAaaBERT,12.0,5.0
3,JOAN AYALA FERRERAS,13.0,9.0
4,JOAN BA.-EZ TEJADO,5.0,11.0
...,...,...,...
140,MARTA AGUILERA MERINO,15.0,11.0
141,MARC BAREA D'HAENE,15.0,8.0
142,ALEX BARROSO D'HAENE,13.0,7.0
143,Añexander reynoso,10.0,14.0


In [20]:
# function1: removes accents, trims the spaces at both ends and converts to lowercase


def function1(row):
    """Normalize a name while keeping punctuation and digits.

    The surrounding whitespace is stripped first, the result is passed through
    ``unidecode.unidecode`` to drop accents, and the text is lowercased.
    """
    return unidecode.unidecode(row.strip()).lower()
    
    


# function2: removes accents, keeps only letters and whitespace, collapses repeated
# whitespace, trims both ends and, finally, converts to lowercase

def function2(row):
    """Normalize a name down to lowercase letters separated by single spaces.

    Steps: drop accents with ``unidecode.unidecode``, delete every character
    that is not an ASCII letter or whitespace, squeeze each run of whitespace
    into one space, strip both ends, and lowercase the result.
    """
    row = unidecode.unidecode(row)
    row = re.sub('[^a-zA-Z\\s]', '', row)
    # Deleting characters can leave double or dangling spaces behind
    # (e.g. "pablo! villanueva 5 melcochita.." -> "pablo villanueva  melcochita"),
    # so whitespace is cleaned up only after the filtering step.
    row = re.sub('\\s+', ' ', row).strip()
    return row.lower()

In [21]:
# Normalize the name column of both tables, overwriting it in place.
# data_1 goes through function1 (punctuation is kept, e.g. the apostrophe in "d'haene")
data_1['Nombre'] = data_1['Nombre'].apply(function1)

# data_2 goes through function2 (everything except letters and whitespace is removed)
data_2['Nombre'] = data_2['Nombre'].apply(function2)

In [22]:
# For every data_1 name, look up its three best data_2 candidates with partial_ratio;
# candidates scoring below 70 are stored as nan. Each cell holds one flat list.
data_1['partial_ratio'] = data_1['Nombre'].swifter.apply(lambda x: fuzz_partial_ratio(x, data_2['Nombre'], min_score = 70)) 

Pandas Apply:   0%|          | 0/122 [00:00<?, ?it/s]

In [23]:
# Inspect the raw list stored for each row
data_1[['partial_ratio']] 

Unnamed: 0,partial_ratio
0,"[estefania arocas pasadas, 100, arocas pasadas..."
1,"[queralt viso gilaaabert, 90, nan, nan, nan, nan]"
2,"[joan ayala ferreras, 100, nan, nan, nan, nan]"
3,"[joan baez tejado, 100, nan, nan, nan, nan]"
4,"[marc bastardes soto, 100, nan, nan, nan, nan]"
...,...
117,"[carla boix gonzalez, 100, nan, nan, nan, nan]"
118,"[adria baraldes monros, 100, cristina baraldes..."
119,"[marta aguilera merino, 100, marcia aguilera m..."
120,"[marc barea dhaene, 94, nan, nan, nan, nan]"


In [24]:
# Column names for the flat [name_1, score_1, name_2, score_2, name_3, score_3] lists
partial_match_columns = [
    "partial_match_name_1", "partial_match_score_1",
    "partial_match_name_2", "partial_match_score_2",
    "partial_match_name_3", "partial_match_score_3",
]

# Expand every list into one column per position. data_1's index is reused so the
# column-wise concat below lines up row by row even without a default 0..n-1 index.
match_partial_ratio = pd.DataFrame(data_1['partial_ratio'].values.tolist(), index = data_1.index)
match_partial_ratio = match_partial_ratio.rename(columns = dict(enumerate(partial_match_columns)))

# Drop match columns left over from an earlier run of this cell, so running it
# again replaces them instead of appending duplicates.
data_1 = pd.concat([data_1.drop(columns = partial_match_columns, errors = 'ignore'),
                    match_partial_ratio], axis = 1)

In [25]:
# One row per data_1 name: up to three candidates with their scores
match_partial_ratio

Unnamed: 0,partial_match_name_1,partial_match_score_1,partial_match_name_2,partial_match_score_2,partial_match_name_3,partial_match_score_3
0,estefania arocas pasadas,100.0,arocas pasadas melisa,80.0,,
1,queralt viso gilaaabert,90.0,,,,
2,joan ayala ferreras,100.0,,,,
3,joan baez tejado,100.0,,,,
4,marc bastardes soto,100.0,,,,
...,...,...,...,...,...,...
117,carla boix gonzalez,100.0,,,,
118,adria baraldes monros,100.0,cristina baraldes martorell,71.0,,
119,marta aguilera merino,100.0,marcia aguilera mendpza,81.0,marta guevara molina,75.0
120,marc barea dhaene,94.0,,,,


In [26]:
# data_1 now carries the raw match list plus the expanded match columns
data_1

Unnamed: 0,Nombre,Dirección,Móvil,Email,Sexo,Salud,partial_ratio,partial_match_name_1,partial_match_score_1,partial_match_name_2,partial_match_score_2,partial_match_name_3,partial_match_score_3
0,estefania arocas pasadas,"PADRÓ , 109",546212121.0,africa@altecom.es,Mujer,Buena,"[estefania arocas pasadas, 100, arocas pasadas...",estefania arocas pasadas,100.0,arocas pasadas melisa,80.0,,
1,queralt viso gilabert,"CASA CORDELLAS ,",625215452.0,agata@hotmail.com,Mujer,Regular,"[queralt viso gilaaabert, 90, nan, nan, nan, nan]",queralt viso gilaaabert,90.0,,,,
2,joan ayala ferreras,"DOCTOR FLEMING , 11",649212123.0,,Hombre,Buena,"[joan ayala ferreras, 100, nan, nan, nan, nan]",joan ayala ferreras,100.0,,,,
3,joan baez tejado,"BERTRAND I SERRA , 11, 3R.",,albatros@wandoo.es,Hombre,Buena,"[joan baez tejado, 100, nan, nan, nan, nan]",joan baez tejado,100.0,,,,
4,marc bastardes soto,"CARRIÓ , 12, 5È A",,albert@intercom.es,Hombre,Regular,"[marc bastardes soto, 100, nan, nan, nan, nan]",marc bastardes soto,100.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,carla boix gonzalez,"DE LA CAÇA , 12, 2N., C",624487554.0,tomasa@hotmail.com,Mujer,Buena,"[carla boix gonzalez, 100, nan, nan, nan, nan]",carla boix gonzalez,100.0,,,,
118,adria baraldes monros,"VIC , 119, 2N., 1A.",,,Hombre,Regular,"[adria baraldes monros, 100, cristina baraldes...",adria baraldes monros,100.0,cristina baraldes martorell,71.0,,
119,marta aguilera merino,"MORAGUES , 1",621145584.0,tremenda@altecom.es,Mujer,Regular,"[marta aguilera merino, 100, marcia aguilera m...",marta aguilera merino,100.0,marcia aguilera mendpza,81.0,marta guevara molina,75.0
120,marc barea d'haene,"TRABUCAIRES , 12",,tripa@intercom.es,Hombre,Fatal,"[marc barea dhaene, 94, nan, nan, nan, nan]",marc barea dhaene,94.0,,,,


In [29]:
# Bring the data_2 columns of the best and the second-best candidate onto every data_1 row.
# Both joins are left joins, so all data_1 rows are kept; rows without a candidate get NaN
# in the joined columns. validate = "m:1" makes pandas raise if a name is repeated in data_2.
# Clashing column names coming from data_2 get the suffix '_y' in the first join and '_z'
# in the second. NOTE(review): partial_match_name_3 is not joined here - confirm that is intended.
data_fuzzy_match = pd.merge(data_1 , data_2, left_on = "partial_match_name_1" ,
                            right_on = "Nombre", how = "left", validate = "m:1", suffixes=('', '_y')).merge(
    data_2, left_on = "partial_match_name_2" ,
                            right_on = "Nombre", how = "left", validate = "m:1", suffixes=('', '_z')
)

data_fuzzy_match

Unnamed: 0,Nombre,Dirección,Móvil,Email,Sexo,Salud,partial_ratio,partial_match_name_1,partial_match_score_1,partial_match_name_2,partial_match_score_2,partial_match_name_3,partial_match_score_3,Nombre_y,Mtematica,Letras,Nombre_z,Mtematica_z,Letras_z
0,estefania arocas pasadas,"PADRÓ , 109",546212121.0,africa@altecom.es,Mujer,Buena,"[estefania arocas pasadas, 100, arocas pasadas...",estefania arocas pasadas,100.0,arocas pasadas melisa,80.0,,,estefania arocas pasadas,9.0,11.0,arocas pasadas melisa,8.0,5.0
1,queralt viso gilabert,"CASA CORDELLAS ,",625215452.0,agata@hotmail.com,Mujer,Regular,"[queralt viso gilaaabert, 90, nan, nan, nan, nan]",queralt viso gilaaabert,90.0,,,,,queralt viso gilaaabert,12.0,5.0,,,
2,joan ayala ferreras,"DOCTOR FLEMING , 11",649212123.0,,Hombre,Buena,"[joan ayala ferreras, 100, nan, nan, nan, nan]",joan ayala ferreras,100.0,,,,,joan ayala ferreras,13.0,9.0,,,
3,joan baez tejado,"BERTRAND I SERRA , 11, 3R.",,albatros@wandoo.es,Hombre,Buena,"[joan baez tejado, 100, nan, nan, nan, nan]",joan baez tejado,100.0,,,,,joan baez tejado,5.0,11.0,,,
4,marc bastardes soto,"CARRIÓ , 12, 5È A",,albert@intercom.es,Hombre,Regular,"[marc bastardes soto, 100, nan, nan, nan, nan]",marc bastardes soto,100.0,,,,,marc bastardes soto,7.0,14.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,carla boix gonzalez,"DE LA CAÇA , 12, 2N., C",624487554.0,tomasa@hotmail.com,Mujer,Buena,"[carla boix gonzalez, 100, nan, nan, nan, nan]",carla boix gonzalez,100.0,,,,,carla boix gonzalez,11.0,11.0,,,
118,adria baraldes monros,"VIC , 119, 2N., 1A.",,,Hombre,Regular,"[adria baraldes monros, 100, cristina baraldes...",adria baraldes monros,100.0,cristina baraldes martorell,71.0,,,adria baraldes monros,12.0,6.0,cristina baraldes martorell,9.0,9.0
119,marta aguilera merino,"MORAGUES , 1",621145584.0,tremenda@altecom.es,Mujer,Regular,"[marta aguilera merino, 100, marcia aguilera m...",marta aguilera merino,100.0,marcia aguilera mendpza,81.0,marta guevara molina,75.0,marta aguilera merino,15.0,11.0,marcia aguilera mendpza,10.0,5.0
120,marc barea d'haene,"TRABUCAIRES , 12",,tripa@intercom.es,Hombre,Fatal,"[marc barea dhaene, 94, nan, nan, nan, nan]",marc barea dhaene,94.0,,,,,marc barea dhaene,15.0,8.0,,,


In [31]:
# Give the joined grade columns self-explanatory names and remove the duplicated
# name columns. This is written as one chained expression instead of inplace/del so
# the cell can be run again: `del` on an already-removed column raises KeyError,
# whereas drop(..., errors = 'ignore') simply skips it.
data_fuzzy_match = (
    data_fuzzy_match
    .rename(columns = {"Mtematica":"Matematica_match_1", "Letras":"Letras_match_1",
                       "Mtematica_z":"Matematica_match_2", "Letras_z":"Letras_match_2"})
    .drop(columns = ['Nombre_y', 'Nombre_z'], errors = 'ignore')
)

data_fuzzy_match

Unnamed: 0,Nombre,Dirección,Móvil,Email,Sexo,Salud,partial_ratio,partial_match_name_1,partial_match_score_1,partial_match_name_2,partial_match_score_2,partial_match_name_3,partial_match_score_3,Matematica_match_1,Letras_match_1,Matematica_match_2,Letras_match_2
0,estefania arocas pasadas,"PADRÓ , 109",546212121.0,africa@altecom.es,Mujer,Buena,"[estefania arocas pasadas, 100, arocas pasadas...",estefania arocas pasadas,100.0,arocas pasadas melisa,80.0,,,9.0,11.0,8.0,5.0
1,queralt viso gilabert,"CASA CORDELLAS ,",625215452.0,agata@hotmail.com,Mujer,Regular,"[queralt viso gilaaabert, 90, nan, nan, nan, nan]",queralt viso gilaaabert,90.0,,,,,12.0,5.0,,
2,joan ayala ferreras,"DOCTOR FLEMING , 11",649212123.0,,Hombre,Buena,"[joan ayala ferreras, 100, nan, nan, nan, nan]",joan ayala ferreras,100.0,,,,,13.0,9.0,,
3,joan baez tejado,"BERTRAND I SERRA , 11, 3R.",,albatros@wandoo.es,Hombre,Buena,"[joan baez tejado, 100, nan, nan, nan, nan]",joan baez tejado,100.0,,,,,5.0,11.0,,
4,marc bastardes soto,"CARRIÓ , 12, 5È A",,albert@intercom.es,Hombre,Regular,"[marc bastardes soto, 100, nan, nan, nan, nan]",marc bastardes soto,100.0,,,,,7.0,14.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,carla boix gonzalez,"DE LA CAÇA , 12, 2N., C",624487554.0,tomasa@hotmail.com,Mujer,Buena,"[carla boix gonzalez, 100, nan, nan, nan, nan]",carla boix gonzalez,100.0,,,,,11.0,11.0,,
118,adria baraldes monros,"VIC , 119, 2N., 1A.",,,Hombre,Regular,"[adria baraldes monros, 100, cristina baraldes...",adria baraldes monros,100.0,cristina baraldes martorell,71.0,,,12.0,6.0,9.0,9.0
119,marta aguilera merino,"MORAGUES , 1",621145584.0,tremenda@altecom.es,Mujer,Regular,"[marta aguilera merino, 100, marcia aguilera m...",marta aguilera merino,100.0,marcia aguilera mendpza,81.0,marta guevara molina,75.0,15.0,11.0,10.0,5.0
120,marc barea d'haene,"TRABUCAIRES , 12",,tripa@intercom.es,Hombre,Fatal,"[marc barea dhaene, 94, nan, nan, nan, nan]",marc barea dhaene,94.0,,,,,15.0,8.0,,
