# Flexible string matching using Fuzzywuzzy

In [1]:
import pandas as pd
import os
from pathlib import Path

# Spreadsheet with the paired Expedia / Booking.com room names.
# The default is the original author's local path; set the ROOMTYPE_XLSX
# environment variable to point at your own copy instead of editing this cell.
ROOMTYPE_XLSX = Path(
    os.environ.get('ROOMTYPE_XLSX', r'/Users/raj/Besant Technologies OMR /roomtype.xlsx')
)
df = pd.read_excel(ROOMTYPE_XLSX)
df.head(10)

Unnamed: 0,Expedia,Booking.com
0,"Deluxe Room, 1 King Bed",Deluxe King Room
1,"Standard Room, 1 King Bed, Accessible",Standard King Roll-in Shower Accessible
2,"Grand Corner King Room, 1 King Bed",Grand Corner King Room
3,"Suite, 1 King Bed (Parlor)",King Parlor Suite
4,"High-Floor Premium Room, 1 King Bed",High-Floor Premium King Room
5,"Traditional Double Room, 2 Double Beds",Double Room with Two Double Beds
6,"Room, 1 King Bed, Accessible",King Room - Disability Access
7,"Deluxe Room, 1 King Bed",Deluxe King Room
8,Deluxe Room,Deluxe Room (Non Refundable)
9,"Room, 2 Double Beds (19th to 25th Floors)",Two Double Beds - Location Room (19th to 25th ...


In [6]:
from fuzzywuzzy import fuzz
# Score the first Expedia / Booking.com pair with fuzz.ratio.
expedia_name = 'Deluxe Room, 1 King Bed'
booking_name = 'Deluxe King Room'
fuzz.ratio(expedia_name, booking_name)

62

In [7]:
# Same fuzz.ratio scoring for the "double room" pair.
expedia_name = 'Traditional Double Room, 2 Double Beds'
booking_name = 'Double Room with Two Double Beds'
fuzz.ratio(expedia_name, booking_name)

69

In [8]:
# Same fuzz.ratio scoring for the "19th to 25th Floors" pair.
expedia_name = 'Room, 2 Double Beds (19th to 25th Floors)'
booking_name = 'Two Double Beds - Location Room (19th to 25th Floors)'
fuzz.ratio(expedia_name, booking_name)

74

In [9]:
# Partial-string comparison of the "double room" pair.
expedia_name = 'Traditional Double Room, 2 Double Beds'
booking_name = 'Double Room with Two Double Beds'
fuzz.partial_ratio(expedia_name, booking_name)

83

In [10]:
# Partial-string comparison of the "19th to 25th Floors" pair.
expedia_name = 'Room, 2 Double Beds (19th to 25th Floors)'
booking_name = 'Two Double Beds - Location Room (19th to 25th Floors)'
fuzz.partial_ratio(expedia_name, booking_name)

63

# Comparing partial strings does not give better results overall, so let's use the token sort ratio, which ignores word order

In [11]:
# Token sort ratio for the three example pairs, printed one score per line.
room_name_pairs = [
    ('Deluxe Room, 1 King Bed', 'Deluxe King Room'),
    ('Traditional Double Room, 2 Double Beds', 'Double Room with Two Double Beds'),
    ('Room, 2 Double Beds (19th to 25th Floors)', 'Two Double Beds - Location Room (19th to 25th Floors)'),
]
for expedia_name, booking_name in room_name_pairs:
    print(fuzz.token_sort_ratio(expedia_name, booking_name))

84
78
83


In [45]:
# Sanity check: score a room name against an identical copy of itself.
same_name = 'Deluxe Room'
print(fuzz.token_sort_ratio(same_name, same_name))

100


# There is still room for improvement. Let's ignore duplicate words by using the token set ratio

In [12]:
# Token set ratio for the same three example pairs, printed one score per line.
room_name_pairs = [
    ('Deluxe Room, 1 King Bed', 'Deluxe King Room'),
    ('Traditional Double Room, 2 Double Beds', 'Double Room with Two Double Beds'),
    ('Room, 2 Double Beds (19th to 25th Floors)', 'Two Double Beds - Location Room (19th to 25th Floors)'),
]
for expedia_name, booking_name in room_name_pairs:
    print(fuzz.token_set_ratio(expedia_name, booking_name))

100
78
97


In [46]:
def get_ratio(row):
    """Return the token set ratio between a row's two room names.

    Parameters
    ----------
    row
        Object indexable by the keys 'Expedia' and 'Booking.com'
        (the notebook applies this function to the rows of ``df``).

    Returns
    -------
    Whatever ``fuzz.token_set_ratio`` returns for the two names.
    """
    expedia_name, booking_name = row['Expedia'], row['Booking.com']
    return fuzz.token_set_ratio(expedia_name, booking_name)


In [47]:
# Show only the rows whose token set ratio is above 70.
match_scores = df.apply(get_ratio, axis=1)
is_match = match_scores > 70
df[is_match]

Unnamed: 0,Expedia,Booking.com
0,"Deluxe Room, 1 King Bed",Deluxe King Room
1,"Standard Room, 1 King Bed, Accessible",Standard King Roll-in Shower Accessible
2,"Grand Corner King Room, 1 King Bed",Grand Corner King Room
3,"Suite, 1 King Bed (Parlor)",King Parlor Suite
4,"High-Floor Premium Room, 1 King Bed",High-Floor Premium King Room
5,"Traditional Double Room, 2 Double Beds",Double Room with Two Double Beds
6,"Room, 1 King Bed, Accessible",King Room - Disability Access
7,"Deluxe Room, 1 King Bed",Deluxe King Room
8,Deluxe Room,Deluxe Room (Non Refundable)
9,"Room, 2 Double Beds (19th to 25th Floors)",Two Double Beds - Location Room (19th to 25th ...


In [16]:
# Fraction of all rows whose token set ratio is above 70.
is_match = df.apply(get_ratio, axis=1) > 70
int(is_match.sum()) / len(df)

0.9029126213592233

# Phonetic Matching

In [20]:
# fuzz.ratio of two misspellings against the correct spelling.
for misspelling in ('misisepi', 'misisssssssssepi'):
    print(fuzz.ratio(misspelling, 'Mississippi'))

63
52


In [49]:
import jellyfish
import fuzzy
# Soundex codes from two libraries: a `fuzzy` encoder built with 6,
# then jellyfish's soundex function.
soundex = fuzzy.Soundex(6)
for word in ('misisepilakmop', 'Mississippi'):
    print(soundex(word))
for word in ('misisepi', 'Mississippi'):
    print(jellyfish.soundex(word))

M21425
M21425
M221
M221


In [23]:
# jellyfish soundex codes for a heavily misspelled word and the correct spelling.
for word in ('misisssssssssepi', 'Mississippi'):
    print(jellyfish.soundex(word))

M221
M221


In [39]:
# Encode both spellings with soundex, then compare the codes with fuzz.ratio.
code1, code2 = (
    jellyfish.soundex(word) for word in ('misisssssssssepi', 'Mississippi')
)
print(code1, code2)
fuzz.ratio(code1, code2)

M221 M221


100

In [40]:
# Encode two different names with soundex, then compare the codes with fuzz.ratio.
code1, code2 = (
    jellyfish.soundex(full_name) for full_name in ('vladimir putin', 'Vladimir Kirillovich')
)
print(code1, code2)
fuzz.ratio(code1, code2)

V435 V435


100

# Metaphone

In [37]:
from phonetics import metaphone as mt

In [41]:
# Encode the same two names with metaphone, then compare the codes with fuzz.ratio.
code1, code2 = (
    mt(full_name) for full_name in ('vladimir putin', 'Vladimir Kirillovich')
)
print(code1, code2)
fuzz.ratio(code1, code2)

FLTMRPTN FLTMRKRLFX


56

In [42]:
# Metaphone codes for the heavily misspelled word and the correct spelling.
for word in ('misisssssssssepi', 'Mississippi'):
    print(mt(word))

MSSSSSSP
MSSP
