In [1]:
import numpy as np
import pandas as pd

In [2]:
from fuzzywuzzy import fuzz, process

### String Similarity & Minimum Edit Distance

Minimum edit distance measures how similar two strings are. It is the smallest number of single-step operations needed to turn String A into String B, where the available operations are:

1. `Insertion` of a new character.

2. `Deletion` of an existing character.

3. `Substitution` of an existing character.

4. `Transposition` of two existing consecutive characters.

In [3]:
# Let's compare two strings with fuzz.WRatio()
# It gives a similarity score between two strings, from 0 (not similar at all) to 100 (fully similar)
# Compare "Reeding" vs "Reading"

fuzz.WRatio("Reeding", "Reading")

86

In [4]:
## The previous result shows that 'Reeding' and 'Reading' score 86 out of 100
## WRatio can also score a string against a shorter string contained in it

# Partial string comparison
fuzz.WRatio('Houston Rockets', 'Rockets')

90

In [5]:
## WRatio can also score a partial match whose words appear in a different order
# Partial string comparison with different order

fuzz.WRatio('Houston Rockets vs Los Angeles Lakers', 'Lakers vs Rockets')

86

In [6]:
## process.extract() scores one query string against a whole collection of candidate strings

# Candidate strings to compare the query against
choices = pd.Series([
    'Rockets vs Lakers',
    'Lakers vs Rockets',
    'Houson vs Los Angeles',
    'Heat vs Bulls',
])

# Query string
string = "Houston Rockets vs Los Angeles Lakers"

# Score every candidate against the query
process.extract(string, choices)

[('Rockets vs Lakers', 86, 0),
 ('Lakers vs Rockets', 86, 1),
 ('Houson vs Los Angeles', 86, 2),
 ('Heat vs Bulls', 86, 3)]

In [7]:
## extract() returns a list of tuples, each holding 3 values:

# 1st value is the matched string taken from the array
# 2nd value is the similarity score
# 3rd value is the index of that element in the array

## The `limit` argument caps how many matches are returned

process.extract(string, choices, limit = 2)

[('Rockets vs Lakers', 86, 0), ('Lakers vs Rockets', 86, 1)]

### Collapsing categories with string matching

When manually replacing every inconsistent value is not feasible, we can use string similarity to collapse them into the correct categories.

In [10]:
# Raw survey answers, including typos, abbreviations and look-alike names
state = [
    "California",
    "Cali",
    "Calefornia",
    "Calefernia",
    "New York",
    "Newark",
    "New York City",
    "NYC",
]

In [11]:
# Column mapping used to build the survey frame: one "state" column holding the raw answers
data = dict(state=state)

In [12]:
# Build the survey frame from the column mapping and display it
survey = pd.DataFrame(data=data)
survey

Unnamed: 0,state
0,California
1,Cali
2,Calefornia
3,Calefernia
4,New York
5,Newark
6,New York City
7,NYC


In [13]:
# Correct category names that the survey values should be collapsed into
categories = pd.DataFrame({"state": ["New York", "California"]})
categories

Unnamed: 0,state
0,New York
1,California


In [14]:
# Collapse misspelled / variant survey values into their correct category.
#
# For every correct category name, each value currently in survey['state'] is scored
# against it; values scoring at or above the cutoff are overwritten with the category.
# `survey` is updated in place.

# Minimum similarity score (0-100) a survey value needs to be treated as the category
MIN_SIMILARITY = 70

# NOTE: the loop variable is deliberately not named `state`. That name is bound to the
# raw list of survey answers in an earlier cell; rebinding it to a string here would
# break re-running the cells that build `data` and `survey` from it.
for category in categories['state']:
    print("state :", category)
    # Score every survey value against this category (limit = all rows, so none are dropped)
    matches = process.extract(category, survey['state'], limit=len(survey))
    print("Match Similarity:", matches)
    # Each match is (matched value, score, index); only the first two are needed
    for matched_value, score, *_ in matches:
        # If high similarity score
        if score >= MIN_SIMILARITY:
            # Replace every row holding this value with the correct category
            survey.loc[survey['state'] == matched_value, 'state'] = category
            print(category)

state : New York
Match Similarity: [('New York', 100, 4), ('New York City', 90, 6), ('Newark', 71, 5), ('NYC', 36, 7), ('Calefornia', 33, 2), ('California', 22, 0), ('Calefernia', 22, 3), ('Cali', 0, 1)]
New York
New York
New York
state : California
Match Similarity: [('California', 100, 0), ('Cali', 90, 1), ('Calefornia', 90, 2), ('Calefernia', 80, 3), ('NYC', 30, 7), ('New York', 22, 4), ('New York', 22, 5), ('New York', 22, 6)]
California
California
California
California


In [15]:
# Inspect the final result: the distinct values left in the column after collapsing.
# 'NYC' is still present because its scores in the loop above (36 and 30) were below the 70 cutoff.
print(survey['state'].unique())

['California' 'New York' 'NYC']
