In [1]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz, process

### String Similarity & Minimum Edit Distance

### Ex 1: The cutoff point

In this exercise, and throughout this chapter, you'll be working with the `restaurants` DataFrame which has data on various restaurants. Your ultimate goal is to create a restaurant `recommendation engine`, but you need to first clean your data.

This version of `restaurants` has been collected from many sources. Its `cuisine_type` column (named `type` in the CSV loaded below) is riddled with typos, even though it should contain only the `italian`, `american` and `asian` cuisine types. There are so many unique categories that remapping them manually isn't scalable, so it's best to use string similarity instead.

In [2]:
# Read the restaurant records from the CSV file, then preview the first
# five rows to check which columns are available.
restaurants = pd.read_csv(filepath_or_buffer="restaurants_L2.csv")
restaurants.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,addr,city,phone,type
0,0,arnie morton's of chicago,435 s. la cienega blv .,los angeles,3102461501,american
1,1,art's delicatessen,12224 ventura blvd.,studio city,8187621221,american
2,2,campanile,624 s. la brea ave.,los angeles,2139381447,american
3,3,fenix,8358 sunset blvd. west,hollywood,2138486677,american
4,4,grill on the alley,9560 dayton way,los angeles,3102760615,american


In [3]:
# List the distinct labels found in the `type` column.
restaurants.loc[:, "type"].unique()

array(['american', 'asian', 'italian', 'coffeebar', 'mexican',
       'southwestern', 'steakhouses', 'southern', 'cajun'], dtype=object)

In [6]:
# `process` is already imported from fuzzywuzzy in the notebook's first cell,
# so it is not imported a second time here.

# Store the unique values of the `type` column (the cuisine labels) in unique_types
unique_types = restaurants["type"].unique()
unique_types

array(['american', 'asian', 'italian', 'coffeebar', 'mexican',
       'southwestern', 'steakhouses', 'southern', 'cajun'], dtype=object)

In [7]:
# Score every entry of unique_types against 'asian'; the limit equals the
# number of entries, so no candidate is dropped from the ranking.
asian_scores = process.extract(
    'asian',
    unique_types,
    limit=len(unique_types),
)
print(asian_scores)


[('asian', 100), ('italian', 67), ('american', 62), ('mexican', 50), ('cajun', 40), ('southwestern', 36), ('southern', 31), ('steakhouses', 25), ('coffeebar', 18)]


In [8]:
# Score every entry of unique_types against 'american'; the limit equals the
# number of entries, so no candidate is dropped from the ranking.
american_scores = process.extract(
    'american',
    unique_types,
    limit=len(unique_types),
)
print(american_scores)


[('american', 100), ('mexican', 80), ('asian', 62), ('cajun', 54), ('italian', 53), ('southern', 38), ('southwestern', 34), ('coffeebar', 24), ('steakhouses', 21)]


In [9]:
# Score every entry of unique_types against 'italian'; the limit equals the
# number of entries, so no candidate is dropped from the ranking.
italian_scores = process.extract(
    "italian",
    unique_types,
    limit=len(unique_types),
)
print(italian_scores)

[('italian', 100), ('asian', 67), ('american', 53), ('mexican', 43), ('cajun', 33), ('southern', 27), ('southwestern', 26), ('steakhouses', 26), ('coffeebar', 12)]


In [None]:
# As a first step,
# create a list of all possible matches, comparing 'italian' with the
# restaurant types listed in the cuisine column.
# The CSV loaded in this notebook names that column `type`; it has no
# `cuisine_type` column, so indexing by "cuisine_type" would raise a KeyError.
matches = process.extract("italian", restaurants["type"], limit=len(restaurants["type"]))

# Inspect the first 5 matches
print(matches[0:5])

In [None]:
# Now you can iterate through matches to reassign similar entries.

# Create a list of matches, comparing 'italian' with the cuisine column.
# The CSV loaded in this notebook names that column `type`; it has no
# `cuisine_type` column.
matches = process.extract('italian', restaurants['type'], limit=len(restaurants['type']))

# Iterate through the list of matches to italian
for match in matches:
    # match[0] is the matched string and match[1] its similarity score.
    # Check whether the similarity score is greater than or equal to 80
    # (inclusive, matching the threshold used when looping over all categories).
    if match[1] >= 80:
        # Select all rows where the cuisine is spelled this way, and set them to the correct cuisine
        restaurants.loc[restaurants['type'] == match[0], 'type'] = 'italian'

In [None]:
# Finally, you'll adapt your code to work with every restaurant type in categories.

# The cuisine labels the column should end up containing (per the exercise text).
categories = ['italian', 'asian', 'american']

# Using the variable cuisine to iterate through categories,
for cuisine in categories:
    # Create a list of matches, comparing cuisine with the cuisine column.
    # The CSV loaded in this notebook names that column `type`; it has no
    # `cuisine_type` column.
    matches = process.extract(cuisine, restaurants['type'], limit=len(restaurants['type']))

    # Iterate through the list of matches
    for match in matches:
        # Check whether the similarity score is greater than or equal to 80
        if match[1] >= 80:
            # If it is, select all rows where the cuisine is spelled this way, and set them to the correct cuisine.
            # NOTE(review): in the similarity output printed earlier, 'mexican'
            # scores exactly 80 against 'american', so with this threshold those
            # rows are relabelled 'american' too - confirm that is intended.
            restaurants.loc[restaurants['type'] == match[0], 'type'] = cuisine

# Inspect the final result
print(restaurants['type'].unique())