# Preprocessing

## Objectives:
- Remove records with no menu description. 
- Remove stop words from menu descriptions.
- Remove punctuation from menu descriptions. 
- Convert menu descriptions to lower case.
- Tokenize menu descriptions.
- Add columns to indicate if certain ingredients are present in the dish.


In [None]:
import pandas as pd
import nltk

In [None]:
# Load the three CSV snapshots dated 2020-09-15: the dishes table, the
# dish-to-ingredient link table, and the ingredients table.
dishes, dish_ingredients, ingredients = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/dishes-2020-09-15.csv',
        'data/dish-ingredients-2020-09-15.csv',
        'data/ingredients-2020-09-15.csv',
    )
)

In [None]:
# Print the dimensions of the dishes table, then display its first rows.
print(dishes.shape)
dishes.head()

In [None]:
# Print the dimensions of the dish/ingredient link table, then display its first rows.
print(dish_ingredients.shape)
dish_ingredients.head()

In [None]:
# Print the dimensions of the ingredients table, then display its first rows.
print(ingredients.shape)
ingredients.head()

In [None]:
# Count how often each distinct menu description occurs.
dishes['menu_description'].value_counts()

In [None]:
# Removing rows with NaN menu description
old_shape = dishes.shape
dishes = dishes.dropna(subset=['menu_description'])
print(f'Removing NaN Menu descriptions: {old_shape} -> {dishes.shape}')

# Subsampling because there's just too much data to process.
# The sample size is capped at the number of remaining rows: sampling without
# replacement raises ValueError when n exceeds the population, which would
# happen if fewer than 20,000 dishes still have a description.
sample_size = min(20000, len(dishes))
dishes = dishes.sample(n=sample_size, random_state=42)
print(f'Randomly sampling 20K rows: -> {dishes.shape}')

# Converting to lowercase
dishes['menu_description'] = dishes['menu_description'].str.lower()

In [None]:
# English stop words, held in a set so membership tests in
# tokenize_and_process are cheap.
# NOTE(review): this reads NLTK's 'stopwords' corpus, which presumably has to
# be available locally (e.g. via nltk.download('stopwords')) - confirm.
stop_words = set(nltk.corpus.stopwords.words('english'))

def tokenize_and_process(desc):
    """Return `desc` reduced to its meaningful words, joined by single spaces.

    The description is split with nltk.word_tokenize; a token is kept only if
    it is purely alphabetic (which drops punctuation and numbers) and is not
    in the module-level `stop_words` set. No stemming is applied.
    """
    return " ".join(
        token
        for token in nltk.word_tokenize(desc)
        if token.isalpha() and token not in stop_words
    )

In [None]:
# Store the tokenized, stop-word-free version of every menu description in a
# new column, in row order.
dishes['cleaned_descriptions'] = [
    tokenize_and_process(description) for description in dishes['menu_description']
]

In [None]:
# Display the first three rows to inspect the result so far.
dishes.head(3)

In [None]:
def get_ingredient_ids(ingredient_name):
    """Return the ingredient ids whose name contains `ingredient_name`.

    Looks in the module-level `ingredients` table and returns a Series of
    `ingredient_id` values for a common name, e.g. 'peanut' or 'banana'.

    The name is passed to `Series.str.contains`, so it is matched
    case-sensitively anywhere in `ingredient_name` and is interpreted as a
    regular expression (the pandas default).
    """
    # na=False treats rows with a missing ingredient_name as "no match".
    # Without it the mask contains NaN and boolean indexing raises ValueError.
    name_matches = ingredients['ingredient_name'].str.contains(ingredient_name, na=False)
    return ingredients.loc[name_matches, 'ingredient_id']

def contains_ingredient(dish_id, ingredient):
    """Return True if dish `dish_id` uses any ingredient matching `ingredient`.

    `ingredient` is a common name resolved to ingredient ids through
    get_ingredient_ids; the dish's own ingredient ids come from the
    module-level `dish_ingredients` table.
    """
    matching_ids = set(get_ingredient_ids(ingredient))
    rows_for_dish = dish_ingredients['dish_id'] == dish_id
    ids_in_dish = set(dish_ingredients.loc[rows_for_dish, 'ingredient_id'])
    # The two id sets share at least one element exactly when they are not disjoint.
    return not matching_ids.isdisjoint(ids_in_dish)

def contains_ingredients(dish_id, ingredients):
    """Return True if dish `dish_id` uses an ingredient matching any given name.

    Same contract as contains_ingredient, but `ingredients` is an iterable of
    common names. Note that this parameter shadows the module-level
    `ingredients` table inside this function; the lookup itself still goes
    through get_ingredient_ids, which reads the table.
    """
    # Pool the ids matched by every requested name into one set.
    matching_ids = set().union(*(get_ingredient_ids(name) for name in ingredients))
    rows_for_dish = dish_ingredients['dish_id'] == dish_id
    ids_in_dish = set(dish_ingredients.loc[rows_for_dish, 'ingredient_id'])
    return not matching_ids.isdisjoint(ids_in_dish)

In [None]:
dish_ids = dishes['dish_id']

# Flag every dish that uses an ingredient whose name contains 'peanut'.
# The matching ingredient ids are resolved once and the dishes using them are
# found with a vectorised isin(). Calling contains_ingredient() per dish
# re-scanned both `ingredients` and `dish_ingredients` for every row.
peanut_ids = set(get_ingredient_ids('peanut'))
dishes['contains_peanuts'] = dish_ids.isin(
    dish_ingredients.loc[dish_ingredients['ingredient_id'].isin(peanut_ids), 'dish_id']
)

In [None]:
# Flag every dish that uses an ingredient whose name contains ' egg'.
# YES, THE SPACE IS NECESSARY
# NOTE(review): presumably the leading space avoids hits inside other words
# (e.g. 'veggie'); it also means a name that starts with 'egg' is not matched.
# Confirm against the ingredient names.
# Ids are resolved once and tested with a vectorised isin() instead of
# re-scanning both tables for every dish.
egg_ids = set(get_ingredient_ids(' egg'))
dishes['contains_egg'] = dish_ids.isin(
    dish_ingredients.loc[dish_ingredients['ingredient_id'].isin(egg_ids), 'dish_id']
)

In [None]:
# Flag every dish that uses an ingredient whose name contains 'sesame'.
# Ids are resolved once and tested with a vectorised isin() instead of
# re-scanning both tables for every dish.
sesame_ids = set(get_ingredient_ids('sesame'))
dishes['contains_sesame'] = dish_ids.isin(
    dish_ingredients.loc[dish_ingredients['ingredient_id'].isin(sesame_ids), 'dish_id']
)

In [None]:
# Flag every dish that uses an ingredient whose name contains any of the fish
# names below. The ids for all names are pooled once and tested with a
# vectorised isin() instead of re-scanning both tables for every dish.
# NOTE(review): names are matched as substrings, so short ones such as 'cod'
# or 'sole' may also hit unrelated ingredient names - verify against the data.
fish_names = ['pollock', 'carp', 'cod', 'dogfish', 'mackerel', 'salmon', 'sole', 'tuna']
fish_ids = set().union(*(get_ingredient_ids(name) for name in fish_names))
dishes['contains_fish'] = dish_ids.isin(
    dish_ingredients.loc[dish_ingredients['ingredient_id'].isin(fish_ids), 'dish_id']
)

In [None]:
# Flag every dish that uses an ingredient whose name contains 'crab',
# 'lobster' or 'shrimp'. The ids for all names are pooled once and tested with
# a vectorised isin() instead of re-scanning both tables for every dish.
shellfish_names = ['crab', 'lobster', 'shrimp']
shellfish_ids = set().union(*(get_ingredient_ids(name) for name in shellfish_names))
dishes['contains_shellfish'] = dish_ids.isin(
    dish_ingredients.loc[dish_ingredients['ingredient_id'].isin(shellfish_ids), 'dish_id']
)

In [None]:
# Flag every dish that uses an ingredient whose name contains 'soy'.
# Ids are resolved once and tested with a vectorised isin() instead of
# re-scanning both tables for every dish.
soy_ids = set(get_ingredient_ids('soy'))
dishes['contains_soy'] = dish_ids.isin(
    dish_ingredients.loc[dish_ingredients['ingredient_id'].isin(soy_ids), 'dish_id']
)

In [None]:
# Flag every dish that uses an ingredient whose name contains any of the meat
# (or fish) names below. The ids for all names are pooled once and tested with
# a vectorised isin() instead of re-scanning both tables for every dish.
# NOTE(review): names are matched as substrings, so a short one such as 'ham'
# would also hit e.g. 'graham' - verify against the ingredient names.
meat_names = ['meat', 'fish', 'beef', 'steak', 'pork', 'bacon', 'chicken', 'duck',
              'turkey', 'ham', 'salami', 'pheasant', 'goat', 'bison', 'boar']
meat_ids = set().union(*(get_ingredient_ids(name) for name in meat_names))
dishes['contains_meat'] = dish_ids.isin(
    dish_ingredients.loc[dish_ingredients['ingredient_id'].isin(meat_ids), 'dish_id']
)

In [None]:
# Recast each contains_* flag column from boolean to integer (True -> 1,
# False -> 0), then display the first rows of the result.
flag_suffixes = ('peanuts', 'egg', 'sesame', 'fish', 'shellfish', 'soy', 'meat')
for suffix in flag_suffixes:
    dishes[f'contains_{suffix}'] = dishes[f'contains_{suffix}'].astype(int)
dishes.head()

In [None]:
# Write the processed dishes table to CSV; index=False leaves the DataFrame
# index out of the file.
dishes.to_csv('data/processed_dishes.csv', index=False)