# Preprocessing

## Objectives:
- Remove records with no menu description. 
- Remove stop words from menu descriptions.
- Remove punctuation from menu descriptions. 
- Convert menu descriptions to lower case.
- Tokenize menu descriptions.
- Add columns to indicate if certain ingredients are present in the dish.


In [None]:
import pandas as pd
import nltk
# Fetch the NLTK data used later in this notebook: the stop-word corpus
# (read via nltk.corpus.stopwords) and the 'punkt' tokenizer data.
# NOTE(review): 'punkt' is presumably what nltk.word_tokenize needs; newer
# NLTK releases may ask for 'punkt_tab' instead - confirm for your version.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Load the three raw CSV exports with pandas' default parsing options:
# the dishes table, the dish -> ingredient link table, and the ingredients table.
dishes, dish_ingredients, ingredients = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/dishes-2020-09-15.csv',
        'data/dish-ingredients-2020-09-15.csv',
        'data/ingredients-2020-09-15.csv',
    )
)

In [None]:
# Print the (rows, columns) shape of the dishes table, then preview its
# first rows (the trailing expression is what the notebook displays).
print(dishes.shape)
dishes.head()

In [None]:
# Print the (rows, columns) shape of the dish -> ingredient link table, then
# preview its first rows.
print(dish_ingredients.shape)
dish_ingredients.head()

In [None]:
# Print the (rows, columns) shape of the ingredients table, then preview its
# first rows.
print(ingredients.shape)
ingredients.head()

In [None]:
# Count how many dishes share each distinct menu description (value_counts
# leaves missing descriptions out by default).
dishes['menu_description'].value_counts()

In [None]:
# Step 1: keep only the dishes that actually have a menu description.
old_shape = dishes.shape
dishes = dishes[dishes['menu_description'].notna()]
print(f'Removing NaN Menu descriptions: {old_shape} -> {dishes.shape}')

# Step 2: work on a reproducible random subset (fixed seed), because the full
# table is too large to process. An alternative 20K-row sample size was left
# disabled here; the active size is 8,000 rows.
dishes = dishes.sample(random_state=42, n=8000)
print(f'Randomly sampling 8K rows: -> {dishes.shape}')

# Step 3: normalise the description text to lowercase.
lowercased = dishes['menu_description'].str.lower()
dishes['menu_description'] = lowercased

In [None]:
# English stop words from NLTK, held in a set for fast membership checks.
stop_words = set(nltk.corpus.stopwords.words('english'))


def tokenize_and_process(desc):
    """Tokenize a menu description and drop the uninformative tokens.

    The text is split with ``nltk.word_tokenize``. A token is kept only when
    it consists purely of alphabetic characters (this removes punctuation and
    numbers) and is not in ``stop_words``. No stemming is applied.

    Args:
        desc: Menu description text. The stop-word comparison is
            case-sensitive, so the text is expected to be lowercased already.

    Returns:
        The kept tokens, in their original order, joined by single spaces.
    """
    words = nltk.word_tokenize(desc)
    return " ".join(
        word for word in words if word.isalpha() and word not in stop_words
    )

In [None]:
# Run every menu description through tokenize_and_process and store the
# cleaned text in a new column.
dishes['cleaned_descriptions'] = dishes['menu_description'].map(tokenize_and_process)

In [None]:
# Preview the first three rows to eyeball the new cleaned_descriptions column.
dishes.head(3)

### Quick Exploration of Datasets

In [None]:
# Spot-check one dish by id: print its cleaned description and the ingredient
# ids that dish_ingredients links to it, then display one ingredient record.
# NOTE(review): the ingredient id in the last line is hard-coded; presumably it
# is one of the ids printed by the line above - confirm.
# The .iloc[0] raises IndexError if this dish is not in the sampled dishes.
print(dishes[dishes['dish_id'] == '8c310c8c-2461-4360-b10e-d21f331d8a4f']['cleaned_descriptions'].iloc[0])
print(dish_ingredients[dish_ingredients['dish_id'] == '8c310c8c-2461-4360-b10e-d21f331d8a4f']['ingredient_id'])
ingredients[ingredients['ingredient_id'] == 'a87ae1e6-c7a0-439c-a014-4e91c6c15343']

In [None]:
# Spot-check a second dish by id: print its cleaned description and the
# ingredient ids that dish_ingredients links to it, then display one
# ingredient record.
# NOTE(review): the ingredient id in the last line is hard-coded; presumably it
# is one of the ids printed by the line above - confirm.
# The .iloc[0] raises IndexError if this dish is not in the sampled dishes.
print(dishes[dishes['dish_id'] == 'a0c36965-70c8-461d-bc05-23a913e5c87b']['cleaned_descriptions'].iloc[0])
print(dish_ingredients[dish_ingredients['dish_id'] == 'a0c36965-70c8-461d-bc05-23a913e5c87b']['ingredient_id'])
ingredients[ingredients['ingredient_id'] == 'c67ee877-2b18-4cc7-beb2-7372da69a3ed']

In [None]:
# Display the complete dishes row(s) for this dish id.
dishes[dishes['dish_id'] == 'a0c36965-70c8-461d-bc05-23a913e5c87b']

This discovery shows that our data isn't as reliable as we thought it would be. The dish_ingredients data says that the Chicken Bacon Ranch dish (dish_id = 8c310c8c-2461-4360-b10e-d21f331d8a4f) has only one ingredient associated with it, and that ingredient is "salad dressing, ranch dressing, regular", even though the dish's name indicates that it also contains chicken and bacon.

In the next couple of steps, we will still use the dish_ingredients data, but we will also search the descriptions for keywords that may help us identify which ingredients are in the dish. We believe that using the description as part of the extraction of our labels will increase the accuracy of our model, since the labels currently contain a lot of false negatives. Having a lot of false negatives isn't ideal in our situation; in fact, it would be better to have more false positives, because that decreases the risk of someone having an allergic reaction or eating food that doesn't fit their diet.

### Extracting Labels 

In [None]:
def get_ingredient_ids(ingredient_name):
    """Return the ids of all ingredients whose name matches ``ingredient_name``.

    The search runs over the ``ingredient_name`` column of the module-level
    ``ingredients`` table. It is case-sensitive, and ``ingredient_name`` is
    interpreted as a regular expression (pandas' default for
    ``str.contains``), so a plain word such as 'peanut' or 'banana' behaves
    as a substring match.

    Args:
        ingredient_name: Substring or regex to look for in ingredient names.

    Returns:
        A pandas Series with the ``ingredient_id`` of every matching row;
        empty when nothing matches.
    """
    # na=False: a row with a missing name counts as "no match". Without it the
    # mask can contain NaN, and boolean indexing with such a mask raises
    # ValueError.
    name_matches = ingredients['ingredient_name'].str.contains(ingredient_name, na=False)
    return ingredients.loc[name_matches, 'ingredient_id']

def contains_ingredient(dish_id, ingredient):
    """Tell whether a dish appears to contain ``ingredient``.

    A dish counts as containing the ingredient when at least one of these
    holds:

    * an ingredient id linked to the dish in ``dish_ingredients`` is among
      the ids returned by ``get_ingredient_ids(ingredient)``;
    * the dish's ``cleaned_descriptions`` text matches ``ingredient``
      (``str.contains``, so the term is treated as a regex / substring).

    Args:
        dish_id: Value to look up in the ``dish_id`` columns.
        ingredient: Search term, e.g. 'sesame'.

    Returns:
        A truthy value when either check succeeds, otherwise a falsy one.
    """
    # Evidence 1: ids from the ingredient table vs. ids linked to this dish.
    matching_ids = set(get_ingredient_ids(ingredient))
    linked_rows = dish_ingredients[dish_ingredients['dish_id'] == dish_id]
    linked_ids = set(linked_rows['ingredient_id'])

    # Evidence 2: the term shows up in the cleaned description text.
    description = dishes.loc[dishes['dish_id'] == dish_id, 'cleaned_descriptions']
    mentioned = description.str.contains(ingredient).any()

    return (not matching_ids.isdisjoint(linked_ids)) or mentioned

def contains_ingredients(dish_id, ingredients):
    """Tell whether a dish appears to contain any of several ingredients.

    Multi-term version of ``contains_ingredient``: the dish is flagged when
    an ingredient id linked to it in ``dish_ingredients`` matches any of the
    terms, or when its ``cleaned_descriptions`` text matches any of the terms.

    Note that the ``ingredients`` parameter shadows the module-level
    ``ingredients`` table inside this function; ``get_ingredient_ids`` still
    reads the module-level table.

    Args:
        dish_id: Value to look up in the ``dish_id`` columns.
        ingredients: Iterable of search terms, e.g. ['soy', 'tofu'].

    Returns:
        True when either check succeeds for at least one term, else False.
    """
    terms = list(ingredients)
    description = dishes.loc[dishes['dish_id'] == dish_id, 'cleaned_descriptions']

    # Pool the ids of every ingredient whose name matches one of the terms.
    matching_ids = set().union(*(get_ingredient_ids(term) for term in terms))
    # One flag per term: does the description mention it?
    mentioned = [description.str.contains(term).any() for term in terms]

    linked_rows = dish_ingredients[dish_ingredients['dish_id'] == dish_id]
    linked_ids = set(linked_rows['ingredient_id'])
    return (not matching_ids.isdisjoint(linked_ids)) or any(mentioned)

In [None]:
# Series of dish ids; each labelling step maps every id to a boolean flag.
dish_ids = dishes['dish_id']

# Flag dishes whose linked ingredients or cleaned description match a peanut term.
dishes['contains_peanuts'] = dish_ids.apply(
    lambda dish_id: contains_ingredients(dish_id, ['peanut', 'peanuts'])
)

In [None]:
# Flag dishes that contain egg. The term is matched only at the start of a
# word (regex \b): a bare 'egg' substring makes any leading-space variant
# redundant and also flags words such as "veggie" as containing egg.
# contains_ingredients hands each term to pandas' str.contains, which treats
# it as a regular expression by default, so the pattern works for both the
# ingredient names and the cleaned descriptions.
dishes['contains_egg'] = dish_ids.apply(
    lambda dish_id: contains_ingredients(dish_id, [r'\begg'])
)

In [None]:
# Flag dishes whose linked ingredients or cleaned description match 'sesame'.
dishes['contains_sesame'] = dish_ids.apply(
    lambda dish_id: contains_ingredient(dish_id, 'sesame')
)

In [None]:
# Flag dishes that match any of the listed fish names. Terms are matched as
# substrings, so short ones (e.g. 'cod', 'sole', 'carp') can also hit longer
# words that contain them.
dishes['contains_fish'] = dish_ids.apply(
    lambda dish_id: contains_ingredients(
        dish_id,
        ['pollock', 'carp', 'cod', 'dogfish', 'mackerel', 'salmon', 'sole', 'tuna'],
    )
)

In [None]:
# Flag dishes that match any of the listed shellfish names.
dishes['contains_shellfish'] = dish_ids.apply(
    lambda dish_id: contains_ingredients(dish_id, ['crab', 'lobster', 'shrimp'])
)

In [None]:
# Flag dishes that match a soy-related term.
dishes['contains_soy'] = dish_ids.apply(
    lambda dish_id: contains_ingredients(dish_id, ['soy', 'tofu'])
)

In [None]:
# Flag dishes that match any of the listed meat terms ('fish' is included in
# this list as well).
# NOTE(review): terms are matched as substrings, so e.g. 'ham' also matches
# "graham" - confirm that this over-matching is acceptable.
dishes['contains_meat'] = dish_ids.apply(
    lambda dish_id: contains_ingredients(
        dish_id,
        ['meat', 'fish', 'beef', 'steak', 'pork', 'bacon', 'chicken', 'duck',
         'turkey', 'ham', 'salami', 'pheasant', 'goat', 'bison', 'boar'],
    )
)

In [None]:
# Convert all the contains_* columns from booleans to integers
for i in ['peanuts', 'egg', 'sesame', 'fish', 'shellfish', 'soy', 'meat']:
    label = f'contains_{i}'
    dishes[label] = dishes[label].astype(int)
dishes.head()

In [None]:
# Write the processed dishes table (cleaned descriptions plus the contains_*
# label columns) to CSV; index=False leaves the DataFrame index out of the file.
dishes.to_csv('data/processed_dishes_v2.csv', index=False)