# Food cleaner code

This notebook cleans the recipe database.

It takes the original database of recipes and an extra database of foods, and adds an extra column to the recipe database containing the list of ingredients found in each recipe.

It also creates a cleaned food database, 'foods_clean.csv', which omits the foods that are not used in any recipe.

In [1]:
import os
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import re

%matplotlib inline
# Notebook display settings: preview at most 2 rows per DataFrame and allow
# up to 1000 characters per cell before the text is truncated.
pd.set_option('display.max_rows', 2)
pd.set_option('display.max_colwidth', 1000)

In [2]:
# `reload` is a notebook-level flag: the spreadsheet is read the first time
# this cell runs and is skipped on later runs until `reload` is set back to
# True by hand.
if "reload" not in globals():
    reload = True

if reload:
    # read_excel has no `encoding` parameter in current pandas (passing one
    # raises TypeError), so the file is read with the engine's own decoding.
    db = pd.read_excel('recipeInfo/recipeInfo.xls')

    # Drop the recipes that have no bag-of-words ingredient entry.
    # NOTE(review): rows are filtered on 'ingredients_bag-of-words', but the
    # column used below is 'ingredients_list' -- confirm it has no missing
    # values for the rows that are kept.
    db2 = db.dropna(subset=['ingredients_bag-of-words'], axis=0)

    reload = False

# Work on an independent, lower-cased copy of the two columns of interest.
db_recipes = db2[['title', 'ingredients_list']].copy(deep=True)
db_recipes['title'] = db_recipes['title'].str.lower()
db_recipes['ingredients_list'] = db_recipes['ingredients_list'].str.lower()
db_recipes.head()

Unnamed: 0,title,ingredients_list
0,easy light chocolate milkshake recipe,"put one half cup of milk, 4 tablespoons of chocolate syrup, vanilla extract and ice to taste in whatever your using to blend it. blend until satisfied with the texture and serve.|now what are you waiting for?|go on! enjoy!"
...,...,...
4,old fashioned butterscotch pie recipe #31698,1 1/2 cup brown sugar|1 cup water|3 eggs|4 tablespoons flour|1 cup milk|4 tablespoons butter|1 teaspoon vanilla extract|1/4 teaspoon salt|1/2 teaspoon cream of tartar|4 tablespoons granulated or powdered sugar


In [3]:
db_foods = pd.read_csv('recipeInfo/foods_modif.csv', encoding='latin1')

# Lower-case both text columns so they compare equal to the lower-cased
# recipe text.
db_foods = db_foods.assign(
    name=db_foods['name'].str.lower(),
    food_group=db_foods['food_group'].str.lower(),
)

# Order the rows from the longest food name to the shortest: the name length
# is used as a temporary index, sorted in descending order, then discarded.
db_foods = (
    db_foods
    .set_index(db_foods['name'].str.len())
    .sort_index(ascending=False)
    .reset_index(drop=True)
)

# Food names in that same longest-first order.
foods_list = db_foods['name'].tolist()

db_foods.head()

Unnamed: 0,name,food_group
0,greenland halibut/turbot,aquatic foods
...,...,...
4,arctic ground squirrel,animal foods


In [4]:
# Substrings removed from the text before any food is searched for.
# NOTE(review): presumably these are words that embed a short food name
# (e.g. 'tea' in 'teaspoon', 'gin' in 'virgin'); the misspelled variants look
# intentional and are kept exactly as they are -- confirm against the data.
non_foods_list = ['teaspsoon','teaspooons','teaspooon','teasopoon','teaspoons',
                  'teaspoon','teas.','virgin']

# One entry is appended per match_foods() call: the input text after the
# non-food words were removed and every matched food was replaced by '///'.
residual_list = []

# Irregular plural rules keyed by the last letter of the singular name:
# (number of trailing characters dropped, suffix appended).
_PLURAL_RULES = {
    'y': (1, 'ies'),
    'o': (0, 'es'),
    'h': (0, 'es'),
    'f': (1, 'ves'),
}

# A name counts as present when it is directly followed by one of these.
_DELIMITERS = (' ', ',', '|', 's')


def _plural_form(food):
    """Return the irregular plural of *food*, or None if no rule applies."""
    rule = _PLURAL_RULES.get(food[-1:])
    if rule is None:
        return None
    dropped, suffix = rule
    return food[:len(food) - dropped] + suffix


def match_foods(x, foods=None):
    """Return the food names found in the ingredient text *x*.

    Args:
        x: ingredient text of one recipe (a string).
        foods: food names to look for, tried in the given order. Defaults to
            the module-level ``foods_list``.

    Returns:
        The matched names, in the order they appear in *foods*.

    Side effects:
        Appends the leftover text (matched foods replaced by '///') to
        ``residual_list``.

    A name matches when it is followed by a space, comma, '|' or 's', or when
    its irregular plural (-ies, -es, -ves) occurs in the text. Every match is
    blanked out so that later names in *foods* cannot match inside it.

    NOTE(review): matching is plain substring search with no left-hand word
    boundary, so a name can still match at the end of a longer word;
    ``non_foods_list`` only removes the listed words.
    """
    if foods is None:
        foods = foods_list

    ret_list = []
    # The trailing space lets a food at the very end of the text match the
    # "name followed by a space" test.
    x_copy = x + ' '
    for non_food in non_foods_list:
        x_copy = x_copy.replace(non_food, '')

    for food in foods:
        plural = _plural_form(food)
        found = any((food + delimiter) in x_copy for delimiter in _DELIMITERS)
        if not found and plural is not None:
            found = plural in x_copy
        if not found:
            continue

        ret_list.append(food)
        # Blank out only the forms that were searched for. Replacing the bare
        # stem (food[:-1]) would also erase unrelated words that share it,
        # e.g. 'bee' inside 'beer' after matching 'beef'.
        if plural is not None:
            x_copy = x_copy.replace(plural, '///')
        x_copy = x_copy.replace(food, '///')

    residual_list.append(x_copy)
    return ret_list

# Run match_foods on the ingredient text of every recipe and store the
# resulting list of food names in a new 'ingredients' column (each call also
# appends the leftover text to residual_list).
db_recipes['ingredients'] = db_recipes['ingredients_list'].apply(match_foods)

In [5]:
# count[i] is the number of times foods_list[i] appears in the recipes'
# 'ingredients' lists.
count = [0] * len(foods_list)

# Position of every food name in foods_list, built once so each tally is a
# constant-time lookup instead of a linear list.index() scan. Iterating in
# reverse makes the first occurrence win, exactly as list.index() does.
food_position = {
    name: position
    for position, name in reversed(list(enumerate(foods_list)))
}


def count_food(x):
    """Add one to the tally in ``count`` for every food name in the list *x*."""
    for food in x:
        count[food_position[food]] += 1


db_recipes['ingredients'].apply(count_food)

print(len(count))

1070


In [6]:
# Attach the per-food tally; `count` is aligned with foods_list, which was
# built from db_foods['name'] in the current row order.
db_foods['count'] = count

# Rank the foods from most to least used: the count is used as a temporary
# index, sorted in descending order, then replaced by a fresh 0..n-1 index.
db_foods = (
    db_foods
    .set_index('count', drop=False)
    .sort_index(ascending=False)
    .reset_index(drop=True)
)
db_foods.head()

Unnamed: 0,name,food_group,count
0,salt,baking goods,25071
...,...,...,...
4,garlic,herbs and spices,13699


In [7]:
# Keep only the foods whose count is non-zero, renumber the rows, and write
# the result to the cleaned foods file.
db_foods_clean = db_foods.loc[db_foods['count'] != 0].reset_index(drop=True)
db_foods_clean.to_csv('recipeInfo/foods_clean.csv')