# Data Cleaning

In [1]:
import ast

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory holding the raw CSV exports, relative to this notebook.
root_path = '../../Fed_up/data/raw'

# These columns are parsed from their text form into Python lists at load time.
# ast.literal_eval only accepts literals (lists, strings, numbers, ...), so a
# crafted CSV cell cannot execute arbitrary code the way the builtin eval would.
recipe_converters = {
    'tags': ast.literal_eval,
    'nutrition': ast.literal_eval,
    'steps': ast.literal_eval,
    'ingredients': ast.literal_eval,
}
# No per-column conversion is applied to the interactions file.
review_converters = {}

recipes_raw = pd.read_csv(f'{root_path}/RAW_recipes.csv', converters=recipe_converters)
reviews_raw = pd.read_csv(f'{root_path}/RAW_interactions.csv', converters=review_converters)

## Utility Functions

## Recipes

In [3]:
# (rows, columns) of the raw recipes table.
recipes_raw.shape

(231637, 12)

In [4]:
# Per-column dtype and non-null count; used to spot columns with missing values.
recipes_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [5]:
# Preview the first rows of the raw recipes table.
recipes_raw.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8


In [6]:
# Work on a copy so recipes_raw keeps the data exactly as loaded.
recipes = recipes_raw.copy()

### Nutrition  

In [7]:
# Output column names, in the same order as the values inside each nutrition list.
nutrition_cols = ['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']

def clean_nutrition(df, col='nutrition'):
    """Expand a column of per-row nutrition lists into one float column per value.

    Position ``i`` of every list in ``df[col]`` is written to
    ``df[nutrition_cols[i]]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the list column. It is modified in place: the columns
        named in ``nutrition_cols`` are added or overwritten.
    col : str, default 'nutrition'
        Name of the column whose cells are equal-length sequences of numbers.

    Returns
    -------
    pandas.DataFrame
        ``df[nutrition_cols]``, i.e. only the expanded columns.
    """
    if len(df) == 0:
        # np.array([]) is 1-D, so the 2-D column slicing below would raise
        # IndexError on an empty frame; build an explicit (0, n) array instead.
        data = np.empty((0, len(nutrition_cols)), dtype=float)
    else:
        data = np.array(df[col].to_list())

    for index, nut_col in enumerate(nutrition_cols):
        df[nut_col] = data[:, index].astype(float)

    return df[nutrition_cols]

In [8]:
# clean_nutrition writes the expanded columns into `recipes` itself and also
# returns them, so this assignment stores the same values a second time.
recipes[nutrition_cols] = clean_nutrition(recipes)

In [9]:
# Show the original list column next to the expanded columns for a visual check.
nutrition_df = recipes[['name', 'nutrition', *nutrition_cols]]
nutrition_df.head()

Unnamed: 0,name,nutrition,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,arriba baked winter squash mexican style,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,"[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,"[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,"[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [10]:
# Count missing values per column of the nutrition view.
nutrition_df.isnull().sum()

name             1
nutrition        0
calories         0
total_fat        0
sugar            0
sodium           0
protein          0
saturated_fat    0
carbohydrates    0
dtype: int64

### List Cols (Tags, Steps, Ingredients)

In [11]:
def stringify(col, commas=True):
    """Join the items of ``col`` into a single string.

    Items are separated by ``', '`` when ``commas`` is true and by a single
    space otherwise.
    """
    separator = ', ' if commas else ' '
    return separator.join(col)

In [12]:
# Columns whose cells are lists of strings after loading.
list_cols = ['tags', 'steps', 'ingredients']

def stringify_cols(data, cols, commas=True):
    """Replace each list-valued column in ``cols`` with its joined string form.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    cols : iterable of str
        Names of the columns to convert.
    commas : bool, default True
        Forwarded to ``stringify``: join with ``', '`` when true, ``' '`` otherwise.

    Returns
    -------
    None
        The columns of ``data`` are overwritten.
    """
    def _join(value):
        # Joining a str would splice the separator between every character, so
        # cells that are already strings (e.g. when this notebook cell is run a
        # second time) are passed through unchanged.
        return value if isinstance(value, str) else stringify(value, commas=commas)

    for col in cols:
        data[col] = data[col].map(_join)

In [13]:
# Convert the list columns of `recipes` to comma-separated strings, in place.
stringify_cols(recipes, list_cols)

In [14]:
# Preview the frame after the list columns were turned into strings.
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"60-minutes-or-less, time-to-make, course, main...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"make a choice and proceed with recipe, dependi...",autumn is my favorite time of year to cook! th...,"winter squash, mexican seasoning, mixed spice,...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"30-minutes-or-less, time-to-make, course, main...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"preheat oven to 425 degrees f, press dough int...",this recipe calls for the crust to be prebaked...,"prepared pizza crust, sausage patty, eggs, mil...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,196586,2005-02-25,"time-to-make, course, preparation, main-dish, ...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"brown ground beef in large pot, add chopped on...",this modified version of 'mom's' chili was a h...,"ground beef, yellow onions, diced tomatoes, to...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,59389,45,68585,2003-04-14,"60-minutes-or-less, time-to-make, course, main...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,place potatoes in a large pot of lightly salte...,"this is a super easy, great tasting, make ahea...","spreadable cheese with garlic and herbs, new p...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"weeknight, time-to-make, course, main-ingredie...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"mix all ingredients& boil for 2 1 / 2 hours , ...",my dh's amish mother raised him on this recipe...,"tomato juice, apple cider vinegar, sugar, salt...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


### Data Conversion

In [15]:
# 'submitted' is loaded as object dtype (see recipes_raw.info()); parse it into datetimes.
recipes['submitted'] = pd.to_datetime(recipes['submitted'])
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"60-minutes-or-less, time-to-make, course, main...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"make a choice and proceed with recipe, dependi...",autumn is my favorite time of year to cook! th...,"winter squash, mexican seasoning, mixed spice,...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"30-minutes-or-less, time-to-make, course, main...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"preheat oven to 425 degrees f, press dough int...",this recipe calls for the crust to be prebaked...,"prepared pizza crust, sausage patty, eggs, mil...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,196586,2005-02-25,"time-to-make, course, preparation, main-dish, ...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"brown ground beef in large pot, add chopped on...",this modified version of 'mom's' chili was a h...,"ground beef, yellow onions, diced tomatoes, to...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,59389,45,68585,2003-04-14,"60-minutes-or-less, time-to-make, course, main...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,place potatoes in a large pot of lightly salte...,"this is a super easy, great tasting, make ahea...","spreadable cheese with garlic and herbs, new p...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"weeknight, time-to-make, course, main-ingredie...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"mix all ingredients& boil for 2 1 / 2 hours , ...",my dh's amish mother raised him on this recipe...,"tomato juice, apple cider vinegar, sugar, salt...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


### Metadata

In [19]:
# Concatenate the text fields into one string per recipe.
# 'description' has missing values (226658 non-null of 231637 rows); NaN would
# propagate through the string concatenation and turn the whole metadata cell
# into NaN, so missing descriptions are treated as empty strings.
recipes['metadata'] = recipes['tags'] + " " + recipes['ingredients'] \
                      + " " + recipes['steps'] + " " + recipes['description'].fillna('')

### Reordering Columns

In [20]:
# List the current column names and their order.
recipes.columns

Index(['id', 'contributor_id', 'name', 'minutes', 'n_steps', 'n_ingredients',
       'calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat',
       'carbohydrates', 'tags', 'ingredients', 'steps', 'description',
       'submitted', 'metadata'],
      dtype='object')

In [21]:
# Final column layout: identifiers, numeric features, text fields, then the date.
# Columns not listed here (the original 'nutrition' list column) are dropped.
column_order = [
    'id', 'contributor_id', 'name', 'minutes', 'n_steps', 'n_ingredients',
    'calories', 'total_fat', 'sugar', 'sodium', 'protein',
    'saturated_fat', 'carbohydrates',
    'tags', 'ingredients', 'steps', 'description', 'metadata', 'submitted',
]
recipes = recipes.reindex(columns=column_order)

In [22]:
# Preview the frame with the reordered columns.
recipes.head()

Unnamed: 0,id,contributor_id,name,minutes,n_steps,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates,tags,ingredients,steps,description,metadata,submitted
0,137739,47892,arriba baked winter squash mexican style,55,11,7,51.5,0.0,13.0,0.0,2.0,0.0,4.0,"60-minutes-or-less, time-to-make, course, main...","winter squash, mexican seasoning, mixed spice,...","make a choice and proceed with recipe, dependi...",autumn is my favorite time of year to cook! th...,"60-minutes-or-less, time-to-make, course, main...",2005-09-16
1,31490,26278,a bit different breakfast pizza,30,9,6,173.4,18.0,0.0,17.0,22.0,35.0,1.0,"30-minutes-or-less, time-to-make, course, main...","prepared pizza crust, sausage patty, eggs, mil...","preheat oven to 425 degrees f, press dough int...",this recipe calls for the crust to be prebaked...,"30-minutes-or-less, time-to-make, course, main...",2002-06-17
2,112140,196586,all in the kitchen chili,130,6,13,269.8,22.0,32.0,48.0,39.0,27.0,5.0,"time-to-make, course, preparation, main-dish, ...","ground beef, yellow onions, diced tomatoes, to...","brown ground beef in large pot, add chopped on...",this modified version of 'mom's' chili was a h...,"time-to-make, course, preparation, main-dish, ...",2005-02-25
3,59389,68585,alouette potatoes,45,11,11,368.1,17.0,10.0,2.0,14.0,8.0,20.0,"60-minutes-or-less, time-to-make, course, main...","spreadable cheese with garlic and herbs, new p...",place potatoes in a large pot of lightly salte...,"this is a super easy, great tasting, make ahea...","60-minutes-or-less, time-to-make, course, main...",2003-04-14
4,44061,41706,amish tomato ketchup for canning,190,5,8,352.9,1.0,337.0,23.0,3.0,0.0,28.0,"weeknight, time-to-make, course, main-ingredie...","tomato juice, apple cider vinegar, sugar, salt...","mix all ingredients& boil for 2 1 / 2 hours , ...",my dh's amish mother raised him on this recipe...,"weeknight, time-to-make, course, main-ingredie...",2002-10-25


In [23]:
# Inspect the combined metadata text of the row labelled 0.
recipes.loc[0, 'metadata']

'60-minutes-or-less, time-to-make, course, main-ingredient, cuisine, preparation, occasion, north-american, side-dishes, vegetables, mexican, easy, fall, holiday-event, vegetarian, winter, dietary, christmas, seasonal, squash winter squash, mexican seasoning, mixed spice, honey, butter, olive oil, salt make a choice and proceed with recipe, depending on size of squash , cut into half or fourths, remove seeds, for spicy squash , drizzle olive oil or melted butter over each cut squash piece, season with mexican seasoning mix ii, for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece, season with sweet mexican spice mix, bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin, be careful not to burn the squash especially if you opt to use sugar or butter, if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking, if desired , seaso