In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
root_path = '../../00. Data'
recipes_raw = pd.read_csv(f'{root_path}/RAW_recipes.csv')
reviews_raw = pd.read_csv(f'{root_path}/RAW_interactions.csv')

# Preprocessing

## Utils

### String cleanup

In [None]:
def string_cleanup(obj, numeric=False):
    pass 

## Recipes

In [28]:
recipes_raw.shape

(231637, 12)

In [29]:
recipes_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [31]:
recipes_raw.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [33]:
recipes = recipes_raw.copy()

### Splitting nutrition data

### Selecting tags

### Converting pseudo-lists

### Creating metadata

### Scaling numeric data

In [12]:
recipes_pp = recipes_raw.copy()

recipes_pp['tags'] = recipes_raw['tags'].map(lambda x: x[1:-1].replace("'", ""))
recipes_pp['steps'] = recipes_raw['steps'].map(lambda x: x[1:-1].replace("'", ""))
recipes_pp['ingredients'] = recipes_raw['ingredients'].map(lambda x: x[1:-1].replace("'", ""))
recipes_pp['metadata'] = \
    recipes_pp['tags'] + " " + recipes_pp['steps'] + " " + \
    recipes_pp['description'] + " " + recipes_pp['ingredients']

In [13]:
recipes_pp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
 12  metadata        226658 non-null  object
dtypes: int64(5), object(8)
memory usage: 23.0+ MB


In [14]:
recipes_pp.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,metadata
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"60-minutes-or-less, time-to-make, course, main...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"make a choice and proceed with recipe, dependi...",autumn is my favorite time of year to cook! th...,"winter squash, mexican seasoning, mixed spice,...",7,"60-minutes-or-less, time-to-make, course, main..."
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"30-minutes-or-less, time-to-make, course, main...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"preheat oven to 425 degrees f, press dough int...",this recipe calls for the crust to be prebaked...,"prepared pizza crust, sausage patty, eggs, mil...",6,"30-minutes-or-less, time-to-make, course, main..."
2,all in the kitchen chili,112140,130,196586,2005-02-25,"time-to-make, course, preparation, main-dish, ...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"brown ground beef in large pot, add chopped on...",this modified version of 'mom's' chili was a h...,"ground beef, yellow onions, diced tomatoes, to...",13,"time-to-make, course, preparation, main-dish, ..."
3,alouette potatoes,59389,45,68585,2003-04-14,"60-minutes-or-less, time-to-make, course, main...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,place potatoes in a large pot of lightly salte...,"this is a super easy, great tasting, make ahea...","spreadable cheese with garlic and herbs, new p...",11,"60-minutes-or-less, time-to-make, course, main..."
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"weeknight, time-to-make, course, main-ingredie...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"mix all ingredients& boil for 2 1 / 2 hours , ...",my dh's amish mother raised him on this recipe...,"tomato juice, apple cider vinegar, sugar, salt...",8,"weeknight, time-to-make, course, main-ingredie..."


In [20]:
recipes_pp['metadata'].isnull().sum()

4979

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(recipes_pp['metadata'])
count_df = pd.DataFrame(count_matrix.toarray(), index=recipes_pp.index.tolist())
count_df.head()

ValueError: np.nan is an invalid document, expected byte or unicode string.