# Dataset Preparation
Author: Shiyi Wang

In [36]:
import pandas as pd
import numpy as np
from collections import Counter

Load the two datasets: user–recipe interactions and recipe metadata

In [37]:
# Load only the columns this notebook uses from each CSV file.
interactions = pd.read_csv(
    '../data/interactions.csv',
    usecols=['user_id', 'recipe_id', 'rating'],
)
recipes = pd.read_csv(
    '../data/recipes.csv',
    usecols=['name', 'id'],
)

Merge the two datasets on `recipe_id` so that each interaction carries its recipe name

In [38]:
# Align the recipe key with the column name used in `interactions`.
recipes = recipes.rename({"id": "recipe_id"}, axis="columns")

# Left join: every interaction row is kept and gains the columns of its recipe.
data = interactions.join(
    other=recipes.set_index('recipe_id'),
    on='recipe_id',
    how='left',
)

Count how many reviews each recipe received

In [39]:
def setReviewCountCol(series, name):
    """Count how many times each value occurs in ``series``.

    Parameters
    ----------
    series : iterable of hashable values
        The values to tally, e.g. a DataFrame column.
    name : str
        Label for the column holding the distinct values. The column holding
        the tallies is labelled ``f'{name}_count'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, in order of first appearance, with the
        columns ``name`` and ``f'{name}_count'``. Both columns are always
        present, even when ``series`` is empty.
    """
    counts = Counter(series)
    count_col = f'{name}_count'
    # Build both columns explicitly: going through
    # ``DataFrame.from_dict(..., orient='index')`` and renaming column ``0``
    # silently loses the count column when there is nothing to count.
    frame = pd.DataFrame({
        name: list(counts.keys()),
        count_col: list(counts.values()),
    })
    # Keep the tally column integer-typed even for an empty result.
    return frame.astype({count_col: 'int64'})

In [40]:
# Tally the number of interactions (reviews) recorded for every recipe.
recipe_id_count = setReviewCountCol(interactions['recipe_id'], name='recipe_id')

Merge the review counts into the dataset

In [41]:
# Drop any `recipe_id_count` left over from a previous run of this cell first;
# otherwise re-executing it would produce `recipe_id_count_x` /
# `recipe_id_count_y` instead of a single `recipe_id_count` column.
data = (
    data
    .drop(columns='recipe_id_count', errors='ignore')
    # Left merge keeps every interaction row and attaches its recipe's count.
    .merge(recipe_id_count, how='left', on='recipe_id')
)
data

Unnamed: 0,user_id,recipe_id,rating,name,recipe_id_count
0,38094,40893,4,white bean green chile pepper soup,2
1,1293707,40893,5,white bean green chile pepper soup,2
2,8937,44394,4,devilicious cookie cake delights,1
3,126440,85009,5,baked potato toppings,2
4,57222,85009,5,baked potato toppings,2
...,...,...,...,...,...
1132362,116593,72730,0,cranberry peach maple relish,1
1132363,583662,386618,5,stacey e s yummy veggie burgers,1
1132364,157126,78003,5,pot roast with port stove top,4
1132365,53932,78003,4,pot roast with port stove top,4


Remove recipes that have only one review

In [42]:
# Keep only the rows whose recipe has more than one review.
data_filtered = data.loc[data['recipe_id_count'].gt(1)]
data_filtered

Unnamed: 0,user_id,recipe_id,rating,name,recipe_id_count
0,38094,40893,4,white bean green chile pepper soup,2
1,1293707,40893,5,white bean green chile pepper soup,2
3,126440,85009,5,baked potato toppings,2
4,57222,85009,5,baked potato toppings,2
5,52282,120345,4,sugared raspberries,3
...,...,...,...,...,...
1132360,2002357020,82303,5,easy microwave hot fudge topping,19
1132361,102526,54493,0,garlic clove chicken,2
1132364,157126,78003,5,pot roast with port stove top,4
1132365,53932,78003,4,pot roast with port stove top,4


Clean up by dropping the `recipe_id_count` column, which was only needed for filtering

In [43]:
# The count column has served its purpose for filtering; remove it.
data_cleaned = data_filtered.drop(columns='recipe_id_count')
data_cleaned

Unnamed: 0,user_id,recipe_id,rating,name
0,38094,40893,4,white bean green chile pepper soup
1,1293707,40893,5,white bean green chile pepper soup
3,126440,85009,5,baked potato toppings
4,57222,85009,5,baked potato toppings
5,52282,120345,4,sugared raspberries
...,...,...,...,...
1132360,2002357020,82303,5,easy microwave hot fudge topping
1132361,102526,54493,0,garlic clove chicken
1132364,157126,78003,5,pot roast with port stove top
1132365,53932,78003,4,pot roast with port stove top


In [44]:
# Persist the cleaned frame as a pickle file, then confirm on stdout.
data_cleaned.to_pickle(path="../data/processed_data.pkl")
print("Pickle file saved successfully")

Pickle file saved successfully
