# Dataset Preparation
Author: Shiyi Wang

In [32]:
import os
from collections import Counter

import numpy as np
import pandas as pd

Load the two datasets (user interactions and recipes)

In [33]:
# Read only the columns used downstream; the remaining CSV columns are skipped.
interactions = pd.read_csv(
    './data/interactions.csv',
    usecols=['user_id', 'recipe_id', 'rating'],
)
recipes = pd.read_csv(
    './data/recipes.csv',
    usecols=['name', 'id'],
)

Merge the two datasets on `recipe_id`

In [34]:
# Give the recipe key the same column name it has in `interactions`.
recipes = recipes.rename(columns={"id": "recipe_id"})

# Look up each interaction's recipe by `recipe_id`; `join` defaults to a
# left join, so every interaction row is kept and gains the `name` column.
data = interactions.join(
    recipes.set_index('recipe_id'),
    on='recipe_id',
)

Make a copy for faster processing

In [35]:
# Plain assignment would only bind a second name to the same DataFrame, so any
# in-place change to `data` would also appear in `data_copy`. Take a real copy
# so the merged frame is preserved independently.
data_copy = data.copy()

Add a review count column

In [36]:
def setReviewCountCol(series, name):
    """Count how many times each value occurs in ``series``.

    Parameters
    ----------
    series : iterable of hashable values
        The values to count (e.g. a DataFrame column).
    name : str
        Name of the output column holding the distinct values; the counts
        are stored in a column named ``f'{name}_count'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, in order of first appearance, with the
        columns ``name`` and ``f'{name}_count'`` and a default integer index.
        Both columns are present even when ``series`` is empty.
    """
    count_col = f'{name}_count'
    counts = Counter(series)

    # Building from (value, count) pairs with explicit column names guarantees
    # both columns exist; going through from_dict(orient='index') loses the
    # count column when there is nothing to count.
    result = pd.DataFrame(list(counts.items()), columns=[name, count_col])

    if result.empty:
        # With no rows pandas infers object dtype; pin the dtypes so the frame
        # can still be merged and compared like a populated one.
        result = result.astype({
            name: getattr(series, 'dtype', object),
            count_col: 'int64',
        })
    return result

In [37]:
# Number of reviews (interaction rows) per recipe.
recipe_id_count = setReviewCountCol(interactions['recipe_id'], 'recipe_id')

Merge the review counts into the dataset

In [38]:
# Attach each recipe's review count to every interaction row (left join keeps
# all rows of `data`).
# Any `recipe_id_count` column left over from a previous run of this cell is
# dropped first: otherwise re-running would produce `recipe_id_count_x` /
# `recipe_id_count_y` and later lookups of `recipe_id_count` would fail.
data = (
    data
    .drop(columns='recipe_id_count', errors='ignore')
    .merge(recipe_id_count, how='left', on='recipe_id')
)

Remove recipes with only one review

In [39]:
# Keep only rows whose recipe has more than one review.
has_multiple_reviews = data['recipe_id_count'].gt(1)
data_filtered = data.loc[has_multiple_reviews]

Clean up by dropping `recipe_id_count` column

In [40]:
# The count was only needed for filtering; leave it out of the final frame.
data_cleaned = data_filtered.drop(columns='recipe_id_count')
data_cleaned

Unnamed: 0,user_id,recipe_id,rating,name
0,38094,40893,4,white bean green chile pepper soup
1,1293707,40893,5,white bean green chile pepper soup
3,126440,85009,5,baked potato toppings
4,57222,85009,5,baked potato toppings
5,52282,120345,4,sugared raspberries
...,...,...,...,...
1132360,2002357020,82303,5,easy microwave hot fudge topping
1132361,102526,54493,0,garlic clove chicken
1132364,157126,78003,5,pot roast with port stove top
1132365,53932,78003,4,pot roast with port stove top


In [41]:
# Create the output directory if needed so the save does not fail on a fresh
# checkout after all the processing above has already run.
os.makedirs("./output", exist_ok=True)
data_cleaned.to_pickle("./output/processed_data.pkl")