# Dataset Preparation
Author: Shiyi Wang

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

Load two datasets

In [2]:
# Read only the columns that the rest of the notebook uses.
interactions = pd.read_csv(
    '../data/interactions.csv',
    usecols=['user_id', 'recipe_id', 'rating'],
)
recipes = pd.read_csv(
    '../data/recipes.csv',
    usecols=['name', 'id'],
)

Merge the two datasets on `recipe_id`

In [3]:
# Give the recipe key the same name as in `interactions`, then look up each
# interaction's recipe name by `recipe_id` (join defaults to a left join).
recipes = recipes.rename(columns={"id": "recipe_id"})
data = interactions.join(
    recipes.set_index('recipe_id'),
    on='recipe_id',
)

Add review count column

In [4]:
def setReviewCountCol(series, name):
    """Count how many times each distinct value occurs in ``series``.

    Args:
        series: Iterable of hashable values (for example a pandas Series of IDs).
        name: Label for the value column; the count column is labelled
            ``f'{name}_count'``.

    Returns:
        A DataFrame with a default integer index and exactly two columns,
        ``name`` and ``f'{name}_count'``, holding one row per distinct value in
        the order ``Counter`` yields them. An empty input gives an empty frame
        that still has both columns.
    """
    counts = Counter(series)
    # Build from (value, count) pairs with explicit column names: going through
    # DataFrame.from_dict(..., orient='index') loses the count column entirely
    # when there is nothing to count.
    return pd.DataFrame(list(counts.items()), columns=[name, f'{name}_count'])

In [5]:
# One row per distinct recipe_id, with the number of interaction rows for it.
recipe_id_count = setReviewCountCol(interactions['recipe_id'], 'recipe_id')

Merge the review counts into the dataset

In [6]:
# Attach each recipe's review count to every interaction row.
# `data` is reassigned in this cell, so a second run would merge onto a frame
# that already has `recipe_id_count`; pandas would then emit
# `recipe_id_count_x` / `recipe_id_count_y` and the later
# `data['recipe_id_count']` lookup would fail. Dropping any stale copy first
# makes the cell safe to re-run (a no-op on the first run).
data = data.drop(columns='recipe_id_count', errors='ignore').merge(
    recipe_id_count, how='left', on='recipe_id')
data

Unnamed: 0,user_id,recipe_id,rating,name,recipe_id_count
0,38094,40893,4,white bean green chile pepper soup,2
1,1293707,40893,5,white bean green chile pepper soup,2
2,8937,44394,4,devilicious cookie cake delights,1
3,126440,85009,5,baked potato toppings,2
4,57222,85009,5,baked potato toppings,2
...,...,...,...,...,...
1132362,116593,72730,0,cranberry peach maple relish,1
1132363,583662,386618,5,stacey e s yummy veggie burgers,1
1132364,157126,78003,5,pot roast with port stove top,4
1132365,53932,78003,4,pot roast with port stove top,4


Remove recipes that have only one review

In [7]:
# Keep only the rows whose recipe has more than one review.
data_filtered = data.loc[data['recipe_id_count'].gt(1)]
data_filtered

Unnamed: 0,user_id,recipe_id,rating,name,recipe_id_count
0,38094,40893,4,white bean green chile pepper soup,2
1,1293707,40893,5,white bean green chile pepper soup,2
3,126440,85009,5,baked potato toppings,2
4,57222,85009,5,baked potato toppings,2
5,52282,120345,4,sugared raspberries,3
...,...,...,...,...,...
1132360,2002357020,82303,5,easy microwave hot fudge topping,19
1132361,102526,54493,0,garlic clove chicken,2
1132364,157126,78003,5,pot roast with port stove top,4
1132365,53932,78003,4,pot roast with port stove top,4


Clean up by dropping `recipe_id_count` column

In [8]:
# The count column was only needed for filtering; remove it again.
data_cleaned = data_filtered.drop(columns='recipe_id_count')
data_cleaned

Unnamed: 0,user_id,recipe_id,rating,name
0,38094,40893,4,white bean green chile pepper soup
1,1293707,40893,5,white bean green chile pepper soup
3,126440,85009,5,baked potato toppings
4,57222,85009,5,baked potato toppings
5,52282,120345,4,sugared raspberries
...,...,...,...,...
1132360,2002357020,82303,5,easy microwave hot fudge topping
1132361,102526,54493,0,garlic clove chicken
1132364,157126,78003,5,pot roast with port stove top
1132365,53932,78003,4,pot roast with port stove top


Reserialize recipe and user IDs as consecutive integers starting from 0

In [9]:
def reserialize(values):
    """Map each distinct entry of ``values`` to its position in sorted order.

    ``np.unique`` returns the distinct entries sorted ascending, so the
    smallest entry maps to 0, the next one to 1, and so on.

    Args:
        values: Array-like of values accepted by ``np.unique``.

    Returns:
        dict whose keys are the distinct entries and whose values are their
        zero-based positions.
    """
    return {
        old_value: new_id
        for new_id, old_value in enumerate(np.unique(values))
    }

In [10]:
# Old ID -> new ID lookups, built from the IDs that survived the filtering.
reserialized_User_ID = reserialize(data_cleaned['user_id'])
reserialized_Recipe_ID = reserialize(data_cleaned['recipe_id'])

In [11]:
# Turn each old-ID -> new-ID dict into a two-column lookup frame:
# the dict keys become the old-ID column, the dict values the new-ID column.
user_ID_converted_dataframe = (
    pd.DataFrame.from_dict(reserialized_User_ID, orient='index')
    .rename_axis('user_id')
    .rename(columns={0: 'new_user_id'})
    .reset_index()
)
recipe_ID_converted_dataframe = (
    pd.DataFrame.from_dict(reserialized_Recipe_ID, orient='index')
    .rename_axis('recipe_id')
    .rename(columns={0: 'new_recipe_id'})
    .reset_index()
)

Integrate new serialization into dataset

In [12]:
# Look up the new user ID and the new recipe ID for every row, keeping the
# original ID columns alongside them for now.
data_reserialized = (
    data_cleaned
    .join(user_ID_converted_dataframe.set_index('user_id'), on='user_id')
    .join(recipe_ID_converted_dataframe.set_index('recipe_id'), on='recipe_id')
)
data_reserialized

Unnamed: 0,user_id,recipe_id,rating,name,new_user_id,new_recipe_id
0,38094,40893,4,white bean green chile pepper soup,3787,16642
1,1293707,40893,5,white bean green chile pepper soup,95286,16642
3,126440,85009,5,baked potato toppings,14502,34897
4,57222,85009,5,baked potato toppings,6559,34897
5,52282,120345,4,sugared raspberries,5690,49598
...,...,...,...,...,...,...
1132360,2002357020,82303,5,easy microwave hot fudge topping,215681,33795
1132361,102526,54493,0,garlic clove chicken,11621,22415
1132364,157126,78003,5,pot roast with port stove top,17831,32160
1132365,53932,78003,4,pot roast with port stove top,5947,32160


Finalize dataset by keeping the new IDs and removing the old ones.

In [13]:
# Drop the original ID columns, then let the new IDs take over their names.
data_finalized = (
    data_reserialized
    .drop(columns=['user_id', 'recipe_id'])
    .rename(columns={"new_user_id": "user_id", "new_recipe_id": "recipe_id"})
)


Finalized dataset

In [14]:
# Show the finalized dataset as the cell output.
data_finalized

Unnamed: 0,rating,name,user_id,recipe_id
0,4,white bean green chile pepper soup,3787,16642
1,5,white bean green chile pepper soup,95286,16642
3,5,baked potato toppings,14502,34897
4,5,baked potato toppings,6559,34897
5,4,sugared raspberries,5690,49598
...,...,...,...,...
1132360,5,easy microwave hot fudge topping,215681,33795
1132361,0,garlic clove chicken,11621,22415
1132364,5,pot roast with port stove top,17831,32160
1132365,4,pot roast with port stove top,5947,32160


Save as Pickle file

In [15]:
# Write the finalized frame to disk as a pickle, then print a confirmation.
data_finalized.to_pickle(path="../data/processed_data.pkl")
print("Pickle file saved successfully")

Pickle file saved successfully
