In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [2]:
df_made_train = pd.read_csv("datasets/recipes_made_train.csv")
df_made_test = pd.read_csv("datasets/recipes_made_test.csv")
df_made_val = pd.read_csv("datasets/recipes_made_val.csv")
df_raw_interactions = pd.read_csv("datasets/RAW_interactions.csv")
df_raw_recipes = pd.read_csv("datasets/RAW_recipes.csv")
df_pp_users = pd.read_csv("datasets/PP_users.csv")
df_pp_recipes = pd.read_csv("datasets/PP_recipes.csv")
df_train = pd.read_csv("datasets/interactions_train.csv")
df_test = pd.read_csv("datasets/interactions_test.csv")
df_val = pd.read_csv("datasets/interactions_validation.csv")
df_ingr_map = pd.read_csv("datasets/ingr_map.csv")

In [3]:
def convToMap(data, value_cols):
    if isinstance(value_cols, str):
        value_cols = [value_cols]

    for col in ['id'] + value_cols:
        if col not in data.columns:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")
    
    if(len(value_cols) == 1):
        value_map_1 = defaultdict(set)
        for _, row in data.iterrows():
            value_map_1[row['id']].add(row[value_cols[0]])
        return value_map_1
    else:
        value_map_mult = defaultdict(list)
        for _, row in data.iterrows():
            for col in value_cols:
                value_map_mult[row['id']].append(row[col])
        return value_map_mult

In [4]:
# Cleaning raw recipe to remove recipes that take longer than 1 day, have more than 25 steps, and more than 40 ingredients
df_rr_clean = df_raw_recipes[df_raw_recipes['minutes'] <= 1440]
df_rr_clean = df_rr_clean[df_rr_clean['n_ingredients'] <= 25]
df_rr_clean = df_rr_clean[df_rr_clean['n_steps'] <= 40]



In [5]:
# Mapping ids to ingrediants and recipes for future reference
id_to_ingr = convToMap(df_ingr_map, "replaced")
id_to_recipe = convToMap(df_rr_clean, ["name","minutes", "nutrition", "n_steps", "ingredients", "n_ingredients"])

In [6]:
# Dropping date, u, and i columns
df_train_clean = df_train.drop(["date", "u", "i"], axis=1)
df_test_clean = df_test.drop(["date", "u", "i"], axis=1)
df_val_clean = df_val.drop(["date", "u", "i"], axis=1)

# Removing any recipes that were removed from cleaning the raw recipes
df_train_clean = df_train_clean[df_train_clean['recipe_id'].isin(df_rr_clean['id'])]
df_test_clean = df_test_clean[df_test_clean['recipe_id'].isin(df_rr_clean['id'])]
df_val_clean = df_val_clean[df_val_clean['recipe_id'].isin(df_rr_clean['id'])]

In [7]:
recc_train, recc_test = train_test_split(df_made_train, test_size=0.2, shuffle=True)

In [8]:
# Get most popular recipes by the number of times made
recipeCount = defaultdict(int)
totalMade = 0

for _, data in recc_train.iterrows():
    if data['made']:
        recipeCount[data['recipe_id']] += 1
        totalMade += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

mostPopularbyMade = set()
count = 0
for ic, i in mostPopular:
    count += ic
    mostPopularbyMade.add(i)
    if count > totalMade/2: break

In [9]:
recipe_rating_counts = df_train_clean.groupby("recipe_id").size()
eligible_recipes = recipe_rating_counts[recipe_rating_counts >= 6].index
filtered_data = df_train_clean[df_train_clean["recipe_id"].isin(eligible_recipes)]
recipe_avg_ratings = filtered_data.groupby("recipe_id")["rating"].mean()
mostPopularbyRating_df = recipe_avg_ratings.sort_values(ascending=False).head(int(len(recipe_avg_ratings)/2))

In [10]:
mostPopularbyRating = set()
for id in mostPopularbyRating_df.index:
    mostPopularbyRating.add(id)

In [11]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer/denom
    return 0

In [12]:
correctByMade = 0
correctByRating = 0
for _, data in recc_test.iterrows():
    predMade = 0
    predRating = 0
    r = data['recipe_id']
    if r in mostPopularbyMade:
        predMade = 1
    if r in mostPopularbyRating:
        predRating = 1
    print(str(predMade) + "        " + str(predRating))
    made = int(data['made'])
    if(predMade == made): correctByMade+=1
    if(predRating == made): correctByRating+=1

0        0
0        0
0        0
0        0
1        1
1        1
0        0
0        0
0        0
0        0
1        0
0        0
0        0
0        0
1        1
1        1
1        1
1        0
0        0
0        0
0        0
1        0
0        0
0        0
0        0
0        0
0        0
0        0
0        0
1        0
0        0
0        0
0        0
0        0
0        0
0        0
0        0
0        0
1        0
1        0
0        0
0        0
0        0
0        0
0        0
0        0
0        0
0        0
0        0
1        1
0        1
1        1
1        1
1        1
0        0
0        0
0        0
0        0
0        0
0        0
1        0
0        0
0        0
1        0
0        0
1        0
0        0
0        0
0        0
0        0
0        0
0        0
0        0
0        0
1        1
0        0
1        0
0        0
0        0
0        0
0        0
0        0
0        1
0        0
1        1
0        0
0        0
1        0
0        0
0        0
0        0

In [13]:
print("Accuracy using popularity by times made: " + str(correctByMade/len(recc_test)))
print("Accuracy using popularity by rating: " + str(correctByRating/len(recc_test)))

Accuracy using popularity by times made: 0.7332234569322893
Accuracy using popularity by rating: 0.6375842065868264
