In [2]:
import pandas as pd
import numpy as np

In [3]:
recipe_df = pd.read_csv("data/Foodcom/reviews.csv")
recipe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   ReviewId       1401982 non-null  int64 
 1   RecipeId       1401982 non-null  int64 
 2   AuthorId       1401982 non-null  int64 
 3   AuthorName     1401982 non-null  object
 4   Rating         1401982 non-null  int64 
 5   Review         1401768 non-null  object
 6   DateSubmitted  1401982 non-null  object
 7   DateModified   1401982 non-null  object
dtypes: int64(4), object(4)
memory usage: 85.6+ MB


In [4]:
recipe_df = pd.read_csv("data/Foodcom/recipes.csv", usecols=["RecipeId", "Name"], dtype={"RecipeId": "int32", "Name": "str"})
rating_df = pd.read_csv("data/Foodcom/reviews.csv", usecols=["AuthorId", "RecipeId", "Rating"], dtype={"RecipeId": "int32", "Rating": "float32"})


In [5]:
recipe_df.head()

Unnamed: 0,RecipeId,Name
0,38,Low-Fat Berry Blue Frozen Dessert
1,39,Biryani
2,40,Best Lemonade
3,41,Carina's Tofu-Vegetable Kebabs
4,42,Cabbage Soup


In [6]:
rating_df.head()

Unnamed: 0,RecipeId,AuthorId,Rating
0,992,2008,5.0
1,4384,1634,4.0
2,4523,2046,2.0
3,7435,1773,5.0
4,44,2085,5.0


In [7]:
rating_df.shape

(1401982, 3)

In [8]:
counts1 = rating_df["AuthorId"].value_counts()
rating_df = rating_df[rating_df["AuthorId"].isin(counts1[counts1 >= 20].index)]
counts = rating_df["RecipeId"].value_counts()
rating_df = rating_df[rating_df["RecipeId"].isin(counts[counts >= 100].index)]

In [9]:
rating_df.shape

(72320, 3)

In [10]:
df = pd.merge(rating_df, recipe_df, on="RecipeId")
df

Unnamed: 0,RecipeId,AuthorId,Rating,Name
0,2886,2312,5.0,Best Banana Bread
1,2886,9869,5.0,Best Banana Bread
2,2886,25455,4.0,Best Banana Bread
3,2886,25792,5.0,Best Banana Bread
4,2886,28397,5.0,Best Banana Bread
...,...,...,...,...
72315,420398,217634,0.0,Ice Cubes
72316,420398,2156777,5.0,Ice Cubes
72317,420398,527754,5.0,Ice Cubes
72318,420398,1581225,5.0,Ice Cubes


In [11]:
combinedRecipeRating = df.dropna(axis=0, subset=["Name"])
recipeRatingCount = (combinedRecipeRating.
                       groupby(by=["Name"])["Rating"].
                       count().
                       reset_index().
                       rename(columns={"Rating": "totalRatingCount"})
                       [["Name", "totalRatingCount"]]
)
recipeRatingCount.head()

Unnamed: 0,Name,totalRatingCount
0,&quot;Whatever Floats Your Boat&quot; Brownies!,613
1,1 Pan Fudge Cake,104
2,24k Carrots,152
3,4 Minute Spicy Garlic Shrimp,195
4,Absolute Best Ever Lasagna,219


In [12]:
combinedRecipeRating.sort_values(by="RecipeId").head(115)

Unnamed: 0,RecipeId,AuthorId,Rating,Name
48434,76,318235,4.0,Alfredo Sauce
48445,76,724516,5.0,Alfredo Sauce
48432,76,98994,5.0,Alfredo Sauce
48431,76,258877,5.0,Alfredo Sauce
48430,76,37584,5.0,Alfredo Sauce
...,...,...,...,...
48388,76,364433,5.0,Alfredo Sauce
48389,76,414969,5.0,Alfredo Sauce
48390,76,29782,5.0,Alfredo Sauce
48386,76,340576,5.0,Alfredo Sauce


In [13]:
df = pd.DataFrame(combinedRecipeRating)

# Remove the AuthorId column
df = df.drop(columns=['AuthorId'])

# Group by RecipeId and Name, then calculate the mean rating and count of ratings
result = df.groupby(['RecipeId', 'Name']).agg(
    Rating=('Rating', 'mean'),
    NumberRating=('Rating', 'count')
).reset_index()
result

Unnamed: 0,RecipeId,Name,Rating,NumberRating
0,76,Alfredo Sauce,4.587719,114
1,1209,Spinach Artichoke Dip,4.662791,258
2,2496,Dark Chocolate Cake,4.720833,240
3,2642,Taco Seasoning Mix,4.739130,207
4,2886,Best Banana Bread,4.652406,187
...,...,...,...,...
402,221743,Kittencal's Best Blasted Rapid-Roast Whole Chi...,4.802198,182
403,222188,Vanilla Buttercream Frosting (From Sprinkles ...,4.639706,136
404,251220,Kittencal's Fluffiest Scrambled Eggs,4.850000,120
405,261889,Kittencal's Buttery Cut-Out Sugar Cookies W/ I...,4.544910,167


In [14]:
ratingWithTotalRatingCount = combinedRecipeRating.merge(recipeRatingCount, left_on="Name", right_on="Name", how="left")
# ratingWithTotalRatingCount = ratingWithTotalRatingCount.drop(columns="AuthorId")
ratingWithTotalRatingCount.head()

Unnamed: 0,RecipeId,AuthorId,Rating,Name,totalRatingCount
0,2886,2312,5.0,Best Banana Bread,187
1,2886,9869,5.0,Best Banana Bread,187
2,2886,25455,4.0,Best Banana Bread,187
3,2886,25792,5.0,Best Banana Bread,187
4,2886,28397,5.0,Best Banana Bread,187


In [15]:
ratingWithTotalRatingCount.sort_values(by="RecipeId", ascending=False)

Unnamed: 0,RecipeId,AuthorId,Rating,Name,totalRatingCount
72319,420398,2001604211,0.0,Ice Cubes,140
72223,420398,325290,5.0,Ice Cubes,140
72229,420398,193709,5.0,Ice Cubes,140
72228,420398,1269562,5.0,Ice Cubes,140
72227,420398,1230562,5.0,Ice Cubes,140
...,...,...,...,...,...
48433,76,383346,5.0,Alfredo Sauce,114
48434,76,318235,4.0,Alfredo Sauce,114
48435,76,407007,5.0,Alfredo Sauce,114
48436,76,394617,5.0,Alfredo Sauce,114


In [16]:
pd.set_option("display.float_format", lambda x: "%.3f" % x)
print(recipeRatingCount["totalRatingCount"].describe())

count    407.000
mean     177.690
std      107.240
min      100.000
25%      116.000
50%      146.000
75%      195.500
max     1165.000
Name: totalRatingCount, dtype: float64


In [17]:
ratingWithTotalRatingCount.shape

(72320, 5)

In [18]:
recipeFeatures = ratingWithTotalRatingCount.pivot_table(index="Name", columns="AuthorId", values="Rating").fillna(0)
recipeFeatures.head()

AuthorId,1533,1535,1676,1792,1891,1962,2310,2312,2586,2695,...,2001604211,2001625595,2002093000,2002169932,2002256447,2002273175,2002312797,2002321540,2002404048,2002754832
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Whatever Floats Your Boat&quot; Brownies!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1 Pan Fudge Cake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24k Carrots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Minute Spicy Garlic Shrimp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
Absolute Best Ever Lasagna,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
recipeFeatures.shape

(407, 7596)

In [20]:
from scipy.sparse import csr_matrix
recipeFeaturesMatrix = csr_matrix(recipeFeatures.values)

from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(recipeFeaturesMatrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [21]:
recipeFeatures.shape

(407, 7596)

In [22]:
queryIndex = np.random.choice(recipeFeatures.shape[0])
print(queryIndex)
distances, indices = model.kneighbors(recipeFeatures.iloc[queryIndex, :].values.reshape(1, -1), n_neighbors=6)

240


In [23]:
recipeFeatures.head()

AuthorId,1533,1535,1676,1792,1891,1962,2310,2312,2586,2695,...,2001604211,2001625595,2002093000,2002169932,2002256447,2002273175,2002312797,2002321540,2002404048,2002754832
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Whatever Floats Your Boat&quot; Brownies!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1 Pan Fudge Cake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24k Carrots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Minute Spicy Garlic Shrimp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
Absolute Best Ever Lasagna,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
recipeFeatures.index[queryIndex]

'Mahogany Beef Stew'

In [25]:
for i in range(0, len(distances.flatten())):
    if i==0:
        print(f"Recommendation for {format(recipeFeatures.index[queryIndex])}:\n")
    else:
        print(f"{i}: {recipeFeatures.index[indices.flatten()[i]]}, with distance of {distances.flatten()[i]}:")

Recommendation for Mahogany Beef Stew:

1: Jo Mama's World Famous Spaghetti, with distance of 0.8625703454017639:
2: Creamy Cajun Chicken Pasta, with distance of 0.865392804145813:
3: Mean Guy's General Tso's Chicken, with distance of 0.8783514499664307:
4: Japanese Mum's Chicken, with distance of 0.8795065879821777:
5: Beef Tips, with distance of 0.8870410323143005:


In [26]:
recipe_names = recipeFeatures.index


In [27]:
result

Unnamed: 0,RecipeId,Name,Rating,NumberRating
0,76,Alfredo Sauce,4.588,114
1,1209,Spinach Artichoke Dip,4.663,258
2,2496,Dark Chocolate Cake,4.721,240
3,2642,Taco Seasoning Mix,4.739,207
4,2886,Best Banana Bread,4.652,187
...,...,...,...,...
402,221743,Kittencal's Best Blasted Rapid-Roast Whole Chi...,4.802,182
403,222188,Vanilla Buttercream Frosting (From Sprinkles ...,4.640,136
404,251220,Kittencal's Fluffiest Scrambled Eggs,4.850,120
405,261889,Kittencal's Buttery Cut-Out Sugar Cookies W/ I...,4.545,167


In [28]:
df = pd.read_csv("data/Foodcom/recipes.csv")
final_df = pd.merge(result, df.drop(columns=['Name']), on='RecipeId', how='left')

final_df

Unnamed: 0,RecipeId,Name,Rating,NumberRating,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,76,Alfredo Sauce,4.588,114,1535,Marg CaymanDesigns,PT10M,PT5M,PT15M,1999-09-15T04:26:00Z,...,31.500,159.000,312.400,2.500,0.000,0.200,8.700,4.000,,"c(""Place butter in microwave safe pot and heat..."
1,1209,Spinach Artichoke Dip,4.663,258,1755,Juli9251,,PT30M,PT30M,1999-09-22T22:26:00Z,...,6.500,34.100,291.600,5.500,2.800,1.400,7.700,15.000,4 1/2 cups,"c(""Preheat oven to 375°F."", ""Mix together Parm..."
2,2496,Dark Chocolate Cake,4.721,240,1535,Marg CaymanDesigns,PT50M,PT10M,PT1H,1999-10-23T12:27:00Z,...,1.400,20.300,252.400,30.900,1.400,20.100,2.800,,,"c(""Heat oven to 350°F."", ""Grease and flour two..."
3,2642,Taco Seasoning Mix,4.739,207,1539,opal Fitzgerald,,PT5M,PT5M,1999-08-15T07:45:00Z,...,0.100,0.000,2372.500,6.000,1.500,1.500,0.900,,1 package,"c(""Combine all ingredients in a small bowl and..."
4,2886,Best Banana Bread,4.652,187,1762,lkadlec,PT1H,PT10M,PT1H10M,1999-09-26T20:49:00Z,...,6.200,61.600,338.300,42.500,1.400,24.400,3.700,10.000,1 loaf,"c(""Remove odd pots and pans from oven."", ""Preh..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,221743,Kittencal's Best Blasted Rapid-Roast Whole Chi...,4.802,182,89831,Kittencalrecipezazz,PT1H,PT20M,PT1H20M,2007-04-09T22:06:00Z,...,17.200,243.800,294.500,2.600,0.600,0.500,57.600,,,"c(""Rinse the chicken inside and out well under..."
403,222188,Vanilla Buttercream Frosting (From Sprinkles ...,4.640,136,454328,C. Taylor,,PT10M,PT10M,2007-04-12T20:30:00Z,...,9.700,40.700,27.200,35.000,0.000,34.300,0.200,,12 cupcakes,"c(""In a bowl add sugar then add butter. Beat t..."
404,251220,Kittencal's Fluffiest Scrambled Eggs,4.850,120,89831,Kittencalrecipezazz,,PT5M,PT5M,2007-09-05T21:04:00Z,...,5.900,384.200,496.300,1.500,0.000,0.400,13.100,2.000,,"c(""In a bowl whisk the eggs vigorously with mi..."
405,261889,Kittencal's Buttery Cut-Out Sugar Cookies W/ I...,4.545,167,89831,Kittencalrecipezazz,PT4M,PT2H,PT2H4M,2007-10-28T19:05:00Z,...,3.500,24.600,107.800,22.000,0.300,12.600,1.600,,,"c(""For cookies; in a large bowl combine butter..."


In [29]:
final_df.sort_values(by="RecipeId")

Unnamed: 0,RecipeId,Name,Rating,NumberRating,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,76,Alfredo Sauce,4.588,114,1535,Marg CaymanDesigns,PT10M,PT5M,PT15M,1999-09-15T04:26:00Z,...,31.500,159.000,312.400,2.500,0.000,0.200,8.700,4.000,,"c(""Place butter in microwave safe pot and heat..."
1,1209,Spinach Artichoke Dip,4.663,258,1755,Juli9251,,PT30M,PT30M,1999-09-22T22:26:00Z,...,6.500,34.100,291.600,5.500,2.800,1.400,7.700,15.000,4 1/2 cups,"c(""Preheat oven to 375°F."", ""Mix together Parm..."
2,2496,Dark Chocolate Cake,4.721,240,1535,Marg CaymanDesigns,PT50M,PT10M,PT1H,1999-10-23T12:27:00Z,...,1.400,20.300,252.400,30.900,1.400,20.100,2.800,,,"c(""Heat oven to 350°F."", ""Grease and flour two..."
3,2642,Taco Seasoning Mix,4.739,207,1539,opal Fitzgerald,,PT5M,PT5M,1999-08-15T07:45:00Z,...,0.100,0.000,2372.500,6.000,1.500,1.500,0.900,,1 package,"c(""Combine all ingredients in a small bowl and..."
4,2886,Best Banana Bread,4.652,187,1762,lkadlec,PT1H,PT10M,PT1H10M,1999-09-26T20:49:00Z,...,6.200,61.600,338.300,42.500,1.400,24.400,3.700,10.000,1 loaf,"c(""Remove odd pots and pans from oven."", ""Preh..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,221743,Kittencal's Best Blasted Rapid-Roast Whole Chi...,4.802,182,89831,Kittencalrecipezazz,PT1H,PT20M,PT1H20M,2007-04-09T22:06:00Z,...,17.200,243.800,294.500,2.600,0.600,0.500,57.600,,,"c(""Rinse the chicken inside and out well under..."
403,222188,Vanilla Buttercream Frosting (From Sprinkles ...,4.640,136,454328,C. Taylor,,PT10M,PT10M,2007-04-12T20:30:00Z,...,9.700,40.700,27.200,35.000,0.000,34.300,0.200,,12 cupcakes,"c(""In a bowl add sugar then add butter. Beat t..."
404,251220,Kittencal's Fluffiest Scrambled Eggs,4.850,120,89831,Kittencalrecipezazz,,PT5M,PT5M,2007-09-05T21:04:00Z,...,5.900,384.200,496.300,1.500,0.000,0.400,13.100,2.000,,"c(""In a bowl whisk the eggs vigorously with mi..."
405,261889,Kittencal's Buttery Cut-Out Sugar Cookies W/ I...,4.545,167,89831,Kittencalrecipezazz,PT4M,PT2H,PT2H4M,2007-10-28T19:05:00Z,...,3.500,24.600,107.800,22.000,0.300,12.600,1.600,,,"c(""For cookies; in a large bowl combine butter..."


In [35]:
final_df.shape

(407, 30)

In [34]:
def recommend_recipe(name):
    id = np.where(recipeFeatures.index==name)[0][0]
    distances, indices = model.kneighbors(recipeFeatures.iloc[id, :].values.reshape(1, -1), n_neighbors=6)
    # recipe_name = []
    ids_index = []
    poster_url = []

    # for recipe_id in indices:
    #     recipe_name.append(recipeFeatures.index[recipe_id])

    # for name in recipe_name[0]: 
    #     ids = np.where(final_df['Name'] == name)[0][0]
    #     ids_index.append(ids)
    # for idx in ids_index:
    #     url = final_df.iloc[idx]['Images']
    #     urls = url.strip('c()').replace('"', '').split(', ')
    #     print(urls[0])
    #     print(1)
    # poster_url.append(urls[0])
        # for url in urls:
        #     print(url)
    # for idx in ids_index:
    #     url = final_df.iloc[idx]['Images']
    #     poster_url.append(url)
    for i in range(len(indices)):
        recipes = recipeFeatures.index[indices[i]]
        for j in recipes:
            print(j)
    # print(recipe_id)
    # print(ids_index)
    # print(poster_url)
        # if i==0:
        #     print(f"Recommendation for {format(recipeFeatures.index[id])}:\n")
        # else:
        #     print(f"{i}: {recipeFeatures.index[indices.flatten()[i]]}, with distance of {distances.flatten()[i]}:")

    
name = "Spinach Artichoke Dip"
recommend_recipe(name)

Spinach Artichoke Dip
Creamy Cajun Chicken Pasta
Bourbon Chicken
Crock-Pot Chicken With Black Beans &amp; Cream Cheese
Mexican Rice
Chicken Parmesan


In [31]:
recipe_names[0]

'&quot;Whatever Floats Your Boat&quot; Brownies!'

In [32]:
ids_index = []
for name in recipe_names[0]: 
    ids = np.where(final_df['Name'] == name)[0]
    ids_index.append(ids)
final_df.columns


Index(['RecipeId', 'Name', 'Rating', 'NumberRating', 'AuthorId', 'AuthorName',
       'CookTime', 'PrepTime', 'TotalTime', 'DatePublished', 'Description',
       'Images', 'RecipeCategory', 'Keywords', 'RecipeIngredientQuantities',
       'RecipeIngredientParts', 'AggregatedRating', 'ReviewCount', 'Calories',
       'FatContent', 'SaturatedFatContent', 'CholesterolContent',
       'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent',
       'ProteinContent', 'RecipeServings', 'RecipeYield',
       'RecipeInstructions'],
      dtype='object')

In [33]:
import pickle
pickle.dump(model, open("model/model1.pickle", "wb"))
pickle.dump(recipe_names, open("model/recipe_names.pickle", "wb"))
pickle.dump(final_df, open("model/final_df.pickle", "wb"))
pickle.dump(recipeFeatures, open("model/pivot.pickle", "wb"))
