## Setup

In [1]:
import os
# give googe drive the required permission
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Create a folder in your drive and add the name of that folder here. 
# For example, for the code below to run correctly, you need to have a folder named FoodRecSys in 'My Drive'.  
# The said folder will be your home directory for the rest of the project. 
# You will be able to save and read data from the folder. 

os.chdir("/content/drive/MyDrive/FoodRecSys/")
os.getcwd()

'/content/drive/MyDrive/FoodRecSys'

In [3]:
try:
  import pyspark 
except:
  !pip install pyspark==3.1.2
  import pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.1.2
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.4/212.4 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 KB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880770 sha256=ae476ffbcfc6a55d9513092494300161ba1c03fb3b3fbbe55b26b1e531caec23
  Stored in directory: /root/.cache/pip/wheels/11/17/0b/53e7d10fe66ca7647d391cdba323fcf5b2f9dfcb7ebad87aa7
Successfully built pyspark
Installing collected packages: py4j, py

In [4]:
try:
  import lenskit 
except:
  %pip install lenskit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lenskit
  Downloading lenskit-0.14.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.0/74.0 KB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting csr>=0.3.1
  Downloading csr-0.4.3-py3-none-any.whl (23 kB)
Collecting seedbank>=0.1.0
  Downloading seedbank-0.1.2-py3-none-any.whl (7.9 kB)
Collecting binpickle>=0.3.2
  Downloading binpickle-0.3.4-py3-none-any.whl (13 kB)
Collecting anyconfig
  Downloading anyconfig-0.13.0-py2.py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.8/87.8 KB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: binpickle, anyconfig, seedbank, csr, lenskit
Successfully installed anyconfig-0.13.0 binpickle-0.3.4 csr-0.4.3 lenskit-0.14.2 seedbank-0.1.2


In [5]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

In [6]:
spark = SparkSession.builder.master("local").config('spark.ui.port', '4050').getOrCreate()

In [7]:
spark

## Imports

In [8]:
# import necessary libraries 
import pandas as pd
import numpy as np

In [9]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType,StringType, ArrayType

In [10]:
# Import the required functions for ALS and estimating

In [11]:
from lenskit import batch, topn, util
from lenskit.algorithms import Recommender, als, item_knn as knn

## Read the data

To connect the data files with your google collab, download them and upload them to your home folder in your google drive. 

In [14]:
pwd

'/content/drive/MyDrive/FoodRecSys'

In [15]:
train_ratings_df = spark.read.parquet('model_input/train_interaction_level_df.parquet', # Replace the given path with the path for your file
                                      header=True, 
                                      inferSchema=True)

In [18]:
test_ratings_all_df  = spark.read.parquet('model_input/test_interaction_level_df.parquet', # Replace the given path with the path for your file
                                          header=True, 
                                          inferSchema=True)

In [19]:
raw_recipes_df = spark.read.csv("data/RAW_recipes_cleaned.csv", # Replace the given path with the path for your file
                                header=True, 
                                inferSchema=True)

In [21]:
train_ratings_df

DataFrame[user_id: int, recipe_id: int, rating: int, review: string, review_date: string, name: string, id: int, minutes: int, contributor_id: int, submitted: string, tags: string, nutrition: string, n_steps: int, steps: string, description: string, ingredients: string, n_ingredients: int, year_of_review: int]

In [37]:
from pyspark.sql.functions import countDistinct

In [60]:
train_ratings_df.select(countDistinct("user_id")).first()[0]

23362

In [61]:
# Count the total number of ratings in the dataset
numerator = train_ratings_df.select("rating").count()

# Count the number of distinct userIds and distinct recipe_Ids
num_users = int(train_ratings_df.select(countDistinct("user_id")).first()[0]) # find the number of unique users in the training data. The output must be an integer
num_recipes = int(train_ratings_df.select(countDistinct("recipe_id")).first()[0]) # find the number of unique recipes in the training data. The output must be an integer

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_recipes

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The training dataframe is ", "%.7f" % sparsity + "% empty.")

The training dataframe is  99.9829264% empty.


## Functions

In [67]:
def manual_recommendation_check (user_id):
  '''
  Given a user ID form the test dataset, this function will return the names of the recipes recommended to the user. 

  Initialize this function after the all_recs data frame is calculated. 

  Input user_id of a user from the test set as an integer.  
  Prints the names of recipes recommended to this user. 
  Returns nothing. 
  '''
  recs_user = all_recs_als[all_recs_als.user == user_id]
  recs_user_list = list(recs_user.item.values)
  recs_user_list = [x.item() for x in recs_user_list]
  display((raw_recipes_df.filter(F.col('id').isin(recs_user_list))
               .select("name")
               .collect()
               ))

## Model

#### Task 02 - Collaborative Filtering Model

Add the argument details in the algorithm initialization below to build the ALS model.

In [82]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [71]:
# Create ALS model
als = ALS(userCol= "user_id", # add the name of the column for users  
          itemCol= "recipe_id", # add the name of the column for recipes
          ratingCol=  "rating", # add the name of the column for ratings 
          nonnegative = True, 
          implicitPrefs = False, 
          coldStartStrategy="drop"
         )

In [72]:
assert type(als) == pyspark.ml.recommendation.ALS

#### Training

In [73]:
#Fit the model to the 'train' dataset
model = als.fit(train_ratings_df)

## Prediction 

In [74]:
# use the model to create predictions for test data
test_predictions_unseen =  model.transform(test_ratings_all_df)# add a statment to tranform the test data

In [75]:
assert test_predictions_unseen.select(F.col("prediction"))

In [76]:
test_predictions_unseen.select(F.col("user_id"),
                               F.col("recipe_id"),
                               F.col("rating"),
                               F.col("prediction")
                              ).show(5)

+-------+---------+------+----------+
|user_id|recipe_id|rating|prediction|
+-------+---------+------+----------+
| 199020|    55265|     5| 4.1712193|
| 369284|    76143|     5| 3.7527776|
| 224235|    89385|     5| 4.3201175|
| 385423|    95476|     4| 3.9324775|
| 538098|    95476|     5|  4.516135|
+-------+---------+------+----------+
only showing top 5 rows



In [77]:
# Each user in the test set must have 10 predictions. 
# Use ALS model to get these predictions. 
# You can use the recommendForAllUsers() method. 

recommendations = model.recommendForAllUsers(10) # complete the code 

In [78]:
assert len(dict(recommendations.select(F.col("recommendations")).collect()[0][0])) == 10

In [79]:
# Transfroming the results by exploding the recommendations column

recommendations = (recommendations.select(F.col("user_id"),
                                          F.posexplode(F.col("recommendations")).alias("pos", "item")) 
                                  .select(F.col("user_id"),
                                          F.col("pos"), 
                                          F.col("item.recipe_id").alias("recomended_recipe_id"), 
                                          F.col("item.rating").alias("predicted_rating")))

## Evaluation

### Task 04 - Model Evaluation

#### 1. RMSE

Add the argument details to the evaluator function below to calculate the RMSE score of the ALS model. 

In [86]:
# Define evaluator as RMSE and print RMSE value
evaluator_seen = RegressionEvaluator(metricName="rmse", 
                                     labelCol= "rating",  # add the name of the column that has the ratings 
                                     predictionCol= "prediction" # add the name of the column that has the predicted ratings. 
                                     ) 

In [89]:
RMSE = evaluator_seen.evaluate(test_predictions_unseen) # Use the evaluator to find the RMSE on the test set. 
print(RMSE)

1.4123465536989848


#### Rank based Metrics

We will use the lenskit library to calculate the ranking-based matrics. The lenskit library is available in pandas only so we need to convert the data frames from PySpark dataframes to Pandas dataframes. 

In [110]:
all_recs_als = recommendations.toPandas()

In [111]:
all_recs_als

Unnamed: 0,user_id,pos,recomended_recipe_id,predicted_rating
0,28170,0,108417,5.878755
1,28170,1,65833,5.864337
2,28170,2,146652,5.854054
3,28170,3,294131,5.741923
4,28170,4,280255,5.731477
...,...,...,...,...
233615,1904821,5,9410,0.000000
233616,1904821,6,9970,0.000000
233617,1904821,7,10150,0.000000
233618,1904821,8,11440,0.000000


In [112]:
# Rename the columns of to eunsre that they match the columns names as in the cell below. 

column_names = {
  "user_id": "user",
  "recomended_recipe_id": "item",
  "pos": "rank",
  "predicted_rating": "score"

}
# create a dictionary with current column names as keys and the intended column names as values. 
all_recs_als = all_recs_als.rename(columns=column_names)

In [113]:
all_recs_als

Unnamed: 0,user,rank,item,score
0,28170,0,108417,5.878755
1,28170,1,65833,5.864337
2,28170,2,146652,5.854054
3,28170,3,294131,5.741923
4,28170,4,280255,5.731477
...,...,...,...,...
233615,1904821,5,9410,0.000000
233616,1904821,6,9970,0.000000
233617,1904821,7,10150,0.000000
233618,1904821,8,11440,0.000000


In [114]:
# Adding a column to make sure the we know these recommendations are from the ALS algorithm. 

all_recs_als["algorithm"] = "ALS"

In [115]:
from pyspark.sql.functions import col


In [120]:
# Convert the test dataset to pandas and ensure that it has the same column names as shown in the cell below. 
# Also, note there are only three columns in the rest data. You have to ensure your test data looks identical. 

test_data = all_recs_als[['user', 'item', 'score']].copy()
test_data.rename(columns={'score': 'rating'}, inplace=True)
test_data['rating'] = test_data['rating'].astype(int)


In [121]:
type(all_recs_als)

pandas.core.frame.DataFrame

In [122]:
test_data

Unnamed: 0,user,item,rating
0,28170,108417,5
1,28170,65833,5
2,28170,146652,5
3,28170,294131,5
4,28170,280255,5
...,...,...,...
233615,1904821,9410,0
233616,1904821,9970,0
233617,1904821,10150,0
233618,1904821,11440,0


In [123]:
# code to calculate the necessary metrics 
# the code below uses lenskit library to evaluate the rank metrics

rla = topn.RecListAnalysis()
rla.add_metric(topn.recip_rank)
rla.add_metric(topn.ndcg)
rla.add_metric(topn.dcg)
results = rla.compute(all_recs_als, test_data)
results.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,28170,10,inf,1.133023,26.272473
ALS,56680,10,inf,1.136226,29.834079
ALS,108460,10,inf,1.133023,26.272473
ALS,139830,10,inf,1.133023,26.272473
ALS,198430,10,inf,1.128146,27.272473


## Manual Prediction Checking

1. Why are ndcg and dcg nulls? EG: user 28170

In [124]:
test_data[test_data.user == 28170]

Unnamed: 0,user,item,rating
0,28170,108417,5
1,28170,65833,5
2,28170,146652,5
3,28170,294131,5
4,28170,280255,5
5,28170,304159,5
6,28170,360099,5
7,28170,148469,5
8,28170,156550,5
9,28170,313100,5


In [125]:
all_recs_als[all_recs_als.user == 28170]

Unnamed: 0,user,rank,item,score,algorithm
0,28170,0,108417,5.878755,ALS
1,28170,1,65833,5.864337,ALS
2,28170,2,146652,5.854054,ALS
3,28170,3,294131,5.741923,ALS
4,28170,4,280255,5.731477,ALS
5,28170,5,304159,5.730266,ALS
6,28170,6,360099,5.723488,ALS
7,28170,7,148469,5.681103,ALS
8,28170,8,156550,5.680753,ALS
9,28170,9,313100,5.609076,ALS


User ```28170``` does not appear in the test set. Hence cannot be evaluated. 

2. Why are all metrics 0 for specific users?

In [126]:
test_data[test_data.user == 56680]

Unnamed: 0,user,item,rating
10,56680,335857,6
11,56680,59874,6
12,56680,280520,6
13,56680,305924,6
14,56680,31177,6
15,56680,268958,5
16,56680,233006,5
17,56680,212911,5
18,56680,269790,5
19,56680,34219,5


In [127]:
all_recs_als[all_recs_als.user == 56680]

Unnamed: 0,user,rank,item,score,algorithm
10,56680,0,335857,6.454454,ALS
11,56680,1,59874,6.34693,ALS
12,56680,2,280520,6.111232,ALS
13,56680,3,305924,6.061272,ALS
14,56680,4,31177,6.024142,ALS
15,56680,5,268958,5.899051,ALS
16,56680,6,233006,5.890579,ALS
17,56680,7,212911,5.887768,ALS
18,56680,8,269790,5.850905,ALS
19,56680,9,34219,5.847662,ALS


User-recipe combination does not appear in the recommendations set. 

3. Are any non zero metrics? 

In [128]:
results[results.ndcg > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,28170,10,inf,1.133023,26.272473
ALS,56680,10,inf,1.136226,29.834079
ALS,108460,10,inf,1.133023,26.272473
ALS,139830,10,inf,1.133023,26.272473
ALS,198430,10,inf,1.128146,27.272473
ALS,...,...,...,...,...
ALS,1416484,10,inf,1.133023,26.272473
ALS,1440492,10,inf,1.133023,26.272473
ALS,1504866,10,inf,1.133023,21.017978
ALS,1634437,10,inf,1.133023,26.272473


In [129]:
results[results.dcg > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,28170,10,inf,1.133023,26.272473
ALS,56680,10,inf,1.136226,29.834079
ALS,108460,10,inf,1.133023,26.272473
ALS,139830,10,inf,1.133023,26.272473
ALS,198430,10,inf,1.128146,27.272473
ALS,...,...,...,...,...
ALS,1416484,10,inf,1.133023,26.272473
ALS,1440492,10,inf,1.133023,26.272473
ALS,1504866,10,inf,1.133023,21.017978
ALS,1634437,10,inf,1.133023,26.272473


In [130]:
results[results.recip_rank > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,28170,10,inf,1.133023,26.272473
ALS,56680,10,inf,1.136226,29.834079
ALS,108460,10,inf,1.133023,26.272473
ALS,139830,10,inf,1.133023,26.272473
ALS,198430,10,inf,1.128146,27.272473
ALS,...,...,...,...,...
ALS,1504866,10,inf,1.133023,21.017978
ALS,1634437,10,inf,1.133023,26.272473
ALS,1748511,10,inf,,0.000000
ALS,1763282,10,inf,1.123613,28.272473


There are few user recipe combination that has a corresponding value in the test set. Hence, we have only few non-zero value of evaluation metrics.


Manually, check one prediction. 

In [132]:
# use the function diclared earlier to print the recomendations for the user 653438
all_recs_als[all_recs_als.user == 653438]

Unnamed: 0,user,rank,item,score,algorithm
8860,653438,0,342007,6.362781,ALS
8861,653438,1,400672,6.310959,ALS
8862,653438,2,428778,6.23414,ALS
8863,653438,3,294131,6.196271,ALS
8864,653438,4,379370,6.076671,ALS
8865,653438,5,146652,6.071104,ALS
8866,653438,6,260236,6.06459,ALS
8867,653438,7,224443,6.044536,ALS
8868,653438,8,37841,6.016934,ALS
8869,653438,9,288725,5.998799,ALS


Of the recipes that have been recommended, few appear similar, and few do not. 

## Saving the models and predictions

In [135]:
all_recs_als.to_csv("model_output/ALS/recommendation_als.csv", # modify the path 
                    index=False)

In [137]:
model.save('model_output/ALS/ALS_model.model') # modify the path

Py4JJavaError: ignored