## Setup

In [None]:
import os
# Mount Google Drive; Colab asks for the required permission when this runs.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the assignment folder on the mounted drive so that the relative
# data paths used below resolve.
PROJECT_DIR = "/content/drive/MyDrive/food_recommender_assignment/"
os.chdir(PROJECT_DIR)
os.getcwd()

'/content/drive/MyDrive/food_recommender_assignment'

In [None]:
try:
  import pyspark 
except:
  !pip install pyspark==3.1.2
  import pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.1.2
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 62 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 20.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880769 sha256=8d6f679eebb81b3a3c2daeb009ec6c3e28af52d2fd374959c56136b0a46ba648
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [None]:
try:
  import lenskit 
except:
  %pip install lenskit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lenskit
  Downloading lenskit-0.14.2-py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 1.8 MB/s 
[?25hCollecting csr>=0.3.1
  Downloading csr-0.4.3-py3-none-any.whl (23 kB)
Collecting binpickle>=0.3.2
  Downloading binpickle-0.3.4-py3-none-any.whl (13 kB)
Collecting seedbank>=0.1.0
  Downloading seedbank-0.1.2-py3-none-any.whl (7.9 kB)
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 10.5 MB/s 
Collecting anyconfig
  Downloading anyconfig-0.13.0-py2.py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 6.8 MB/s 
Installing collected packages: pickle5, anyconfig, seedbank, csr, binpickle, lenskit
Successfully installed anyconfig-0.13.0 binpickle-0.3.4 csr-0.4.3 lenskit-0.14.2 pickle5-0.0.12 seedbank-0.1.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

In [None]:
# Build (or reuse) a Spark session with a local master; the Spark UI port is set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

In [None]:
# Show the SparkSession object as the cell output to confirm it was created.
spark

## Imports

In [None]:
# import necessary libraries 
import pandas as pd
import numpy as np

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType,StringType, ArrayType

In [None]:
# Import the required functions for ALS and estimating
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [None]:
from lenskit import batch, topn, util
from lenskit.algorithms import Recommender, als, item_knn as knn

## Read the data

To use the data files in Google Colab, download them and upload them to the home folder of your Google Drive.

In [None]:
# Training interactions. Parquet files carry their own schema, so the CSV-style
# `header` / `inferSchema` options are not needed here.
train_ratings_df = spark.read.parquet('Dataset/train_interaction_level_df.parquet')

In [None]:
# Test interactions. Parquet files carry their own schema, so the CSV-style
# `header` / `inferSchema` options are not needed here.
test_ratings_all_df = spark.read.parquet('Dataset/test_interaction_level_df.parquet')

In [None]:
# Recipe metadata: the first CSV line is the header and column types are inferred.
raw_recipes_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("RAW_recipes_cleaned.csv")
)

In [None]:
# Number of distinct users in the training set.
train_ratings_df.agg(F.countDistinct("user_id")).first()[0]

23362

In [None]:
# Number of rating rows in the training set.
train_ratings_df.count()

74673

In [None]:
# Sparsity of the user x recipe rating matrix:
# 100 * (1 - observed ratings / (distinct users * distinct recipes)).

# Observed ratings.
numerator = train_ratings_df.select("rating").count()

# Distinct users and recipes, obtained in a single aggregation.
num_users, num_recipes = train_ratings_df.agg(
    F.countDistinct("user_id"),
    F.countDistinct("recipe_id"),
).first()

# Every possible (user, recipe) pair.
denominator = num_users * num_recipes

sparsity = (1.0 - numerator / denominator) * 100
print("The training dataframe is ", f"{sparsity:.7f}% empty.")

The training dataframe is  99.9829264% empty.


## Functions

In [None]:
def manual_recommendation_check (user_id):
  '''
  Display the names of the recipes recommended to one user.

  Reads two notebook-level objects, so call this function only after both
  exist:
    * all_recs_als   - pandas data frame of recommendations with the
                       columns "user" and "item".
    * raw_recipes_df - Spark data frame of recipes with the columns "id"
                       and "name".

  Input: user_id of a user from the recommendations, as an integer.
  Displays the names of the recipes recommended to this user as a list of
  Rows (an empty list if the user has no recommendations).
  Returns nothing.
  '''
  # Recipe ids recommended to this user. .tolist() yields plain Python
  # scalars (the original converted each NumPy value with .item() for the
  # same purpose) before they are handed to Spark's isin().
  is_user = all_recs_als.user == user_id
  recommended_ids = all_recs_als.loc[is_user, "item"].tolist()

  # Look the ids up in the recipe table and show only the names.
  recipe_names = (raw_recipes_df.filter(F.col('id').isin(recommended_ids))
                                .select("name")
                                .collect())
  display(recipe_names)

## Model

#### Task 02 - Collaborative Filtering Model

Add the argument details in the algorithm initialization below to build the ALS model.

In [None]:
# Create ALS model
# NOTE: the name `als` rebinds the `als` module imported from lenskit.algorithms
# in the imports section; from here on `als` refers to this Spark estimator.
als = ALS(userCol='user_id',        # name of the column for users
          itemCol='recipe_id',      # name of the column for recipes
          ratingCol='rating',       # name of the column for ratings
          nonnegative=True,         # constrain the learned factors to be non-negative
          implicitPrefs=False,      # ratings are explicit feedback
          coldStartStrategy="drop", # drop rows for users/recipes unseen in training when predicting
          seed=42                   # fixed seed so that repeated runs give the same model
         )

In [None]:
# Sanity check: `als` must be the Spark ALS estimator, not the lenskit module of the same name.
assert isinstance(als, pyspark.ml.recommendation.ALS), "als is not a pyspark.ml.recommendation.ALS estimator"

#### Training

In [None]:
# Fit the ALS estimator on the training ratings; `model` is used below for
# predictions, recommendations and saving.
model = als.fit(train_ratings_df)

## Prediction 

In [None]:
# Number of rows in the test set.
test_ratings_all_df.count()

18684

In [None]:
# Use the model to create predictions for the test data.
# The model was built with coldStartStrategy="drop", so the result can have
# fewer rows than the test set.
test_predictions_unseen = model.transform(test_ratings_all_df) 

In [None]:
# Number of test rows that received a prediction.
test_predictions_unseen.count()

5854

In [None]:
# Peek at the true rating next to the predicted rating.
(test_predictions_unseen
 .select("user_id", "recipe_id", "rating", "prediction")
 .show(5))

+-------+---------+------+----------+
|user_id|recipe_id|rating|prediction|
+-------+---------+------+----------+
| 199020|    55265|     5| 4.5994487|
| 369284|    76143|     5| 3.7438645|
| 224235|    89385|     5| 4.2358184|
| 385423|    95476|     4|  3.671374|
| 538098|    95476|     5|  4.772101|
+-------+---------+------+----------+
only showing top 5 rows



In [None]:
# Each user in the test set must have 10 predictions. 
# Use ALS model to get these predictions. 
# You can use the recommendForAllUsers() method. 
# NOTE(review): recommendForAllUsers produces recommendations for the users
# known to the model, which need not be limited to users in the test set.

recommendations = model.recommendForAllUsers(10) 

In [None]:
# Transforming the results: explode the `recommendations` array into one row
# per (user, recommended recipe). `pos` is the 0-based position in the list.
# NOTE: this rebinds `recommendations`; re-running this cell without first
# re-running the previous one fails because the array column is gone.
# NOTE(review): confirm whether the downstream ranking metrics expect 1-based ranks.

recommendations = (
    recommendations
    .select("user_id",
            F.posexplode("recommendations").alias("pos", "item"))
    .select(
        "user_id",
        "pos",
        F.col("item")["recipe_id"].alias("recomended_recipe_id"),
        F.col("item")["rating"].alias("predicted_rating"),
    )
)

## Evaluation

### Task 04 - Model Evaluation

#### 1. RMSE

Add the argument details to the evaluator function below to calculate the RMSE score of the ALS model. 

In [None]:
# RMSE evaluator: compares the `rating` column (true ratings) with the
# `prediction` column (predicted ratings).
evaluator_seen = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

In [None]:
# RMSE of the predicted ratings, computed on the test-set predictions.
RMSE = evaluator_seen.evaluate(test_predictions_unseen ) #  evaluator to find the RMSE on the test set. 
print(RMSE)

1.4093766071748897


In [None]:
# evaluate() needs the data frame to score; without it the call raises a
# TypeError. This repeats the RMSE computation on the test-set predictions.
RMSE = evaluator_seen.evaluate(test_predictions_unseen)
print(RMSE)

1.4196906625188663


#### Rank based Metrics

We will use the lenskit library to calculate the ranking-based metrics. The lenskit library works with pandas data frames only, so we need to convert the PySpark data frames to pandas data frames.

In [None]:
# Convert the Spark recommendations into a pandas data frame for the lenskit evaluation.
all_recs_als = recommendations.toPandas()

In [None]:
# Inspect the recommendations with their original Spark column names.
all_recs_als

Unnamed: 0,user_id,pos,recomended_recipe_id,predicted_rating
0,28170,0,166172,5.912496
1,28170,1,128716,5.843215
2,28170,2,280255,5.796579
3,28170,3,243882,5.796256
4,28170,4,225101,5.791911
...,...,...,...,...
233615,1904821,5,9410,0.000000
233616,1904821,6,9970,0.000000
233617,1904821,7,10150,0.000000
233618,1904821,8,11440,0.000000


In [None]:

# Current column names (keys) mapped to the intended column names (values).
column_names = dict(
    user_id='user',
    pos='rank',
    recomended_recipe_id='item',
    predicted_rating='score',
)
all_recs_als = all_recs_als.rename(columns=column_names)

In [None]:
# Inspect the recommendations after renaming the columns.
all_recs_als

Unnamed: 0,user,rank,item,score
0,28170,0,166172,5.912496
1,28170,1,128716,5.843215
2,28170,2,280255,5.796579
3,28170,3,243882,5.796256
4,28170,4,225101,5.791911
...,...,...,...,...
233615,1904821,5,9410,0.000000
233616,1904821,6,9970,0.000000
233617,1904821,7,10150,0.000000
233618,1904821,8,11440,0.000000


In [None]:
# Add a column so that we know these recommendations come from the ALS algorithm.

all_recs_als["algorithm"] = "ALS"

In [None]:
# Preview the test ratings under the user / item column names used for evaluation.
(test_ratings_all_df
 .select(F.col('user_id').alias('user'),
         F.col('recipe_id').alias('item'),
         'rating')
 .show(2))

+----------+-----+------+
|      user| item|rating|
+----------+-----+------+
|1802380878|35912|     0|
|2001602879|40335|     5|
+----------+-----+------+
only showing top 2 rows



In [None]:
# Ground-truth ratings as a pandas data frame with user / item / rating columns.
test_data = test_ratings_all_df.select(
    F.col('user_id').alias('user'),
    F.col('recipe_id').alias('item'),
    F.col('rating'),
).toPandas()

In [None]:
# Inspect the ground-truth ratings used for the ranking evaluation.
test_data

Unnamed: 0,user,item,rating
0,1802380878,35912,0
1,2001602879,40335,5
2,2758877,50348,0
3,199020,55265,5
4,369284,76143,5
...,...,...,...
18679,855082,438292,5
18680,1553277,447699,0
18681,2000072578,447699,5
18682,2775141,469503,5


In [None]:
# Rank-based evaluation with lenskit: register reciprocal rank, nDCG and DCG,
# then score the ALS recommendation lists against the test ratings.

rla = topn.RecListAnalysis()
for metric in (topn.recip_rank, topn.ndcg, topn.dcg):
    rla.add_metric(metric)

results = rla.compute(all_recs_als, test_data)
results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,28170,10,0.0,,
ALS,56680,10,0.0,0.0,0.0
ALS,108460,10,0.0,0.0,0.0
ALS,139830,10,0.0,,
ALS,198430,10,0.0,,


## Manual Prediction Checking

1. Why are ndcg and dcg null? E.g., user 28170

In [None]:
# Test-set rows for user 28170.
test_data.loc[test_data["user"] == 28170]

Unnamed: 0,user,item,rating


In [None]:
# Recommendations produced for user 28170.
all_recs_als.loc[all_recs_als["user"] == 28170]

Unnamed: 0,user,rank,item,score,algorithm
0,28170,0,166172,5.912496,ALS
1,28170,1,128716,5.843215,ALS
2,28170,2,280255,5.796579,ALS
3,28170,3,243882,5.796256,ALS
4,28170,4,225101,5.791911,ALS
5,28170,5,16768,5.784569,ALS
6,28170,6,325042,5.72964,ALS
7,28170,7,379378,5.725366,ALS
8,28170,8,89272,5.679644,ALS
9,28170,9,209280,5.678403,ALS


User ```28170``` does not appear in the test set and hence cannot be evaluated.

2. Why are all metrics 0 for specific users?

In [None]:
# Test-set rows for user 56680.
test_data.loc[test_data["user"] == 56680]

Unnamed: 0,user,item,rating
446,56680,229831,5


In [None]:
# Recommendations produced for user 56680.
all_recs_als.loc[all_recs_als["user"] == 56680]

Unnamed: 0,user,rank,item,score,algorithm
10,56680,0,156550,6.459229,ALS
11,56680,1,201382,6.430128,ALS
12,56680,2,146185,6.143998,ALS
13,56680,3,119604,6.13605,ALS
14,56680,4,329345,6.135118,ALS
15,56680,5,319562,6.132361,ALS
16,56680,6,98791,6.114608,ALS
17,56680,7,16768,6.061062,ALS
18,56680,8,382175,6.043922,ALS
19,56680,9,283173,6.042053,ALS


The user-recipe combination from the test set does not appear in the recommendations for this user, so every metric is 0.

3. Are there any non-zero metrics?

In [None]:
# Users with a non-zero nDCG.
results.loc[results["ndcg"] > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,1133190,10,0.2,0.123919,1.934264
ALS,865936,10,0.111111,0.02984,1.20412


In [None]:
# Users with a non-zero DCG.
results.loc[results["dcg"] > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,1133190,10,0.2,0.123919,1.934264
ALS,865936,10,0.111111,0.02984,1.20412


In [None]:
# Users with a non-zero reciprocal rank.
results.loc[results["recip_rank"] > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,recip_rank,ndcg,dcg
algorithm,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALS,1133190,10,0.2,0.123919,1.934264
ALS,865936,10,0.111111,0.02984,1.20412


Only a few recommended user-recipe combinations have a corresponding entry in the test set. Hence, only a few users have non-zero evaluation metrics.


Manually check one prediction.

In [None]:
# Display the names of the recipes recommended to one example user.
manual_recommendation_check(653438)

[Row(name='big soft sour cream sugar cookies'),
 Row(name='briana s belt buster pot roast'),
 Row(name='carrot cake with lemon icing'),
 Row(name='chicken in a potato nest'),
 Row(name='chicken wings or ribs habanero hot sauce'),
 Row(name='fiesta mexican lasagna'),
 Row(name='linda s ladies town hall cheesecake'),
 Row(name='old fashioned sour cream sugar cookies'),
 Row(name='smoky spinach macaroni and cheese'),
 Row(name='spiced corn with coconut')]

Of the recipes that have been recommended, a few appear similar to each other, while the others do not.

## Saving the models and predictions

In [None]:
# Persist the ALS recommendations without the pandas index column.
all_recs_als.to_csv("recommendation_als.csv", index=False)

In [None]:
# Persist the fitted ALS model. overwrite() lets the notebook be re-run:
# a plain save() fails when the target path already exists.
model.write().overwrite().save('ALS_model.model')