In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F

In [None]:
folder_path = "./als_spark_checkpoints"

# Check if folder exists
if not os.path.exists(folder_path):
    os.makedirs(folder_path)   # Create folder
    print(f"Folder created: {folder_path}")
else:
    print(f"Folder already exists: {folder_path}")


In [2]:
spark = (SparkSession
     .builder
     .master('local[*]') # tells you master is 1 laptop using all 4 executors
     .config("spark.driver.memory", "8g")
     .config("spark.executor.memory", "8g")
     .config("spark.sql.shuffle.partitions", "8")  # reduce for local
     .getOrCreate()) # make new or get latest session

spark.sparkContext.setCheckpointDir("./als_spark_checkpoints")

In [3]:
# Read board game geek file on spark
schema = """
_c0 INT,
user STRING,
rating FLOAT,
comment STRING,
id INT, 
name STRING
"""
# Fix quote handling for comments column 
df_spark = spark.read.csv(
    "/mnt/data/public/bgg/bgg-19m-reviews.csv",
    sep=',', header=True,
    schema=schema,
    multiLine=True,
    quote='"',
    escape='"')
df_spark = df_spark.drop("_c0", "comment", "name")

In [4]:
# Map user name to integer
user_indexer = StringIndexer(inputCol="user", outputCol="user_id")
df_spark_indexed = user_indexer.fit(df_spark).transform(df_spark)

# Save Spark DF mapping of user to User ID
user_mapping = df_indexed.select("user", "userId").distinct()
df_spark_indexed = df_spark_indexed.drop("user")

# Change item column name for unformity
df_spark_indexed = df_spark_indexed.withColumnRenamed("id", "item_id")

In [5]:
df_spark_indexed.limit(10).show()

+------+-------+--------+
|rating|item_id| user_id|
+------+-------+--------+
|  10.0|  30549|   201.0|
|  10.0|  30549|  6591.0|
|  10.0|  30549|   631.0|
|  10.0|  30549|  1705.0|
|  10.0|  30549|  5796.0|
|  10.0|  30549|    78.0|
|  10.0|  30549|393225.0|
|  10.0|  30549|233206.0|
|  10.0|  30549| 22517.0|
|  10.0|  30549| 87298.0|
+------+-------+--------+



In [6]:
# Count nulls per column
null_counts = df_spark_indexed.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df_spark_indexed.columns])
null_counts.show()

+------+-------+-------+
|rating|item_id|user_id|
+------+-------+-------+
|     0|      0|      0|
+------+-------+-------+



In [7]:
print(spark.sparkContext.getCheckpointDir())  # checkpoint directory

file:/home2/bsdsba2027/rvelasco/BDCC Labs/BDCC_Lab1/als_spark_checkpoints/cd532ad3-973f-4be8-acc1-64f92a7834cb


In [8]:
# Train an ALS model
train, test = df_spark_indexed.randomSplit([0.8, 0.2])
als = ALS(rank=2, maxIter=5, 
          userCol="user_id", itemCol='item_id', 
          ratingCol="rating", coldStartStrategy='drop',
          checkpointInterval=10  # Saves to disk after n iterations
         )
als_model = als.fit(train)

In [10]:
# Evaluate the model by computing the RMSE on the test data
predictions = als_model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error:" + str(rmse))

Root-mean-square error:1.2419596757757039


In [11]:
# Generate top 10 movie recommendations for each user
userRecs = als_model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = als_model.recommendForAllItems(10)

# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = als_model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = als_model.recommendForItemSubset(movies, 10)

NameError: name 'ratings' is not defined

In [14]:
userRecs.limit(3).toPandas()

Unnamed: 0,user_id,recommendations
0,12,"[(149705, 8.133953094482422), (254632, 8.13036..."
1,13,"[(345976, 7.779404163360596), (63170, 7.649724..."
2,14,"[(345976, 8.195059776306152), (277538, 8.04972..."


- https://medium.com/@sinha.raunak/recommendation-systems-pyspark-als-model-evaluation-rmse-map-k-recall-k-ndcg-k-477bf6df893e

- https://github.com/CGrannan/building-boardgame-recommendation-systems/blob/master/spark_als_recommendation.ipynb (but no ndcg@k)

fix the code below tomorrow

In [None]:
# to get rated movie array

from pyspark.sql.functions import collect_list, col, row_number, when
from pyspark.sql.window import Window
from pyspark.ml.evaluation import RankingEvaluator

# extract best performing model 
best_model = cv_model.bestModel

# cross join users & movies to create a list 
# of all users and movie combinations
all_users = dfs_train.select('user').distinct()
all_movies = dfs_train.select('movie').distinct()
users_x_movies = all_users.crossJoin(all_movies)

# get predictions
dfs_preds = best_model.transform(users_x_movies)

# join preds and train dataset to get all preds & ratings
# the unrated user x movie pairs will be NULL 
dfs_preds_and_ratings = dfs_preds.alias('preds').join(
    dfs_train.alias('train'),
    (dfs_preds['user']==dfs_train['user']) & 
    (dfs_preds['movie']==dfs_train['movie']),
    how='outer')

# filter out the "seen" movies from prediction
# get preds for unrated user x movie pairs 
# using rating col which will contain NULLs
dfs_preds_final = dfs_preds_and_ratings.filter(
    col('train.rating').isNull()
).select('preds.user', 'preds.movie', 'preds.prediction')

# threshold for filtering predicted ratings & actual ratings
thresh = 4.0 

# filter predictions using threshold
# rank order the predictions by predicted ratings
dfs_preds_thresh_ranked = dfs_preds_final.filter(
    col('prediction') >= thresh
    ).orderBy('user', col('prediction').desc())
dfs_preds_thresh_ranked_grouped = dfs_preds_thresh_ranked.groupBy('user').agg(
    collect_list(col('movie').cast('double')).alias('predicted_movie_arr')
    )

# filter test dataset using threshold
# rank order the test dataset by predicted ratings
dfs_test_thresh_ranked = dfs_test.filter(
    col('rating') >= thresh
    ).orderBy('user', col('rating').desc())
dfs_test_thresh_ranked_grouped = dfs_test_thresh_ranked.groupBy('user').agg(
    collect_list(col('movie').cast('double')).alias('rated_movie_arr')
)

# inner join ranked test dataset with predictions 
# for every user to get two columns per user 
dfs_preds_thresh_for_eval = dfs_test_thresh_ranked_grouped.join(dfs_preds_thresh_ranked_grouped, on='user', how='inner')

In [None]:
evaluator = RankingEvaluator(
    labelCol='rated_movie_arr', 
    predictionCol='predicted_movie_arr', 
    metricName='ndcgAtK', 
    k=10
)
ndcg_k = evaluator.evaluate(dfs_preds_thresh_for_eval)
print(f"NDCG at k={k} : {ndcg_k}")