In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F

In [3]:
folder_path = "./als_spark_checkpoints"

# Check if folder exists
if not os.path.exists(folder_path):
    os.makedirs(folder_path)   # Create folder
    print(f"Folder created: {folder_path}")
else:
    print(f"Folder already exists: {folder_path}")


Folder already exists: ./als_spark_checkpoints


In [4]:
spark = (SparkSession
     .builder
     .master('local[*]') # tells you master is 1 laptop using all 4 executors
     .config("spark.driver.memory", "8g")
     .config("spark.executor.memory", "8g")
     .config("spark.sql.shuffle.partitions", "8")  # reduce for local
     .getOrCreate()) # make new or get latest session

spark.sparkContext.setCheckpointDir("./als_spark_checkpoints")

In [5]:
# Read board game geek file on spark
schema = """
_c0 INT,
user STRING,
rating FLOAT,
comment STRING,
id INT, 
name STRING
"""
# Fix quote handling for comments column 
df_spark = spark.read.csv(
    "/mnt/data/public/bgg/bgg-19m-reviews.csv",
    sep=',', header=True,
    schema=schema,
    multiLine=True,
    quote='"',
    escape='"')
df_spark = df_spark.drop("_c0", "comment", "name")

In [6]:
# Map user name to integer
user_indexer = StringIndexer(inputCol="user", outputCol="user_id")
df_spark_indexed = user_indexer.fit(df_spark).transform(df_spark)

# Save Spark DF mapping of user to User ID
user_mapping = df_spark_indexed.select("user", "user_id").distinct()
df_spark_indexed = df_spark_indexed.drop("user")

# Change item column name for unformity
df_spark_indexed = df_spark_indexed.withColumnRenamed("id", "item_id")

In [None]:
868.3887082741884

In [7]:
df_spark_indexed.limit(10).show()

+------+-------+--------+
|rating|item_id| user_id|
+------+-------+--------+
|  10.0|  30549|   201.0|
|  10.0|  30549|  6591.0|
|  10.0|  30549|   631.0|
|  10.0|  30549|  1705.0|
|  10.0|  30549|  5796.0|
|  10.0|  30549|    78.0|
|  10.0|  30549|393225.0|
|  10.0|  30549|233206.0|
|  10.0|  30549| 22517.0|
|  10.0|  30549| 87298.0|
+------+-------+--------+



In [8]:
# Count nulls per column
null_counts = df_spark_indexed.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df_spark_indexed.columns])
null_counts.show()

+------+-------+-------+
|rating|item_id|user_id|
+------+-------+-------+
|     0|      0|      0|
+------+-------+-------+



In [9]:
print(spark.sparkContext.getCheckpointDir())  # checkpoint directory

file:/home2/bsdsba2027/rvelasco/BDCC Labs/BDCC_Lab1/als_spark_checkpoints/4a848607-cc8a-4c22-a48d-f022637b1063


In [10]:
# Train an ALS model
train, test = df_spark_indexed.randomSplit([0.8, 0.2])
als = ALS(rank=2, maxIter=5, 
          userCol="user_id", itemCol='item_id', 
          ratingCol="rating", coldStartStrategy='drop',
          checkpointInterval=10  # Saves to disk after n iterations
         )
als_model = als.fit(train)

In [11]:
# Evaluate the model by computing the RMSE on the test data
predictions = als_model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error:" + str(rmse))

Root-mean-square error:1.2393715582711229


In [12]:
from pyspark.sql.functions import collect_list, col, size, slice
from pyspark.ml.evaluation import RankingEvaluator

print("Building evaluation dataset...")

# Get top-10 recommendations
k = 10
userRecs = als_model.recommendForAllUsers(k)

dfs_preds_grouped = userRecs.select(
    col('user_id'),
    col('recommendations.item_id').alias('predicted_item_id_arr')
).withColumn(
    'predicted_item_id_arr',
    col('predicted_item_id_arr').cast('array<double>')
)

# Get actual highly-rated items from test
thresh = 4.0
test_thresh_grouped = test.filter(
    col('rating') >= thresh
).groupBy('user_id').agg(
    collect_list(col('item_id').cast('double')).alias('rated_item_id_arr')
)

# Join predictions with actuals
dfs_preds_thresh_for_eval = test_thresh_grouped.join(
    dfs_preds_grouped,
    on='user_id',
    how='inner'
)

# CRITICAL: Limit array sizes to reduce computation
dfs_preds_limited = dfs_preds_thresh_for_eval.withColumn(
    'rated_item_id_arr',
    slice('rated_item_id_arr', 1, 20)  # Max 20 items
).withColumn(
    'predicted_item_id_arr',
    slice('predicted_item_id_arr', 1, 10)  # Max 10 predictions
)

print("✓ Evaluation dataset ready")

# Sample VERY small first - only 0.5%
print("Sampling 0.5% of users...")
dfs_preds_sampled = dfs_preds_limited.sample(fraction=0.005, seed=42)
dfs_preds_sampled.cache()

sample_count = dfs_preds_sampled.count()
print(f"Sample size: {sample_count:,} users")

if sample_count > 0:
    # Evaluate
    evaluator = RankingEvaluator(
        labelCol='rated_item_id_arr',
        predictionCol='predicted_item_id_arr',
        metricName='ndcgAtK',
        k=3
    )
    
    print("Evaluating NDCG...")
    ndcg_k = evaluator.evaluate(dfs_preds_sampled)
    print(f"\n{'='*50}")
    print(f"NDCG at k=3: {ndcg_k:.6f}")
    print(f"{'='*50}")
    
    # Quick diagnostics
    print("\nArray size check:")
    dfs_preds_sampled.select(
        size('rated_item_id_arr').alias('actual_size'),
        size('predicted_item_id_arr').alias('pred_size')
    ).describe().show()
else:
    print("ERROR: Sample is empty!")

Building evaluation dataset...
✓ Evaluation dataset ready
Sampling 0.5% of users...
Sample size: 1,363 users
Evaluating NDCG...

NDCG at k=3: 0.000434

Array size check:
+-------+------------------+---------+
|summary|       actual_size|pred_size|
+-------+------------------+---------+
|  count|              1363|     1363|
|   mean| 8.268525311812178|     10.0|
| stddev|7.1065790151099115|      0.0|
|    min|                 1|       10|
|    max|                20|       10|
+-------+------------------+---------+

