In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator, RankingEvaluator
from pyspark.sql import functions as F
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import Window

In [3]:
# Session settings, applied in the order listed. Shuffle partitions are kept
# low because the whole job runs on one machine.
# NOTE(review): in local mode the executor presumably shares the driver JVM,
# so "spark.executor.memory" may have no effect here - confirm.
spark_settings = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.sql.shuffle.partitions": "8",
}

# 'local[*]' runs Spark on this machine only, with one worker thread per
# available core.
session_builder = SparkSession.builder.master('local[*]')
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse the active session if one exists, otherwise start a new one.
spark = session_builder.getOrCreate()

# Directory Spark uses for checkpoint files.
checkpoint_dir = "./als_spark_checkpoints"
spark.sparkContext.setCheckpointDir(checkpoint_dir)

In [4]:
folder_path = "./df_spark_indexed.parquet"
user_mapping_path = "./user_mapping.parquet"

# Filtering thresholds, named so they can be tuned in one place
TOP_N_GAMES = 500        # keep only the most-reviewed games
USER_REVIEW_FLOOR = 44   # keep users with strictly more reviews than this

# Build the filtered, indexed dataset only when the cached parquet is missing;
# otherwise the expensive CSV pipeline below is skipped entirely.
if not os.path.exists(folder_path):
    # Read board game geek file on spark
    schema = """
    _c0 INT,
    user STRING,
    rating FLOAT,
    comment STRING,
    id INT, 
    name STRING
    """
    # multiLine/quote/escape handle quoted comments that span several lines
    # and contain doubled quote characters
    df_spark = spark.read.csv(
        "/mnt/data/public/bgg/bgg-19m-reviews.csv",
        sep=',', header=True,
        schema=schema,
        multiLine=True,
        quote='"',
        escape='"')
    # Only user, rating and id are needed downstream
    df_spark = df_spark.drop("_c0", "comment", "name")

    # Get the most-reviewed games. The secondary sort on ID makes the cut
    # deterministic when several games tie on review count at the boundary.
    df_top_items = (
        df_spark.groupby("ID")
        .count()
        .orderBy(F.desc("count"), F.asc("ID"))
        .limit(TOP_N_GAMES)
    )

    # Filter to the top games first
    df_filter = df_spark.join(
        df_top_items.select("ID"),
        on="ID",
        how="inner"
    )

    # Get users who reviewed more than USER_REVIEW_FLOOR games
    # (counted within the top games only)
    df_active_users = (
        df_filter.groupby("user")
        .count()
        .filter(F.col("count") > USER_REVIEW_FLOOR)
    )

    # Filter to only active users
    df_filter2 = df_filter.join(
        df_active_users.select("user"),
        on="user",
        how="inner"
    )

    # Map user name to a numeric index
    user_indexer = StringIndexer(inputCol="user", outputCol="user_id")
    df_spark_indexed = user_indexer.fit(df_filter2).transform(df_filter2)

    # Persist the user -> user_id mapping so names can still be recovered
    # after the "user" column is dropped. It is written before the main
    # dataset so that a failure here leaves no cache folder behind and the
    # next run rebuilds both outputs.
    user_mapping = df_spark_indexed.select("user", "user_id").distinct()
    user_mapping.write.mode("overwrite").parquet(user_mapping_path)
    df_spark_indexed = df_spark_indexed.drop("user")

    # Rename the item column for uniformity with user_id
    df_spark_indexed = df_spark_indexed.withColumnRenamed("id", "item_id")

    # Write to the same path that the existence check and the read use
    (df_spark_indexed
     .write
     .parquet(folder_path))

# Always continue from the cached parquet, whether or not it was just built
df_spark_indexed = spark.read.parquet(folder_path)
print(f"Number of reviews {df_spark_indexed.count()}")

Number of reviews 6166422


In [None]:
# Hold out 20% of the ratings for the final evaluation
train, test = df_spark_indexed.randomSplit([0.8, 0.2], seed=42)

# RMSE between the observed rating and the model's prediction
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

# Create ALS instance. coldStartStrategy='drop' is set so that rows without
# a usable prediction are dropped before the evaluator sees them.
als = ALS(
    userCol='user_id',
    itemCol='item_id',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42
)

# Build parameter grid with BOTH regParam and rank
param_grid = ParamGridBuilder()\
    .addGrid(als.regParam, [.01, .005, .001])\
    .addGrid(als.rank, [2, 4, 6])\
    .build()

# Cross validation
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4,
    seed=42
)

# Fit and evaluate
print("Training with grid search...")
tuned_model = cv.fit(train)
predictions = tuned_model.transform(test)

# Evaluate RMSE on the held-out split
rmse = evaluator.evaluate(predictions)

# Get best parameters. avgMetrics holds one cross-validated RMSE per entry of
# param_grid, in the same order, and lower RMSE is better, so the winning
# combination is the one at the argmin. This avoids reaching into the private
# `_java_obj` handle of the fitted model.
best_params = param_grid[int(np.argmin(tuned_model.avgMetrics))]
best_rank = tuned_model.bestModel.rank
best_reg = best_params[als.regParam]

print(f"Best regParam: {best_reg}")
print(f"Best rank: {best_rank}")
print(f"Test RMSE: {rmse:.4f}")

Training with grid search...


In [None]:
# Keep a handle on the winning ALS model and persist it to disk.
best_model = tuned_model.bestModel
# A plain save() raises when "als_model" already exists, which breaks every
# re-run of the notebook; overwrite() replaces the previous copy instead.
best_model.write().overwrite().save("als_model")

In [None]:
# Ranking cut-off shared by the recommendations and the ground-truth lists
k = 3

# Get predicted item ids per user: top-k recommendations, reduced to an array
# of item ids and cast to double, the element type RankingEvaluator expects.
# NOTE(review): recommendForAllUsers appears to rank over every item,
# including items the user already rated in `train`; such items cannot appear
# in `actual` (built from `test`) and would depress NDCG - confirm, and
# consider filtering out training items before evaluating.
pred = (best_model.recommendForAllUsers(k)
    .select(
    F.col("user_id"),
    F.col("recommendations.item_id").alias("pred_array")
    ).withColumn("pred_array",
                 F.col("pred_array").cast("array<double>")
                )
       )

# Rank each user's test items by rating, highest first. Ratings tie often,
# and row_number() picks among tied rows arbitrarily, so item_id is added as
# a secondary key to make the selected top-k reproducible between runs.
window = Window.partitionBy("user_id").orderBy(F.desc("rating"), F.asc("item_id"))

# Ground truth: the k highest-rated test items per user, as a double array
actual = (test.withColumn("rank", F.row_number().over(window))
          .filter(F.col("rank") <= k)
          .groupBy("user_id")
          .agg(F.collect_list("item_id").alias("actual_array"))
          .withColumn("actual_array",
                 F.col("actual_array").cast("array<double>")
                )
         )

# Inner join: only users present on both sides are evaluated
df_ndcg = pred.join(actual, on = 'user_id')


In [None]:
# Preview three users' predicted vs. actual item arrays without truncating
# the array columns.
ndcg_preview = df_ndcg.limit(3)
ndcg_preview.show(truncate=False)

In [None]:
# Get NDCG@k. The cut-off reuses `k` from the cell that built df_ndcg, so the
# metric always matches the length of the recommendation lists instead of
# silently diverging if `k` is changed there.
evaluator = RankingEvaluator(
    labelCol='actual_array',
    predictionCol='pred_array',
    metricName='ndcgAtK',
    k=k
)

ndcg_k = evaluator.evaluate(df_ndcg)
print(f"NDCG at k={k}: {ndcg_k}")