In [12]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import explode

In [13]:
'''
ALS Model Implementation and Evalution

- Use collaborative filtering for predicting user preferences.
- Leverage distributed data processing with Spark for scalability.
- Evaluate performance using RMSE and other accuracy metrics.
'''

'''
Step 1: Initialize Spark Session and Load Data
We create a Spark session which enables distributed data processing.
We then load the ratings and movies datasets, inferring schema from the CSVs.
'''
# Create Spark session
spark = SparkSession.builder.appName("RecommenderSystem").getOrCreate()

# Load ratings
ratings = spark.read.csv("data/ratings.csv", header=True, inferSchema=True)
ratings.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [14]:
movies = spark.read.csv("data/movies.csv", header=True, inferSchema=True)
movies.show(5)


+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [15]:
# Drop any missing ratings (just in case)
ratings = ratings.dropna()

# Split into training and test sets
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)


In [16]:
'''
Define ALS Model
'''

# ALS model setup
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,  
    coldStartStrategy="drop",
    rank=10,
    maxIter=10,
    regParam=0.1
)

# Train the model
model = als.fit(training)


In [17]:
'''
Generate Predictions and Evaluate the Model
Transform the test data to generate predictions.
Evaluate the predictions using RMSE and MAE.
'''

# Predict on test data
predictions = model.transform(test)

# Evaluate using RMSE
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse:.4f}")
mae = evaluator.evaluate(predictions)
print(f"Mean Absolute Error = {mae:.4f}")


Root-mean-square error = 0.8775
Mean Absolute Error = 0.8775


In [18]:
'''
Step 5: Generate Recommendations
Recommend top 5 movies for each user and top 5 users for each movie.
'''

# Recommend top 5 movies for each user
user_recs = model.recommendForAllUsers(5)
user_recs.show(5, truncate=False)

# Recommend top 5 users for each movie
movie_recs = model.recommendForAllItems(5)
movie_recs.show(5, truncate=False)


+------+----------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                     |
+------+----------------------------------------------------------------------------------------------------+
|1     |[{25771, 6.0337763}, {96004, 5.8452325}, {3379, 5.8452325}, {58301, 5.7855854}, {177593, 5.7263403}]|
|2     |[{112804, 4.9427156}, {3925, 4.924061}, {131724, 4.9036756}, {84847, 4.807758}, {136469, 4.774132}] |
|3     |[{6835, 4.834569}, {5746, 4.834569}, {5181, 4.804334}, {4518, 4.739325}, {461, 4.6875343}]          |
|4     |[{49932, 5.514815}, {25825, 5.233558}, {95182, 5.010171}, {3851, 4.924841}, {527, 4.8506794}]       |
|5     |[{4642, 4.99642}, {26326, 4.942303}, {25771, 4.895912}, {96004, 4.811766}, {3379, 4.811766}]        |
+------+----------------------------------------------------------------------------------------------------+
only showi

In [19]:
# Showing top-N recommendation example for single user
user_id = 123 
user_recs.filter(f"userId == {user_id}").show(truncate=False)


+------+-------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                  |
+------+-------------------------------------------------------------------------------------------------+
|123   |[{33649, 4.954631}, {171495, 4.7426}, {184245, 4.691401}, {134796, 4.691401}, {117531, 4.691401}]|
+------+-------------------------------------------------------------------------------------------------+



In [22]:
flat_recs = user_recs.withColumn("rec", explode("recommendations")) \
    .select("userId", "rec.movieId", "rec.rating")

flat_recs = flat_recs.join(movies, on="movieId", how="left") \
    .select("userId", "movieId", "title", "rating")

flat_recs.write.mode("overwrite").parquet("data/recommendations.parquet")
