In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import explode

In [10]:
'''
ALS Model Implementation and Evalution

- Use collaborative filtering for predicting user preferences.
- Leverage distributed data processing with Spark for scalability.
- Evaluate performance using RMSE and other accuracy metrics.
'''

'''
Initialize Spark Session and Load Data
We create a Spark session which enables distributed data processing.
We then load the ratings and movies datasets, inferring schema from the CSVs.
'''
spark = SparkSession.builder.appName("RecommenderSystem").getOrCreate()

# Load ratings
ratings = spark.read.csv("data/ratings.csv", header=True, inferSchema=True)
ratings.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [3]:
movies = spark.read.csv("data/movies.csv", header=True, inferSchema=True)
movies.show(5)


+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [4]:
# Drop any missing ratings (just in case)
ratings = ratings.dropna()

# Split into training and test sets
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)


In [5]:
'''
Define ALS Model
'''

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,  
    coldStartStrategy="drop",
    rank=10,
    maxIter=10,
    regParam=0.1
)

# Train the model
model = als.fit(training)


In [6]:
'''
Generate Predictions and Evaluate the Model
Transform the test data to generate predictions.
Evaluate the predictions using RMSE and MAE.
'''

# Predict on test data
predictions = model.transform(test)

# Evaluate using RMSE
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse:.4f}")
mae = evaluator.evaluate(predictions)
print(f"Mean Absolute Error = {mae:.4f}")


Root-mean-square error = 0.8763
Mean Absolute Error = 0.8763


In [11]:
'''
Generate Recommendations
Recommend top 5 movies for each user and top 5 users for each movie.
'''

# Recommend top 5 movies for each user
user_recs = model.recommendForAllUsers(5)
user_recs.show(5, truncate=False)

# Recommend top 5 users for each movie
movie_recs = model.recommendForAllItems(5)
movie_recs.show(5, truncate=False)


+------+--------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                   |
+------+--------------------------------------------------------------------------------------------------+
|1     |[{132333, 5.6624475}, {25771, 5.638621}, {53123, 5.633347}, {5915, 5.615384}, {177593, 5.612128}] |
|2     |[{131724, 4.9002333}, {32892, 4.840149}, {84847, 4.764653}, {86377, 4.708737}, {96004, 4.6554093}]|
|3     |[{6835, 4.8810706}, {5746, 4.8810706}, {5181, 4.836154}, {4518, 4.7304206}, {2851, 4.6392922}]    |
|4     |[{158872, 5.1896887}, {2693, 5.115418}, {55276, 5.0818286}, {3846, 5.04871}, {6380, 4.984668}]    |
|5     |[{25771, 5.532932}, {3266, 5.1395226}, {8477, 4.96207}, {71899, 4.909962}, {1262, 4.9055862}]     |
+------+--------------------------------------------------------------------------------------------------+
only showing top 5 rows

+--

In [8]:
# Showing top-N recommendation example for single user
user_id = 123 
user_recs.filter(f"userId == {user_id}").show(truncate=False)


+------+---------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                    |
+------+---------------------------------------------------------------------------------------------------+
|123   |[{33649, 4.9499464}, {184245, 4.810007}, {134796, 4.810007}, {117531, 4.810007}, {86237, 4.810007}]|
+------+---------------------------------------------------------------------------------------------------+



In [9]:
flat_recs = user_recs.withColumn("rec", explode("recommendations")) \
    .select("userId", "rec.movieId", "rec.rating")

flat_recs = flat_recs.join(movies, on="movieId", how="left") \
    .select("userId", "movieId", "title", "rating")

flat_recs.write.mode("overwrite").parquet("data/recommendations.parquet")
