In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

# Create (or reuse) the Spark session backing this notebook
spark = (
    SparkSession.builder
    .appName("MovieRecommendationSystem")
    .getOrCreate()
)

# Locations of the input CSV files.
# NOTE(review): these are absolute paths into a local kagglehub cache;
# confirm both files exist on the machine that runs this notebook.
ratings_path = "/root/.cache/kagglehub/datasets/prajitdatta/movielens-100k-dataset/versions/ratings.csv"
movies_path = "/root/.cache/kagglehub/datasets/prajitdatta/movielens-100k-dataset/versions/movies.csv"

# Read ratings (header row, inferred column types) and keep only the three
# columns the recommender consumes
ratings = (
    spark.read
    .csv(ratings_path, header=True, inferSchema=True)
    .select("userId", "movieId", "rating")
)

# Movie metadata, read with the same options; used later to attach titles
movies = spark.read.csv(movies_path, header=True, inferSchema=True)

# 80/20 train/test split with a fixed seed
train, test = ratings.randomSplit([0.8, 0.2], seed=42)

# Configure ALS for explicit ratings (implicitPrefs=False) with
# non-negative factors
als = ALS(
    rank=10,
    maxIter=10,
    regParam=0.1,
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
)

# Fit on the training split, then score the held-out split
model = als.fit(train)
predictions = model.transform(test)

# Compare predicted vs. actual ratings using RMSE
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"📉 Root-mean-square error = {rmse:.4f}")

#  Generate top 10 movie recommendations for each user.
#  The result is lazy and is filtered/acted on several times further down
#  (once per displayed user and again when plotting), so cache it to avoid
#  recomputing the full all-users recommendation job for every action.
user_recommendations = model.recommendForAllUsers(10).cache()

#  Show recommendations for a handful of users.
#  `user_with_titles` collects one (title, pred_rating) DataFrame per entry of
#  `users`, in the same order as `users`.
users = list(range(1, 6))
user_with_titles = []
for user_id in users:
    user_recs = user_recommendations.filter(col("userId") == user_id)
    user_recs.show(truncate=False)

    # Flatten the `recommendations` array into one row per recommended movie,
    # exposing each element's movieId and rating fields as plain columns
    user_movies = (
        user_recs
        .select("userId", explode("recommendations").alias("rec"))
        .select(
            "userId",
            col("rec.movieId").alias("movieId"),
            col("rec.rating").alias("pred_rating"),
        )
    )

    # Attach movie titles. A join does not guarantee row order, so sort
    # explicitly to keep the list ranked from best to worst prediction.
    titled_recs = (
        user_movies
        .join(movies, on="movieId")
        .select("title", "pred_rating")
        .orderBy(col("pred_rating").desc())
    )
    user_with_titles.append(titled_recs)

    print(f"🎬 Top 10 recommended movies for user{user_id}:")
    # Show the frame just built rather than indexing the list by `user_id - 1`,
    # which is only correct while `users` is exactly 1..N in order.
    titled_recs.show(truncate=False)


In [None]:
import matplotlib.pyplot as plt
# Draw one horizontal bar chart of predicted ratings per displayed user.
# Each user id is paired with its recommendations DataFrame by position, so
# this stays correct even if `users` is not exactly 1..N.
for user_id, titled_recs in zip(users, user_with_titles):
    pdf = titled_recs.toPandas()

    # Explicit figure/axes instead of relying on pyplot's "current axes" state
    fig, ax = plt.subplots(figsize=(8, 8))
    pdf.sort_values("pred_rating", ascending=False).plot(
        kind="barh", x="title", y="pred_rating", color="skyblue", ax=ax
    )
    ax.set_xlabel("Predicted Rating")
    ax.set_title(f"Top 10 Movie Recommendations for User {user_id}")
    # barh places the first row at the bottom; flip so the highest rating is on top
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()
    # Release the figure so open figures do not accumulate across iterations
    plt.close(fig)
