## Exercise 2

In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import stddev
from pyspark.sql.functions import col
from pyspark.sql.functions import explode
from pyspark.sql.functions import abs


- Load the ratings data: Load the MovieLens dataset into a DataFrame, and convert the user_id, movie_id, and rating columns to the appropriate data types.
- Train and evaluate the model: Split the dataset into training and test sets, and use the training set to train an ALS model. Evaluate the model using the test set, and compute the RMSE, MAE, and hit rate metrics.
- Generate recommendations: Use the trained model to generate top K movie recommendations for each user, and display the recommendations for a sample of users.

In [21]:
# Reuse the active Spark session if one exists, otherwise start one for this notebook.
spark = SparkSession.builder.appName('movie-recommender').getOrCreate()

# u.data is read as a header-less, tab-separated file; the four positional
# columns are given names via toDF().
ratings_df = (
    spark.read
    .format("csv")
    .option("header", "false")
    .option("delimiter", "\t")
    .load("ml-100k/u.data")
    .toDF("user_id", "movie_id", "rating", "timestamp")
)

# No schema is supplied to the reader, so cast the columns the model consumes
# to numeric types. `timestamp` is not cast here and keeps the type it was read with.
ratings_df = (
    ratings_df
    .withColumn("user_id", col("user_id").cast("integer"))
    .withColumn("movie_id", col("movie_id").cast("integer"))
    .withColumn("rating", col("rating").cast("double"))
)

# Fixed seed shared by the split and by ALS so that "Restart & Run All"
# reproduces the same metrics (given the same input data and partitioning).
# Without it every run reports different RMSE/MAE values.
SEED = 42

# Weights [0.9, 0.1]: roughly 90% of the ratings for training, 10% held out for evaluation.
(training_data, test_data) = ratings_df.randomSplit([0.9, 0.1], seed=SEED)

# coldStartStrategy="drop" discards test rows for which the model cannot
# produce a prediction (users/movies absent from the training split), so the
# metrics below are computed over scored rows only.
als = ALS(maxIter=10, regParam=0.01, userCol="user_id", itemCol="movie_id", ratingCol="rating",
          coldStartStrategy="drop", seed=SEED)

model = als.fit(training_data)

# Cache the scored test set: it is consumed by several independent Spark
# actions (two evaluators, the stddev below, and any later counts), and each
# one would otherwise re-run the model transform.
predictions = model.transform(test_data).cache()

evaluator_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
evaluator_mae = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="prediction")
rmse = evaluator_rmse.evaluate(predictions)
mae = evaluator_mae.evaluate(predictions)

# Sample standard deviation of the true ratings in the scored test rows;
# serves as a baseline scale against which to read the RMSE.
stddev_ = predictions.select(stddev('rating')).collect()[0][0]

# "Hit rate" in this cell means the fraction of test predictions whose absolute
# error is at most `threshold` rating points (a tolerance accuracy, not a
# top-K ranking hit rate).
threshold = 1  # Maximum absolute rating error for a prediction to count as a hit

# `abs` is pyspark.sql.functions.abs (imported at the top of the notebook, where
# it shadows the builtin), so the filter condition is a Column expression.
correct_predictions = predictions.filter(abs(predictions["rating"] - predictions["prediction"]) <= threshold).count()
total_predictions = predictions.count()

# Guard against an empty prediction set (e.g. every test row dropped by the
# cold-start strategy): report NaN instead of raising ZeroDivisionError.
if total_predictions:
    hit_rate = correct_predictions / total_predictions
else:
    hit_rate = float("nan")

print("Root-mean-square error = " + str(rmse))
print("Mean absolute error = " + str(mae))
print("Standard deviation of ratings = " + str(stddev_))
print("Hit rate = {:.2%}".format(hit_rate))

# Top-10 recommendations per user, flattened to one row per (user, movie):
#   1. recommendForAllUsers(10) yields one row per user with a `recommendations` array;
#   2. explode() turns each array element into its own row, aliased `rec`;
#   3. the final select pulls the `movie_id` and `rating` fields out of `rec`
#      (here `rating` is the score returned by the model for that recommendation).
userRecs = (
    model.recommendForAllUsers(10)
    .select('user_id', explode('recommendations').alias('rec'))
    .select('user_id', 'rec.movie_id', 'rec.rating')
)

# show() prints only the first 20 rows, i.e. a small sample of users.
userRecs.show()


Root-mean-square error = 1.0511155761229622
Mean absolute error = 0.7916485699068206
Standard deviation of ratings = 1.1283944757275661
Hit rate = 70.87%




+-------+--------+---------+
|user_id|movie_id|   rating|
+-------+--------+---------+
|      1|    1205|6.9682846|
|      1|    1176| 6.199212|
|      1|     793|6.1119885|
|      1|     390| 5.899417|
|      1|    1062| 5.842989|
|      1|     611| 5.723258|
|      1|     647|5.6879067|
|      1|     745|5.6599145|
|      1|    1141|5.5994134|
|      1|     408|5.5190144|
|      3|     767| 5.992915|
|      3|    1598|5.5911484|
|      3|     960|5.5557647|
|      3|    1114|5.4302955|
|      3|     962|5.1392417|
|      3|     817|5.1271434|
|      3|     880| 5.072099|
|      3|     854| 5.070144|
|      3|     741|5.0638375|
|      3|    1470| 5.050608|
+-------+--------+---------+
only showing top 20 rows



                                                                                