In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = SparkSession.builder.appName('movie_recommendation').getOrCreate()
spark

In [3]:
df = spark.read.csv('movielens_ratings.csv',inferSchema=True,header=True)
df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [4]:
df.show(5)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
+-------+------+------+
only showing top 5 rows



In [5]:
df.count()

1501

In [6]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [10]:
train, test = df.randomSplit([0.7,0.3])

In [11]:
train.count(), test.count()

(1077, 424)

In [12]:
train.show(5)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      0|   1.0|     3|
|      0|   1.0|     5|
|      0|   1.0|     8|
|      0|   1.0|    11|
|      0|   1.0|    13|
+-------+------+------+
only showing top 5 rows



In [13]:
test.show(5)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      0|   1.0|     6|
|      0|   1.0|    15|
|      0|   1.0|    19|
|      0|   1.0|    26|
|      1|   1.0|    28|
+-------+------+------+
only showing top 5 rows



In [8]:
als = ALS(userCol='userId',ratingCol='rating',itemCol='movieId')
als

ALS_36bb09a9e678

In [14]:
als_model = als.fit(train)
als_model

ALSModel: uid=ALS_36bb09a9e678, rank=10

In [15]:
predictions = als_model.transform(test)
predictions.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)
 |-- prediction: float (nullable = false)



In [16]:
predictions.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|      1|   1.0|    28| 1.6128088|
|      2|   4.0|    28| 2.4710596|
|      3|   1.0|    28| 0.5800271|
|      0|   1.0|    26| 1.6269283|
|      4|   4.0|    26| 1.9392356|
|      3|   1.0|    13| 1.0048172|
|      0|   1.0|     6| 1.2605213|
|      2|   1.0|     3| 1.4575107|
|      0|   1.0|    19| 0.5118731|
|      0|   1.0|    15| 1.0784113|
|      4|   1.0|     9|  1.323154|
|      3|   2.0|     8| 1.0388323|
|      4|   2.0|     8| 1.9129245|
|      4|   1.0|    23|0.78435105|
|      2|   2.0|     7| 2.4162912|
|      4|   1.0|     7| 1.8733845|
|      2|   1.0|    25| 1.6303619|
|      3|   1.0|    21| 1.2347553|
|      3|   3.0|    14| 0.8342831|
|      6|   1.0|     2|  1.234749|
+-------+------+------+----------+
only showing top 20 rows



In [17]:
evaluator = RegressionEvaluator(predictionCol='prediction',labelCol='rating',metricName='rmse')
rmse = evaluator.evaluate(predictions)
print("RMSE of ALS:", rmse)

RMSE of ALS: 1.0176051969384272


## Making recommendations for a single user

In [21]:
single_user = test.filter(test['userId'] == 25).select('movieId','userId','rating')
single_user.show()

+-------+------+------+
|movieId|userId|rating|
+-------+------+------+
|      2|    25|   1.0|
|      7|    25|   1.0|
|     12|    25|   3.0|
|     17|    25|   1.0|
|     41|    25|   1.0|
|     43|    25|   1.0|
|     47|    25|   4.0|
|     71|    25|   4.0|
|     74|    25|   1.0|
|     76|    25|   1.0|
|     85|    25|   1.0|
|     92|    25|   1.0|
+-------+------+------+



In [22]:
recommendations = als_model.transform(single_user)
recommendations.show()

+-------+------+------+----------+
|movieId|userId|rating|prediction|
+-------+------+------+----------+
|     85|    25|   1.0| 1.3198066|
|     76|    25|   1.0|  1.635262|
|     12|    25|   3.0|  1.626037|
|     47|    25|   4.0| 1.4469383|
|     92|    25|   1.0| 2.4022336|
|     41|    25|   1.0| 0.7240495|
|     43|    25|   1.0| 1.4375576|
|     17|    25|   1.0| 1.7582796|
|      7|    25|   1.0| 0.9474437|
|     71|    25|   4.0| 1.5368365|
|      2|    25|   1.0| 1.6303619|
|     74|    25|   1.0| 1.3975834|
+-------+------+------+----------+



In [24]:
recommendations.orderBy('prediction',ascending=False).show()

+-------+------+------+----------+
|movieId|userId|rating|prediction|
+-------+------+------+----------+
|     92|    25|   1.0| 2.4022336|
|     17|    25|   1.0| 1.7582796|
|     76|    25|   1.0|  1.635262|
|      2|    25|   1.0| 1.6303619|
|     12|    25|   3.0|  1.626037|
|     71|    25|   4.0| 1.5368365|
|     47|    25|   4.0| 1.4469383|
|     43|    25|   1.0| 1.4375576|
|     74|    25|   1.0| 1.3975834|
|     85|    25|   1.0| 1.3198066|
|      7|    25|   1.0| 0.9474437|
|     41|    25|   1.0| 0.7240495|
+-------+------+------+----------+



In [23]:
print("RMSE of movie rating prediction:", evaluator.evaluate(recommendations))

RMSE of movie rating prediction: 1.2364583181617885
