In [1]:
from pyspark.sql import SparkSession
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Recommender System')
    .getOrCreate()
)

In [2]:
from pyspark.ml.recommendation import ALS

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
# Load the ratings CSV; the first row holds the column names and the column
# types are inferred from the file contents.
data = spark.read.csv(
    'movielens_ratings.csv',
    header=True,
    inferSchema=True,
)

In [8]:
# Print the column names and the types that schema inference assigned to them.
data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [9]:
# Divide the data into a training set (~70%) and a test set (~30%).
# randomSplit assigns rows at random, so without a seed every run yields a
# different split and therefore different models and metrics. A fixed seed
# keeps the notebook reproducible under Restart & Run All.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [10]:
# Configure the ALS (alternating least squares) collaborative-filtering estimator.
#
# coldStartStrategy='drop': with a random split, a user or movie can end up only
# in the test set. The default strategy ('nan') yields NaN predictions for such
# rows, which in turn makes the RMSE NaN. Dropping those rows keeps the
# evaluation metric well-defined.
#
# seed: fixes the random initialisation of the latent factors so repeated runs
# on the same training data produce the same model.
als = ALS(
    regParam=0.01,
    maxIter=5,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
)

In [11]:
# Fit the ALS estimator on the training split to obtain the fitted model.
model = als.fit(train)

In [12]:
# Score the held-out test rows; transform() appends a 'prediction' column.
prediction = model.transform(test)

In [13]:
prediction.show()
# The rating is treated as a continuous value, so the model's predictions are
# not confined to the observed rating scale: they can exceed the maximum rating
# or fall below zero. A negative prediction can be read as a strong dislike.

+-------+------+------+------------+
|movieId|rating|userId|  prediction|
+-------+------+------+------------+
|     31|   1.0|    26|   1.1658027|
|     31|   4.0|    12|   1.2598867|
|     31|   1.0|     5|0.0069105327|
|     31|   3.0|     8| -0.02229669|
|     31|   3.0|     7|  0.45587373|
|     31|   2.0|    25|  0.66239434|
|     31|   1.0|    29|  0.69301754|
|     31|   1.0|    18|   0.1922757|
|     85|   1.0|     5|    3.178989|
|     85|   1.0|     4|   1.0739911|
|     65|   1.0|    19|   0.8025398|
|     65|   2.0|    15|  -1.3817253|
|     65|   1.0|    24|   2.3068101|
|     53|   1.0|    23|   3.8777764|
|     53|   1.0|     7|   1.9206043|
|     53|   1.0|    25|   1.6409466|
|     53|   3.0|    14|  -1.9544158|
|     78|   1.0|    11|   1.2330363|
|     78|   1.0|     2|   1.3188988|
|     34|   1.0|    16|    5.094778|
+-------+------+------+------------+
only showing top 20 rows



In [19]:
# Predictions are continuous, so score them with root mean squared error (RMSE)
# between the true 'rating' and the model's 'prediction'.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)

In [21]:
# Compute the RMSE of the test-set predictions.
rmse = evaluator.evaluate(prediction)

In [22]:
# Display the RMSE; it is expressed in the same units as the rating, and lower is better.
rmse

2.264286479208379

In [25]:
# The data covers many users; narrow it down to a single user (userId 1) and
# keep only the columns the model needs as input. These rows come from the
# test split, i.e. movies this user rated that were held out from training.
single_user = (
    test
    .where(test['userId'] == 1)
    .select('movieId', 'userId')
)

In [27]:
# Preview the (movieId, userId) pairs that will be scored for this user.
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      2|     1|
|      4|     1|
|      9|     1|
|     13|     1|
|     19|     1|
|     33|     1|
|     37|     1|
|     44|     1|
|     47|     1|
|     54|     1|
|     60|     1|
|     63|     1|
|     70|     1|
|     74|     1|
|     76|     1|
|     77|     1|
|     81|     1|
|     82|     1|
|     86|     1|
|     88|     1|
+-------+------+
only showing top 20 rows



In [28]:
# Predict a rating for every (movieId, userId) pair selected for the single user.
recommendation = model.transform(single_user)

In [30]:
# Rank the user's movies from highest to lowest predicted rating and show the top rows.
(
    recommendation
    .orderBy(recommendation['prediction'].desc())
    .show()
)

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     77|     1|  5.0453186|
|     88|     1|  4.6632123|
|     70|     1|  3.2177937|
|     92|     1|  3.1790814|
|      4|     1|  3.0790188|
|     13|     1|  2.8388832|
|     60|     1|  2.4209964|
|     74|     1|   2.075759|
|     81|     1|  1.5793107|
|     44|     1|  1.4867926|
|     37|     1|  1.0968966|
|     33|     1|  1.0275334|
|     86|     1|  0.9054602|
|     93|     1| 0.89568585|
|     63|     1|  0.8154844|
|     54|     1|  0.6966945|
|     82|     1| 0.52422667|
|      9|     1|-0.17602533|
|     91|     1| -0.6101417|
|     19|     1|-0.96193635|
+-------+------+-----------+
only showing top 20 rows

