In [31]:
#Toy Recommender System Using Spark
#Based on collaborative filtering
#Using the alternating least square algorithm from spark's ML library
#Data is based on movie lens data session
#Codes are based on Jose Portilla's Spark Course
#Import Relevant Data
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


In [8]:
#Initiate spark session
#obtain data
spark=SparkSession.builder.appName('rec').getOrCreate()
data=spark.read.csv('movielens_ratings_data.csv',inferSchema=True,header=True)

In [10]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [17]:
#Modeling
train,test=data.randomSplit([0.8,0.2])
als=ALS(maxIter=5,regParam=0.01,userCol='userId',itemCol='movieId',ratingCol='rating')
model=als.fit(train)
predictions=model.transform(test)



In [18]:
predictions.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   1.0|    27|0.36552268|
|     31|   1.0|    19| 1.6609957|
|     31|   1.0|     4| 1.4980947|
|     31|   1.0|    18| 2.0550027|
|     85|   1.0|    12| 2.2940254|
|     85|   5.0|    16| -3.184245|
|     65|   1.0|    22|-0.5020368|
|     53|   1.0|     9| 2.1625826|
|     78|   1.0|    13|  0.846105|
|     34|   1.0|    16| 0.7258266|
|     34|   3.0|     3| 1.0973988|
|     34|   3.0|    25| 1.0676314|
|     34|   4.0|     2| 0.7606168|
|     81|   5.0|    28| 2.4630466|
|     81|   1.0|     1|  1.178612|
|     81|   2.0|     5| 2.0471745|
|     28|   1.0|    14|0.18764052|
|     26|   1.0|     6| 3.6371083|
|     26|   1.0|    19|  1.368707|
|     26|   1.0|     7|   2.27824|
+-------+------+------+----------+
only showing top 20 rows



In [19]:
evaluator=RegressionEvaluator(metricName='rmse',labelCol='rating',predictionCol='prediction')

In [20]:
RMSE=evaluator.evaluate(predictions)

In [25]:
print('Root Mean Square Error is :',RMSE)

Root Mean Square Error is : 1.6013779530803363


# Comments

Root mean square can be improved upon if we use a larger data set.
Next we look at the case of a single user.

In [27]:
#Next we look at a single user, user with ID no. 12
#For this ID we want to recommend  movies to watch for them.
single_user=test.filter(test['userId']==12).select(['movieId','userId'])

In [28]:
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      7|    12|
|     18|    12|
|     23|    12|
|     24|    12|
|     25|    12|
|     41|    12|
|     44|    12|
|     50|    12|
|     60|    12|
|     63|    12|
|     67|    12|
|     72|    12|
|     79|    12|
|     83|    12|
|     85|    12|
|     92|    12|
+-------+------+



In [30]:
recommendations=model.transform(single_user)
recommendations.orderBy('prediction',ascending=False).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     25|    12|   4.151555|
|     67|    12|  3.9192762|
|     50|    12|  3.0540233|
|     18|    12|   2.768509|
|      7|    12|   2.665512|
|     85|    12|  2.2940254|
|     79|    12|  2.0323036|
|     41|    12|  1.9204357|
|     63|    12|  1.9185566|
|     72|    12|  0.9331144|
|     60|    12| 0.49837595|
|     92|    12| 0.45102525|
|     23|    12| 0.37101054|
|     24|    12| 0.37053138|
|     44|    12| -0.5245247|
|     83|    12|-0.94931483|
+-------+------+-----------+



# Comments
*Based on this outcome, user ID 18 is similar to user ID 18 and the next movie they should see is probably movie 25.
What should be done to users who are new to the platform (cold start problem)?
