In [60]:
#Recommender system using PySpark on the MovieLens dataset (100,836 ratings).

#Standard library: used to read the SPARK_HOME environment variable.
import os

#importing findspark and applying it.
import findspark

#Prefer SPARK_HOME when it is set (and non-empty) so the notebook is not tied
#to a single machine; otherwise fall back to the original installation path.
findspark.init(os.environ.get('SPARK_HOME') or '/home/shoby/spark-2.4.0-bin-hadoop2.7')

#importing pyspark and SparkSession
import pyspark
from pyspark.sql import SparkSession

#Creating the Spark session (getOrCreate reuses an existing one if present).
spark = SparkSession.builder.appName('rec').getOrCreate()



In [71]:
#Importing ALS - Alternating least squares.
from pyspark.ml.recommendation import ALS

#Importing rand function to randomize the dataset.
from pyspark.sql.functions import rand

#Importing Regression evaluator
from pyspark.ml.evaluation import RegressionEvaluator

In [72]:
#Loading the ratings file: ratings2.csv holds 100,836 movie ratings.
#The first row is used as the header and column types are inferred.

data = spark.read.options(header=True, inferSchema=True).csv('ratings2.csv')

In [73]:
#Loading the corresponding movie-names file (header row, inferred types).

movieLabels = spark.read.options(header=True, inferSchema=True).csv('movies2.csv')

In [74]:
#Previewing the first 20 rows of the ratings data.

data.show(n=20, truncate=True)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [75]:
#The timestamp feature is not used in this analysis, so it is removed.

unused_rating_column = 'timestamp'
data = data.drop(unused_rating_column)

In [76]:
#Previewing the first 20 rows of the movieLabels data.

movieLabels.show(n=20, truncate=True)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [77]:
#The genres column is not needed, so it is removed.

unused_label_column = 'genres'
movieLabels = movieLabels.drop(unused_label_column)

In [78]:
#Attaching movie titles to the ratings with an inner join on movieId.

data = data.join(movieLabels, on=['movieId'], how='inner')

In [80]:
#Inspecting the joined data; titles are printed in full (no truncation).

data.show(n=20, truncate=False)

+-------+------+------+-----------------------------------------+
|movieId|userId|rating|title                                    |
+-------+------+------+-----------------------------------------+
|1      |1     |4.0   |Toy Story (1995)                         |
|3      |1     |4.0   |Grumpier Old Men (1995)                  |
|6      |1     |4.0   |Heat (1995)                              |
|47     |1     |5.0   |Seven (a.k.a. Se7en) (1995)              |
|50     |1     |5.0   |Usual Suspects, The (1995)               |
|70     |1     |3.0   |From Dusk Till Dawn (1996)               |
|101    |1     |5.0   |Bottle Rocket (1996)                     |
|110    |1     |4.0   |Braveheart (1995)                        |
|151    |1     |5.0   |Rob Roy (1995)                           |
|157    |1     |5.0   |Canadian Bacon (1995)                    |
|163    |1     |5.0   |Desperado (1995)                         |
|216    |1     |5.0   |Billy Madison (1995)                     |
|223    |1

In [81]:
#Randomizing the row order of the dataset.
#rand() picks a new seed on every run when none is given, so an explicit seed
#is passed to make the shuffle repeatable (for the same input partitioning).

data = data.orderBy(rand(seed=42))

In [83]:
#Inspecting the data after shuffling; titles are printed in full.

data.show(n=20, truncate=False)

+-------+------+------+---------------------------------------------+
|movieId|userId|rating|title                                        |
+-------+------+------+---------------------------------------------+
|162350 |596   |4.0   |The Magnificent Seven (2016)                 |
|1258   |288   |3.0   |Shining, The (1980)                          |
|86347  |522   |5.0   |Louis C.K.: Chewed Up (2008)                 |
|2407   |42    |4.0   |Cocoon (1985)                                |
|58347  |563   |2.0   |Penelope (2006)                              |
|7366   |232   |3.0   |Jersey Girl (2004)                           |
|4121   |4     |4.0   |Innerspace (1987)                            |
|184997 |596   |4.0   |Love, Simon (2018)                           |
|318    |274   |4.5   |Shawshank Redemption, The (1994)             |
|296    |5     |5.0   |Pulp Fiction (1994)                          |
|590    |580   |4.0   |Dances with Wolves (1990)                    |
|590    |376   |3.5 

In [84]:
#Summary statistics (count, mean, stddev, min, max) for the full dataset.

data.describe().show(n=20, truncate=True)

+-------+-----------------+------------------+------------------+--------------------+
|summary|          movieId|            userId|            rating|               title|
+-------+-----------------+------------------+------------------+--------------------+
|  count|           100836|            100836|            100836|              100836|
|   mean| 19435.2957177992|326.12756356856676| 3.501556983616962|                null|
| stddev|35530.98719870018|    182.6184914635|1.0425292390606349|                null|
|    min|                1|                 1|               0.5|"11'09""01 - Sept...|
|    max|           193609|               610|               5.0|À nous la liberté...|
+-------+-----------------+------------------+------------------+--------------------+



In [85]:
#Creating a training and test set with a 75%/25% ratio.
#A fixed seed makes the sampling repeatable for a given input DataFrame, so the
#metrics computed from the split can be compared between runs.

training, test = data.randomSplit([0.75, 0.25], seed=42)

In [86]:
#Summary statistics for the training split.

training.describe().show(n=20, truncate=True)

+-------+------------------+------------------+------------------+--------------------+
|summary|           movieId|            userId|            rating|               title|
+-------+------------------+------------------+------------------+--------------------+
|  count|             75629|             75629|             75629|               75629|
|   mean| 19314.37260839096| 325.0635073847334|3.5027568789749965|                null|
| stddev|35388.493352585436|182.58459987979663| 1.042964065170905|                null|
|    min|                 1|                 1|               0.5|"11'09""01 - Sept...|
|    max|            193609|               610|               5.0|À nous la liberté...|
+-------+------------------+------------------+------------------+--------------------+



In [87]:
#Summary statistics for the test split.

test.describe().show(n=20, truncate=True)

+-------+------------------+------------------+------------------+--------------------+
|summary|           movieId|            userId|            rating|               title|
+-------+------------------+------------------+------------------+--------------------+
|  count|             25207|             25207|             25207|               25207|
|   mean| 19798.10342365216| 329.3200698218749|3.4979569167294797|                null|
| stddev|35953.392138540825|182.68657231809618| 1.041235906132599|                null|
|    min|                 1|                 1|               0.5|  'burbs, The (1989)|
|    max|            193579|               610|               5.0|¡Three Amigos! (1...|
+-------+------------------+------------------+------------------+--------------------+



In [88]:
#Alternating Least Squares (ALS) matrix factorization.
#Creating an ALS instance over the userId | movieId | rating columns.
#Tweaked hyper-parameters: maxIter 10 -> 15, regParam 0.1 -> 0.17.
#seed pins the random initialisation so repeated runs are comparable.
#coldStartStrategy='drop' discards rows whose user or movie was not seen in
#training; ALS would otherwise return NaN predictions for them.

als = ALS(
    maxIter=15,
    regParam=0.17,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
)

In [89]:
#Fitting the ALS estimator on the training split to obtain the model.

model = als.fit(dataset=training)

In [90]:
#Scoring the test split with the fitted ALS model; the result carries a
#'prediction' column next to the original columns.

predictions = model.transform(dataset=test)

In [91]:
#Previewing the first 20 predictions.

predictions.show(n=20, truncate=True)

+-------+------+------+--------------------+----------+
|movieId|userId|rating|               title|prediction|
+-------+------+------+--------------------+----------+
|    471|   436|   3.0|Hudsucker Proxy, ...| 3.3897457|
|    471|   599|   2.5|Hudsucker Proxy, ...| 2.7040527|
|    471|   603|   4.0|Hudsucker Proxy, ...| 2.7254431|
|    471|   182|   4.5|Hudsucker Proxy, ...| 3.7105718|
|    471|   474|   3.0|Hudsucker Proxy, ...| 3.2239203|
|    471|   500|   1.0|Hudsucker Proxy, ...| 2.5217607|
|    471|   411|   4.0|Hudsucker Proxy, ...| 3.3032596|
|    471|   287|   4.5|Hudsucker Proxy, ...| 2.4577036|
|    833|   609|   3.0|High School High ...|  1.979109|
|    833|   608|   0.5|High School High ...| 1.9887727|
|   1088|   159|   4.0|Dirty Dancing (1987)|  2.846287|
|   1088|   474|   3.5|Dirty Dancing (1987)| 2.9154606|
|   1088|   554|   5.0|Dirty Dancing (1987)| 3.8023505|
|   1088|   563|   4.0|Dirty Dancing (1987)|  3.385432|
|   1088|   489|   4.5|Dirty Dancing (1987)| 3.1

In [92]:
#Removing any rows that contain null/NaN values before evaluation.

predictions = predictions.na.drop()

In [93]:
#Building a RegressionEvaluator that compares the 'rating' label with the
#'prediction' column using RMSE (root mean squared error).

evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [94]:
#Computing the RMSE of the predictions.
rmse = evaluator.evaluate(dataset=predictions)

In [96]:
#Printing the metric name on one line and its value on the next.
print('RMSE\n' + str(rmse))


RMSE
0.8801775204114638


In [108]:
#Picking a single user (userId 144) from the held-out test split and keeping
#only the columns needed for scoring and display.

test_user = test.where(test.userId == 144).select('userId', 'movieId', 'title')

In [109]:
#Listing the movies in test_user (movies this user watched and rated),
#with titles printed in full.

test_user.show(n=20, truncate=False)

+------+-------+-------------------------------------------+
|userId|movieId|title                                      |
+------+-------+-------------------------------------------+
|144   |2      |Jumanji (1995)                             |
|144   |19     |Ace Ventura: When Nature Calls (1995)      |
|144   |34     |Babe (1995)                                |
|144   |47     |Seven (a.k.a. Se7en) (1995)                |
|144   |208    |Waterworld (1995)                          |
|144   |223    |Clerks (1994)                              |
|144   |231    |Dumb & Dumber (Dumb and Dumber) (1994)     |
|144   |364    |Lion King, The (1994)                      |
|144   |434    |Cliffhanger (1993)                         |
|144   |440    |Dave (1993)                                |
|144   |500    |Mrs. Doubtfire (1993)                      |
|144   |586    |Home Alone (1990)                          |
|144   |588    |Aladdin (1992)                             |
|144   |592    |Batman (

In [110]:
#Scoring the test user's rows with the fitted model.

test_recommendation = model.transform(dataset=test_user)

In [111]:
#Removing any rows with null/NaN values from the scored list.

test_recommendation = test_recommendation.dropna(how='any')

In [112]:
#Building the recommendation list: highest predicted rating first.

recommendation_list = test_recommendation.orderBy(test_recommendation['prediction'].desc())

In [114]:
#Displaying the recommended movies list for the test user, titles in full.

recommendation_list.show(n=20, truncate=False)

+------+-------+-------------------------------------------------------------+----------+
|userId|movieId|title                                                        |prediction|
+------+-------+-------------------------------------------------------------+----------+
|144   |3037   |Little Big Man (1970)                                        |4.107817  |
|144   |4973   |Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)         |4.03395   |
|144   |2959   |Fight Club (1999)                                            |4.0271597 |
|144   |4226   |Memento (2000)                                               |3.9724803 |
|144   |3147   |Green Mile, The (1999)                                       |3.9569452 |
|144   |364    |Lion King, The (1994)                                        |3.900621  |
|144   |1246   |Dead Poets Society (1989)                                    |3.888335  |
|144   |7153   |Lord of the Rings: The Return of the King, The (2003)        |3.854898  |
|144   |47

In [None]:
#The list above ranks the movies from test_user's held-out test ratings by predicted rating, highest first.