In [1]:
from pyspark.sql import SparkSession 
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [3]:
spark = SparkSession.builder.appName("Recommender Systems").getOrCreate()

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

Spark MLlib library for Machine Learning provides a Collaborative Filtering implementation by using Alternating Least Squares. The implementation in MLlib has these parameters:

numBlocks is the number of blocks used to parallelize computation (set to -1 to auto-configure).
rank is the number of latent factors in the model.
iterations is the number of iterations to run.
lambda specifies the regularization parameter in ALS.
implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data.
alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations.
Let's see this all in action!

In [5]:
data = spark.read.csv('movielens_ratings.csv',inferSchema=True,header=True)

In [6]:
data.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [7]:
data.head()

Row(movieId=2, rating=3.0, userId=0)

In [8]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [9]:
# Smaller dataset so we will use 0.8 / 0.2
(training, test) = data.randomSplit([0.8, 0.2])

In [10]:
# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")
model = als.fit(training)

Now let's see hwo the model performed!

In [11]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [12]:
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   2.0|    25| -1.2346957|
|     31|   1.0|    29|  0.9935515|
|     31|   1.0|     0|-0.37449837|
|     85|   2.0|    20|   2.359759|
|     85|   1.0|     5|    3.32846|
|     85|   5.0|     8|  0.6227831|
|     85|   4.0|     7|  2.5793495|
|     65|   1.0|    22|  1.2391639|
|     65|   2.0|    15|  1.1348381|
|     53|   1.0|    12|  1.5529395|
|     53|   1.0|     6|  2.9038048|
|     53|   5.0|    21| 0.89199036|
|     53|   3.0|    14|  0.3927058|
|     34|   1.0|    17|    1.79112|
|     34|   1.0|     4|  1.3677015|
|     34|   3.0|    25|  1.3870817|
|     34|   1.0|    14| 0.15139914|
|     81|   1.0|    22|  1.6306608|
|     81|   1.0|    21| 0.93075997|
|     81|   4.0|    11|  2.0511003|
+-------+------+------+-----------+
only showing top 20 rows



In [13]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.8630446311233873


The RMSE described our error in terms of the stars rating column.

In [14]:
single_user = test.filter(test['userId']==11).select(['movieId','userId'])

In [15]:
# User had 10 ratings in the test data set 
# Realistically this should be some sort of hold out set!
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      9|    11|
|     11|    11|
|     12|    11|
|     30|    11|
|     39|    11|
|     76|    11|
|     79|    11|
|     80|    11|
|     81|    11|
|     86|    11|
|     88|    11|
+-------+------+



In [16]:
reccomendations = model.transform(single_user)

In [17]:
reccomendations.orderBy('prediction',ascending=False).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     80|    11|  2.9605021|
|     76|    11|  2.8325226|
|     11|    11|  2.1462739|
|     81|    11|  2.0511003|
|     30|    11|  1.8863201|
|     12|    11|  1.5392158|
|      9|    11|   1.422175|
|     79|    11|  1.2804176|
|     86|    11|  0.7060246|
|     39|    11|-0.10580361|
|     88|    11|  -4.815578|
+-------+------+-----------+

