In [20]:
import pandas as pd
from tqdm.notebook import tqdm

In [1]:

from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
# Reuse an already-running Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('app')
    .getOrCreate()
)

# Single seed passed to the random splits and the ALS models below.
seed = 57005  # == 0xDEAD

In [2]:
# Load the ratings CSV: the first row is the header and column types are inferred.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/ratings_with_titles.csv')
)
# Preview the first 20 rows (long strings are truncated).
data.show(n=20, truncate=True)

+-------+------+-------+--------------------+
|user_id|rating|book_id|               title|
+-------+------+-------+--------------------+
| 276725|     0|   2966|Flesh Tones: A Novel|
| 276726|     5| 225829|    Rites of Passage|
| 276727|     0|  11054|        The Notebook|
| 276729|     3| 246854|      Help!: Level 1|
| 276729|     6| 246855|The Amsterdam Con...|
| 276733|     0| 123645|Les Particules El...|
| 276736|     8|      0|                 unk|
| 276737|     6|      1|                 unk|
| 276744|     7|   9295|     A Painted House|
| 276745|    10|      2|                 unk|
| 276746|     0|   2030|           Lightning|
| 276746|     0|    227| Manhattan Hunt Club|
| 276746|     0|   1004|       Dark Paradise|
| 276746|     0|    596|          Night Sins|
| 276746|     0|  87284|         At the Edge|
| 276746|     0|  30985|       Make Them Cry|
| 276747|     9|   4779|Little Altars Eve...|
| 276747|     0|  25797|How Stella Got He...|
| 276747|     0|   7154|     The L

In [3]:
# Summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show(n=20, truncate=True)

+-------+------------------+------------------+------------------+--------------------+
|summary|           user_id|            rating|           book_id|               title|
+-------+------------------+------------------+------------------+--------------------+
|  count|           1149780|           1149780|           1149780|             1149780|
|   mean|140386.39512602412|2.8669501991685364|63138.878847257736|            Infinity|
| stddev| 80562.27771851105|3.8541838592016537| 69620.05184548115|                 NaN|
|    min|                 2|                 0|                 0| A Light in the S...|
|    max|            278854|                10|            271378|   �?�?thique en toc|
+-------+------------------+------------------+------------------+--------------------+



Hyperparameter search, using only a few training iterations per model

In [40]:
# Hold out 25% of the ratings as the test set, then split the remaining 75%
# once more (75/25) into the training and dev sets used for model selection.
train_dev, test = data.randomSplit(weights=[0.75, 0.25], seed=seed)
train, dev = train_dev.randomSplit(weights=[0.75, 0.25], seed=seed)

# RMSE between the true `rating` and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)

# One record per (reg, rank) combination, filled in by the grid search below.
results = []

for reg in tqdm([2, 1, 0.1]):
    for rank in tqdm([4, 8, 16, 32], leave=False):
        als = ALS(
            rank=rank,
            regParam=reg,
            maxIter=10,
            userCol='user_id',
            itemCol='book_id',
            ratingCol='rating',
            # rows whose prediction is NaN are dropped before evaluation
            coldStartStrategy='drop',
            seed=seed,
        )
        model = als.fit(train)
        predictions = model.transform(dev)
        rmse = evaluator.evaluate(predictions)
        results.append({'rmse': rmse, 'reg': reg, 'rank': rank})


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [41]:
# Rank the hyper-parameter combinations from lowest to highest dev RMSE.
pd.DataFrame(results).sort_values('rmse', ascending=True)

Unnamed: 0,rmse,reg,rank
7,3.917925,1.0,32
6,3.928993,1.0,16
5,3.936872,1.0,8
4,3.956725,1.0,4
3,4.005948,2.0,32
1,4.006461,2.0,8
2,4.01417,2.0,16
0,4.01722,2.0,4
11,4.23716,0.1,32
10,4.428819,0.1,16


In [50]:
# With 25 or more iterations the fit fails with a StackOverflowError, so the
# iteration count is kept below that.
# NOTE(review): this is likely caused by the growing RDD lineage; setting a
# checkpoint directory on the SparkContext may lift the limit - TODO confirm.
als = ALS(
    rank=32,
    regParam=1,
    maxIter=22,
    userCol='user_id',
    itemCol='book_id',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=seed,
)
# Fit on everything except the held-out test part (i.e. train + dev).
model = als.fit(train_dev)

# Score both the data the model was fitted on and the unseen test split.
train_preds = model.transform(train_dev)
test_preds = model.transform(test)

train_rmse = evaluator.evaluate(train_preds)
test_rmse = evaluator.evaluate(test_preds)

print(f'Test RMSE :: {test_rmse:.3f}, train RMSE :: {train_rmse:.3f}')

Test RMSE :: 3.823, train RMSE :: 2.189


In [51]:
# First 20 predictions on the held-out test split.
test_preds.show(n=20, truncate=True)

+-------+------+-------+------------------+----------+
|user_id|rating|book_id|             title|prediction|
+-------+------+-------+------------------+----------+
|      8|     5|     12|The Middle Stories|       0.0|
|    626|     0|     26|       Wild Animus|0.35724196|
|   4092|     0|     26|       Wild Animus|0.24809062|
|  11808|     0|     26|       Wild Animus|       0.0|
|  15957|     0|     26|       Wild Animus|0.52199364|
|  17950|     0|     26|       Wild Animus|0.70216787|
|  26583|     0|     26|       Wild Animus| 0.9834663|
|  27115|     0|     26|       Wild Animus|       0.0|
|  32440|     0|     26|       Wild Animus|0.63466966|
|  34823|     0|     26|       Wild Animus|0.51857996|
|  35910|     0|     26|       Wild Animus| 0.8382046|
|  37950|     3|     26|       Wild Animus| 1.0775675|
|  37989|     0|     26|       Wild Animus| 1.3259197|
|  41135|     6|     26|       Wild Animus|       0.0|
|  51803|     1|     26|       Wild Animus| 1.2931714|
|  57412| 

In [52]:
# First 20 predictions on the data the model was fitted on.
train_preds.show(n=20, truncate=True)

+-------+------+-------+------------------+----------+
|user_id|rating|book_id|             title|prediction|
+-------+------+-------+------------------+----------+
|  92861|     0|     12|The Middle Stories|       0.0|
| 276762|     0|     12|               unk|       0.0|
|    970|     0|     26|       Wild Animus|       0.0|
|   1025|     0|     26|       Wild Animus| 1.0538764|
|   1863|     0|     26|       Wild Animus|       0.0|
|   1903|     0|     26|       Wild Animus|0.48066524|
|   5543|     0|     26|       Wild Animus|0.35533288|
|   5916|     0|     26|       Wild Animus| 0.9869093|
|   6095|     0|     26|       Wild Animus|       0.0|
|   8674|     1|     26|       Wild Animus| 0.9887588|
|   8681|     0|     26|       Wild Animus| 1.0587127|
|  10532|     0|     26|       Wild Animus| 1.2420533|
|  12264|     7|     26|       Wild Animus| 3.5771394|
|  14336|     0|     26|       Wild Animus|0.68184763|
|  16999|     0|     26|       Wild Animus| 0.9695797|
|  17890| 

To recommend books to a new user who liked the Lord of the Rings, we need to retrain the whole model with this user's rating included: ALS learns a separate factor vector for every user, so a user who was absent during training has none.

In [72]:
# Append a synthetic user who gave the Lord of the Rings Trilogy the top rating
# and retrain on the full data set with that rating included.

# Take the next free user id from the data itself instead of hard-coding the
# maximum copied by hand from the data.describe() output.
new_user_id = data.agg({'user_id': 'max'}).first()[0] + 1
lotr_book_id = 105418  # id of the Lord of the Rings Trilogy
new_row = [(new_user_id, 10, lotr_book_id, "Lord of the Rings Trilogy")]

# Build the row with the schema of `data`, the frame it is unioned with:
# union() matches columns by position, so order and types must agree with `data`.
user_df = spark.createDataFrame(new_row, schema=data.schema)
# also use all available data
extended_df = data.union(user_df)

als = ALS(
    maxIter=22,
    regParam=1,
    rank=32,
    userCol='user_id',
    itemCol='book_id',
    ratingCol='rating',
    seed=seed,
    coldStartStrategy='drop',
)
model = als.fit(extended_df)

Here we can see that the model fits the new user's rating quite well (note that this rating was part of the training data).

In [73]:
# Score the single rating supplied for the new user with the retrained model.
# NOTE(review): this rebinds `results`, which earlier held the list of
# hyper-parameter search records.
results = model.transform(user_df)
results.sort('prediction', ascending=False).show(n=20, truncate=True)

+-------+------+-------+--------------------+----------+
|user_id|rating|book_id|               title|prediction|
+-------+------+-------+--------------------+----------+
| 278855|    10| 105418|Lord of the Rings...|  8.932368|
+-------+------+-------+--------------------+----------+



Let's see the recommendations

In [74]:
from pyspark.sql.functions import lit

# Unique (book_id, title) pairs found in the ratings data.
books = data.select('book_id', 'title').distinct()
# Preview the candidate rows: every book tagged with the id of the new user.
books.withColumn('user_id', lit(278854 + 1)).show(n=20, truncate=True)

+-------+--------------------+-------+
|book_id|               title|user_id|
+-------+--------------------+-------+
|     81|                 unk| 278855|
|    113|                 unk| 278855|
|   9558|Prisonniers du pa...| 278855|
|  44552|         Mortal Prey| 278855|
|    261|                 unk| 278855|
|  15001|   Smoke and Mirrors| 278855|
|  24684|  The Delta Decision| 278855|
|  54849|         Daw Fantasy| 278855|
|  38130|        Split Second| 278855|
|  64746|Tides Of Summe (H...| 278855|
|  39625|Perfect Little An...| 278855|
|  90234|         The Sunbird| 278855|
| 247247|         Soho Blues.| 278855|
| 247256|She's Not There :...| 278855|
|   1322|The Stand: Comple...| 278855|
|  27958|           God Knows| 278855|
|    447|                 unk| 278855|
|  87881|The Sunne in Sple...| 278855|
| 108697|          Gatekeeper| 278855|
| 180174|Don't Sit Under t...| 278855|
+-------+--------------------+-------+
only showing top 20 rows



In [75]:
# Score every candidate book for the new user and list the 50 highest predictions.

# Read the user id from the row that was added to the training data rather
# than repeating the literal.
new_user_id = user_df.first()['user_id']
# Leave out the books the user has already rated: returning them as
# "recommendations" tells the user nothing new.
unseen_books = books.join(user_df.select('book_id'), on='book_id', how='left_anti')

recommendations = model.transform(unseen_books.withColumn('user_id', lit(new_user_id)))
(
    recommendations
    .select('title', 'prediction')
    .orderBy('prediction', ascending=False)
    .show(50, truncate=False)
)

+-----------------------------------------------------------------------------------------------------------------+----------+
|title                                                                                                            |prediction|
+-----------------------------------------------------------------------------------------------------------------+----------+
|The Potter                                                                                                       |12.527517 |
|Van Gogh Face to Face: The Portraits                                                                             |12.527517 |
|Wuthering Heights (Norton Critical Edition)                                                                      |12.527517 |
|The Pillsbury Cookbook                                                                                           |11.141712 |
|Who Brought the Bread: A Bible Mystery                                                                        

In [58]:
# NOTE(review): `items_ids` is not defined in any cell visible here, and this
# cell's execution count is lower than the cells above it - it presumably relies
# on state from a deleted or reordered cell and would fail on a fresh
# Restart & Run All. TODO confirm and either restore the definition or drop the cell.
items_ids[0].book_id

115602