In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=e5324b0c46c67760d59ccac23035d0c4ef161884206fa3eec045559c79853497
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [3]:
#Setup Spark Session
spark = SparkSession.builder.appName('Recommender').getOrCreate()
spark

In [4]:
data = spark.read.csv('/content/book_ratings.csv', inferSchema=True,header=True)

In [5]:
data.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



In [6]:
data.count()

911138

In [7]:
data.describe().show()

+-------+------------------+-----------------+------------------+
|summary|           book_id|          user_id|            rating|
+-------+------------------+-----------------+------------------+
|  count|            911138|           911138|            911138|
|   mean| 4580.938824854194|25533.08371509036|3.8562468034479958|
| stddev|2658.2302075792495|15209.42702653759| 0.984595699725942|
|    min|                 1|                1|                 1|
|    max|              9236|            53424|                 5|
+-------+------------------+-----------------+------------------+



In [8]:
train_data, test_data = data.randomSplit([0.8, 0.2])
# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating")
#Fitting the model on the train_data
model = als.fit(train_data)

In [9]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test_data)
#Displaying predictions calculated by the model
predictions.show()

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|      1|  12471|     5| 4.4253864|
|      1|  16913|     5|   3.81272|
|      2|  19942|     5| 3.9476824|
|      2|  17984|     5| 5.3387175|
|      1|  11927|     4|  4.196174|
|      2|  14603|     4|  3.491901|
|      1|  18313|     5|  4.589915|
|      2|  10111|     5| 4.2789865|
|      1|  13282|     5| 4.0388703|
|      1|  17663|     5| 5.0465817|
|      1|  10246|     4| 4.0988874|
|      2|   1169|     3|  3.339068|
|      1|  24499|     5| 3.6218581|
|      1|  29123|     3| 3.2070491|
|      2|   5885|     4| 3.9502387|
|      1|   9246|     1| 3.6571405|
|      1|  33872|     5|  4.364473|
|      2|  11285|     4|  4.257736|
|      1|  10335|     4| 4.0259557|
|      1|  20848|     5|  4.756963|
+-------+-------+------+----------+
only showing top 20 rows



In [10]:
#Filtering user with user id "5461" with book id on which it has given the reviews
user1 = test_data.filter(test_data['user_id']==5461).select(['book_id','user_id'])
#Displaying user1 data
user1.show()

+-------+-------+
|book_id|user_id|
+-------+-------+
|      9|   5461|
|     14|   5461|
|     28|   5461|
|     31|   5461|
|     37|   5461|
|     38|   5461|
|     57|   5461|
|     80|   5461|
|    115|   5461|
|    118|   5461|
|    123|   5461|
|    129|   5461|
|    157|   5461|
|    233|   5461|
|    255|   5461|
|    323|   5461|
|    358|   5461|
|    461|   5461|
|    521|   5461|
|    577|   5461|
+-------+-------+
only showing top 20 rows



In [11]:
user1.count()

35

In [12]:
#Traning and evaluating for user1 with our model trained with the help of training data
recommendations = model.transform(user1)
#Displaying the predictions of books for user1
recommendations.orderBy('prediction',ascending=False).show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|    757|   5461| 4.5889993|
|   1402|   5461| 4.5177164|
|    157|   5461| 4.4611073|
|     37|   5461| 4.4402065|
|    358|   5461| 4.4117694|
|    129|   5461|  4.411402|
|     80|   5461| 4.3915987|
|     28|   5461|  4.379265|
|    461|   5461| 4.3406534|
|   2854|   5461|  4.292382|
|    577|   5461|  4.251686|
|     31|   5461| 4.2478514|
|    844|   5461| 4.1997724|
|   1211|   5461|  4.185651|
|     57|   5461| 4.1813188|
|    115|   5461|  4.127841|
|   4759|   5461|  4.124883|
|    323|   5461| 4.0728273|
|     38|   5461|  4.045744|
|    233|   5461| 4.0447946|
+-------+-------+----------+
only showing top 20 rows



In [13]:
recommendations.show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|      9|   5461| 2.9475477|
|     14|   5461| 4.0005703|
|     28|   5461|  4.379265|
|     31|   5461| 4.2478514|
|     37|   5461| 4.4402065|
|     38|   5461|  4.045744|
|     57|   5461| 4.1813188|
|     80|   5461| 4.3915987|
|    115|   5461|  4.127841|
|    118|   5461| 3.8298192|
|    123|   5461| 3.8117502|
|    129|   5461|  4.411402|
|    157|   5461| 4.4611073|
|    233|   5461| 4.0447946|
|    255|   5461|  3.655685|
|    323|   5461| 4.0728273|
|    358|   5461| 4.4117694|
|    461|   5461| 4.3406534|
|    521|   5461|  3.326486|
|    577|   5461|  4.251686|
+-------+-------+----------+
only showing top 20 rows

