In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=22a9242c7de9f541850d7c7d02f3c01fa3c5187fb477bba5f3bd3ed297f13973
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
#Setup Spark Session
spark = SparkSession.builder.appName('Recommender').getOrCreate()
spark

In [4]:
data = spark.read.csv('/content/drive/MyDrive/datasets/book_ratings.csv',

inferSchema=True,header=True)

In [5]:
data.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



In [6]:
data.count()

981756

In [7]:
data.describe().show()

+-------+-----------------+------------------+------------------+
|summary|          book_id|           user_id|            rating|
+-------+-----------------+------------------+------------------+
|  count|           981756|            981756|            981756|
|   mean|4943.275635697668|25616.759933221696|3.8565335989797873|
| stddev|2873.207414896114|15228.338825882167|0.9839408559620033|
|    min|                1|                 1|                 1|
|    max|            10000|             53424|                 5|
+-------+-----------------+------------------+------------------+



In [8]:
train_data, test_data = data.randomSplit([0.8, 0.2])

In [9]:
als = ALS(maxIter=5,
          regParam=0.01,
          userCol="user_id",
          itemCol="book_id",
          ratingCol="rating")

In [10]:
model = als.fit(train_data)

In [11]:
predictions = model.transform(test_data)

In [12]:
predictions.show()

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|      1|  32592|     4|  4.295682|
|      1|    588|     5| 4.2512364|
|      1|  16913|     5| 3.7509816|
|      1|  32305|     5| 4.4636183|
|      1|  38475|     4|  4.085602|
|      1|  11927|     4|   5.27649|
|      1|  33065|     4|  4.986938|
|      1|  42404|     5| 4.8027167|
|      1|  21487|     4|   4.20221|
|      1|  16377|     4| 5.1734757|
|      1|  17663|     5| 4.4150023|
|      1|    439|     3| 3.7341223|
|      1|  37284|     5| 3.8447344|
|      1|   1185|     4|  3.946853|
|      1|  33872|     5| 4.4724183|
|      1|  21228|     5|  4.156422|
|      1|  44397|     5|  5.076925|
|      1|  30681|     5|  3.155237|
|      1|  23612|     4|  4.333251|
|      1|  37834|     5| 4.7758417|
+-------+-------+------+----------+
only showing top 20 rows



In [13]:
user1 = test_data.filter(test_data['user_id']==5461).select(['book_id','user_id'])

In [14]:
user1.show()

+-------+-------+
|book_id|user_id|
+-------+-------+
|      7|   5461|
|      8|   5461|
|     37|   5461|
|     47|   5461|
|     82|   5461|
|     86|   5461|
|    117|   5461|
|    118|   5461|
|    129|   5461|
|    130|   5461|
|    255|   5461|
|    261|   5461|
|    281|   5461|
|    304|   5461|
|    321|   5461|
|    339|   5461|
|    358|   5461|
|    396|   5461|
|    444|   5461|
|    478|   5461|
+-------+-------+
only showing top 20 rows



In [15]:
user1.count()

42

In [16]:
recommendations = model.transform(user1)

In [17]:
recommendations.orderBy('prediction',ascending=False).show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|      8|   5461| 5.0335436|
|     47|   5461|  4.730926|
|    483|   5461|  4.622107|
|   1094|   5461| 4.5957804|
|    339|   5461| 4.5417233|
|    444|   5461| 4.5225058|
|    478|   5461| 4.4244857|
|   1465|   5461| 4.4110894|
|      7|   5461| 4.3999443|
|    129|   5461| 4.3938417|
|     82|   5461|   4.39313|
|    561|   5461|   4.37797|
|   1202|   5461| 4.3322597|
|   1088|   5461|  4.329598|
|    844|   5461|  4.262401|
|    130|   5461| 4.2086234|
|    885|   5461|  4.141724|
|   4877|   5461| 4.1130195|
|   1493|   5461|  4.111848|
|    304|   5461| 4.1024375|
+-------+-------+----------+
only showing top 20 rows



In [18]:
recommendations.show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|      7|   5461| 4.3999443|
|      8|   5461| 5.0335436|
|     37|   5461| 3.8126752|
|     47|   5461|  4.730926|
|     82|   5461|   4.39313|
|     86|   5461|  3.939124|
|    117|   5461|  3.847703|
|    118|   5461| 3.7317593|
|    129|   5461| 4.3938417|
|    130|   5461| 4.2086234|
|    255|   5461|  3.551429|
|    261|   5461| 3.1461303|
|    281|   5461| 3.1281495|
|    304|   5461| 4.1024375|
|    321|   5461|  3.316253|
|    339|   5461| 4.5417233|
|    358|   5461| 3.5786288|
|    396|   5461| 3.9388258|
|    444|   5461| 4.5225058|
|    478|   5461| 4.4244857|
+-------+-------+----------+
only showing top 20 rows

