In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=818ad034be3b67a54ded34d7bdc8c990e17eb10d3d2abfcbafc163e3373a1716
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
from pyspark.sql.functions import *
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder.master('local[*]').appName('ratingbooks').getOrCreate()

In [3]:
rate_df = spark.read.options(sep=';', header=True, inferSchema=True).csv('/content/Book-Ratings.csv')
book_df = spark.read.options(sep=';', header=True, inferSchema=True).csv('/content/Books.csv')
user_df = spark.read.options(sep=';', header=True, inferSchema=True).csv('/content/Users.csv')

In [14]:
rate_df.show(10)

+------+----------+----+
|userid|      isbn|rate|
+------+----------+----+
|276725|034545104X|   0|
|276726|0155061224|   5|
|276727|0446520802|   0|
|276729|052165615X|   3|
|276729|0521795028|   6|
|276733|2080674722|   0|
|276736|3257224281|   8|
|276737|0600570967|   6|
|276744|038550120X|   7|
|276745| 342310538|  10|
+------+----------+----+
only showing top 10 rows



In [21]:
rate_df.take(5)

[Row(userid=276725, isbn='034545104X', rate=0),
 Row(userid=276726, isbn='0155061224', rate=5),
 Row(userid=276727, isbn='0446520802', rate=0),
 Row(userid=276729, isbn='052165615X', rate=3),
 Row(userid=276729, isbn='0521795028', rate=6)]

In [20]:
book_df.take(5)

[Row(ISBN='0195153448', BookTitle='Classical Mythology', BookAuthor='Mark P. O. Morford', YearOfPublication=2002, Publisher='Oxford University Press', ImageURLS='http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg', ImageURLM='http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg', ImageURLL='http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg'),
 Row(ISBN='0002005018', BookTitle='Clara Callan', BookAuthor='Richard Bruce Wright', YearOfPublication=2001, Publisher='HarperFlamingo Canada', ImageURLS='http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg', ImageURLM='http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg', ImageURLL='http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg'),
 Row(ISBN='0060973129', BookTitle='Decision in Normandy', BookAuthor="Carlo D'Este", YearOfPublication=1991, Publisher='HarperPerennial', ImageURLS='http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg', ImageURLM='http://images.amazon.com/images/P/00

In [19]:
user_df.take(5)



[Row(UserID=1, USERNAME='bzsufoRTLN2', Location='nyc, new york, usa', Age='NULL'),
 Row(UserID=2, USERNAME='fq7kfHg4VEI', Location='stockton, california, usa', Age='18'),
 Row(UserID=3, USERNAME='W0Hbkd3xR8v', Location='moscow, yukon territory, russia', Age='NULL'),
 Row(UserID=4, USERNAME='W51GahAx5Ap', Location='porto, v.n.gaia, portugal', Age='17'),
 Row(UserID=5, USERNAME='VKN3PQ18GgN', Location='farnborough, hants, united kingdom', Age='NULL')]

In [10]:
user_df = user_df.withColumnRenamed("UserID", "user_id")
book_df = book_df.withColumnRenamed("ISBN", "book_isbn")
rate_df = rate_df.withColumnRenamed("userid", "user_id").withColumnRenamed("isbn", "book_isbn")


In [13]:
joined_df = rate_df.join(user_df, "user_id", "inner") \
                     .join(book_df, "book_isbn", "inner")

In [14]:
avg_rating_df = joined_df.groupBy("book_isbn").agg(avg("rate").alias("book_avg_rate"))

In [15]:
final_df = joined_df.join(avg_rating_df, "book_isbn", "inner")

In [17]:
result_df = final_df.select("USERNAME", "BookTitle", "rate", "book_avg_rate")

In [18]:
result_df.show()

+-----------+--------------------+----+-------------+
|   USERNAME|           BookTitle|rate|book_avg_rate|
+-----------+--------------------+----+-------------+
|6chdqlR3DC7|The Way Things Wo...|   8|          8.0|
|px70uymJ7k6|     Mog's Christmas|   0|          0.0|
|mjteD2ip2Lj|     Mog's Christmas|   0|          0.0|
|cHwJip4Kj4k|                Liar|   9|          9.0|
|6VUiynjA3tV|The Prime of Miss...|   0|          0.0|
|cHwJip4Kj4k|    The Fighting Man|   9|          9.0|
|a0EEWhgtsW8|  First Among Equals|   0|          0.0|
|Tupz6KKVgIq|    Matter Of Honour|   0|          0.0|
|OMqCFWvTBPp|           Kidnapped|   0|          0.0|
|EM5BvtuvZ91|     Brave New World|   9|          9.0|
|cHwJip4Kj4k|     Brave New World|   9|          9.0|
|cHwJip4Kj4k|Nothing Can Be Be...|   0|          0.0|
|SyGcdQu7P4o|        Dark Spectre|   0|          0.0|
|e8Hb8GyY8yN| Pearl and Sir Orfeo|   5|          5.0|
|krizZUAK9f9|Cereus Blooms At ...|   8|          8.0|
|LCv9LN0AQaf|CHESS FOR YOUNG