In [1]:
from google.colab import files

In [2]:
!pip -q install kaggle pyspark pyarrow

In [3]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{\n  "username": "vadimsokol",\n  "key": "KGAT_1943f2f81f04e39172723dd31c2ac3c0"\n}\n'}

In [4]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [5]:
!kaggle datasets list | head


ref                                                                 title                                                     size  lastUpdated                 downloadCount  voteCount  usabilityRating  
------------------------------------------------------------------  --------------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
saidaminsaidaxmadov/chocolate-sales                                 Chocolate Sales                                         468320  2026-01-04 14:23:35.490000          10551        172  1.0              
jayjoshi37/customer-subscription-churn-and-usage-patterns           Customer Subscription Churn and Usage Patterns           34246  2026-01-27 13:53:52.857000            655         23  1.0              
vishardmehta/indian-engineering-college-placement-dataset           Indian Engineering College Placement Dataset            137603  2026-01-24 15:23:40.150000           1622         44

In [None]:
DATASET = "mohamedbakhet/amazon-books-reviews"

!mkdir -p /content/data
!kaggle datasets download -d {DATASET} -p /content/data --unzip

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0


In [7]:
!ls -lah /content/data


total 2.9G
drwxr-xr-x 2 root root 4.0K Feb  1 11:32 .
drwxr-xr-x 1 root root 4.0K Feb  1 11:31 ..
-rw-r--r-- 1 root root 173M Feb  1 11:32 books_data.csv
-rw-r--r-- 1 root root 2.7G Feb  1 11:32 Books_rating.csv


In [8]:
DATA_DIR = "/content/data"
RATINGS_CSV = f"{DATA_DIR}/Books_rating.csv"
BOOKS_CSV = f"{DATA_DIR}/books_data.csv"


USE_FULL_DATA = False # just to save runtime
# 5 percent sample,
# TODO: try 10% later
SAMPLE_FRAC = 0.05
SEED = 12345




In [9]:
# Start the SPark session

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("amd-recsys-books")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")


In [10]:
# Load book ratings into Spark:
from pyspark.sql import functions as F

ratings = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(RATINGS_CSV)
)

ratings.printSchema()
ratings.show(5, truncate=False)


root
 |-- Id: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- User_id: string (nullable = true)
 |-- profileName: string (nullable = true)
 |-- review/helpfulness: string (nullable = true)
 |-- review/score: string (nullable = true)
 |-- review/time: string (nullable = true)
 |-- review/summary: string (nullable = true)
 |-- review/text: string (nullable = true)

+----------+------------------------------+-----+--------------+--------------------------------------+------------------+------------+-----------+-----------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
# check the columns to understand what's inside

cols = {c.lower(): c for c in ratings.columns}
cols

{'id': 'Id',
 'title': 'Title',
 'price': 'Price',
 'user_id': 'User_id',
 'profilename': 'profileName',
 'review/helpfulness': 'review/helpfulness',
 'review/score': 'review/score',
 'review/time': 'review/time',
 'review/summary': 'review/summary',
 'review/text': 'review/text'}

In [16]:
# Normalize and select core columns
from pyspark.sql import functions as functions

score_string = functions.regexp_extract(functions.col("review/score"), r"([0-9]+(\.[0-9]+)?)", 1)

ratings_0 = (
    ratings
    .select(
        functions.col("User_id").alias("user_id"),
        functions.col("Id").alias("book_id"),
        # safe numeric extraction for odd strings
        functions.when(score_string != "", score_string.cast("double")).otherwise(F.lit(None).cast("double")).alias("rating"),
        functions.col("Title").alias("title"),
    )
    .dropna(subset=["user_id", "book_id", "rating"])
)

ratings_0.select("rating").describe().show()
ratings_0.show(3, truncate=False)


+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|           2433202|
|   mean|2029.6013891366192|
| stddev|1580342.9802411501|
|    min|               0.0|
|    max|        1.295568E9|
+-------+------------------+

+--------------+----------+------+------------------------------+
|user_id       |book_id   |rating|title                         |
+--------------+----------+------+------------------------------+
|AVCGYZL8FQQTD |1882931173|4.0   |Its Only Art If Its Well Hung!|
|A30TK6U7DNS82R|0826414346|5.0   |Dr. Seuss: American Icon      |
|A3UH4UZ4RSVO82|0826414346|5.0   |Dr. Seuss: American Icon      |
+--------------+----------+------+------------------------------+
only showing top 3 rows


In [17]:
# clear bad formed data for ratings, first got 10^8 score

# keep only realistic star ratings
ratings_1 = ratings_0.filter((F.col("rating") >= 1.0) & (F.col("rating") <= 5.0))

ratings_1.select("rating").describe().show()


+-------+-----------------+
|summary|           rating|
+-------+-----------------+
|  count|          2426292|
|   mean|4.222532572336718|
| stddev|1.183760589211074|
|    min|              1.0|
|    max|              5.0|
+-------+-----------------+



In [18]:
ratings.select("review/score").show(5, truncate=False)
ratings.select("review/helpfulness").show(5, truncate=False)


+------------+
|review/score|
+------------+
|4.0         |
|5.0         |
|5.0         |
|4.0         |
|4.0         |
+------------+
only showing top 5 rows
+------------------+
|review/helpfulness|
+------------------+
|7/7               |
|10/10             |
|10/11             |
|7/7               |
|3/3               |
+------------------+
only showing top 5 rows


In [23]:
# Sample and reduce sparcity, for speed
USE_FULL_DATA = False
SAMPLE_FRAC = 0.10
SEED = 12345

ratings = ratings_1
if not USE_FULL_DATA:
    ratings = ratings.sample(False, SAMPLE_FRAC, seed=SEED)

MIN_USER_RATINGS = 5
MIN_BOOK_RATINGS = 5

user_count = ratings.groupBy("user_id").count().withColumnRenamed("count", "user_count")
book_count = ratings.groupBy("book_id").count().withColumnRenamed("count", "book_count")

ratings = (
    ratings.join(user_count, "user_id")
     .join(book_count, "book_id")
     .filter((functions.col("user_count") >= MIN_USER_RATINGS) & (functions.col("book_count") >= MIN_BOOK_RATINGS))
     .select("user_id", "book_id", "rating", "title")
)

print("rows:", ratings.count())
print("users:", ratings.select("user_id").distinct().count())
print("books:", ratings.select("book_id").distinct().count())

ratings.cache()


rows: 25138
users: 3636
books: 6259


DataFrame[user_id: string, book_id: string, rating: double, title: string]

In [24]:
from pyspark.sql import functions as F

ratings.groupBy("book_id").count().orderBy(F.col("count").desc()).show(20)
ratings.groupBy("user_id").count().orderBy(F.col("count").desc()).show(20)


+----------+-----+
|   book_id|count|
+----------+-----+
|B000NWQXBA|   49|
|B000Q032UY|   46|
|0141804459|   44|
|1566190932|   42|
|B000PMCF1A|   42|
|1593355548|   40|
|B000PC54NG|   40|
|B000EVI8O0|   40|
|0460872702|   40|
|B000F6H01Q|   39|
|B000NWU3I4|   39|
|1901768945|   38|
|1844560333|   38|
|0786135034|   38|
|8188280046|   38|
|B0007C10MS|   37|
|B000GQG5MA|   37|
|B000ILIJE0|   37|
|B000FFQ85G|   36|
|B000NOX190|   35|
+----------+-----+
only showing top 20 rows
+--------------+-----+
|       user_id|count|
+--------------+-----+
|A1D2C0WDCSHUWZ|  271|
|A20EEWWSFMZ1PN|   96|
|   AFVQZQ8PW0L|   90|
|A1X8VZWTOG8IS6|   87|
|A14OJS0VWMOSWO|   79|
|A1K1JW1C5CUSUZ|   71|
| AHD101501WCN1|   67|
|A1G37DFO8MQW0M|   64|
|A1N1YEMTI9DJ86|   64|
|A3QVAKVRAH657N|   58|
|A1T17LMQABMBN5|   55|
|A1EKTLUL24HDG8|   54|
|A3LKWMM12AF0PU|   54|
|A3O2RCKAMSE9X7|   52|
| AHXAPVSHPJ6OJ|   52|
|A1L43KWWR05PCS|   51|
|A2F6N60Z96CAJI|   50|
| AJQ1S39GZBKUG|   50|
|A22DUZU3XVA8HA|   49|
|A3OH101U0CPU

In [25]:
# Baseline recommender by popularity
# Check it to compare later

MIN_REVIEWS_FOR_POPULARITY = 10

popular = (
    ratings.groupBy("book_id", "title")
     .agg(functions.count("*").alias("n"), functions.avg("rating").alias("average_rating"))
     .filter(functions.col("n") >= MIN_REVIEWS_FOR_POPULARITY)
     .orderBy(functions.col("average_rating").desc(), functions.col("n").desc())
)

popular.show(20, truncate=False)


+----------+-----------------------------------------------------------------------------------------------------------------+---+-----------------+
|book_id   |title                                                                                                            |n  |average_rating   |
+----------+-----------------------------------------------------------------------------------------------------------------+---+-----------------+
|9626346825|A Christmas Carol (Classic Fiction)                                                                              |20 |5.0              |
|B000I1VJLA|The Lord of the Rings Box Set                                                                                    |18 |5.0              |
|B000TNGU5M|Night                                                                                                            |10 |5.0              |
|B000NOXDUC|The Rise and Fall of the Third Reich - 2 Volumes Eastern Press Leather (A History of Nazi Germ

In [26]:
# Train test split
train_df, test_df = ratings.randomSplit([0.8, 0.2], seed=SEED)

train_df = train_df.cache()
test_df  = test_df.cache()

print("train:", train_df.count(), "test:", test_df.count())


train: 20078 test: 5060
