In [1]:
# Step 1: Install PySpark and Set Up in Colab
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=9fb2b5b56c44463aafdf29f91da13479c7b291f8e7685cb30bc987d6ccaed566
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Step 1: Initialize Spark Session
spark = SparkSession.builder.appName('BookRecommendationSystem').getOrCreate()

# Step 2: Load Dataset
# Assuming the CSV file is loaded as a DataFrame
df = spark.read.csv("book_ratings.csv", header=True, inferSchema=True)

# Check if the columns are correct
df.printSchema()

# Step 3: Train ALS Model (No need to rename columns, use 'user_id', 'book_id', 'rating')
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="user_id",
    itemCol="book_id",
    ratingCol="rating",
    coldStartStrategy="drop"
)

# Step 4: Fit the model
model = als.fit(df)

# Step 5: Evaluate the Model (using RMSE)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

# Split data into training and test sets for evaluation
train, test = df.randomSplit([0.8, 0.2])
predictions = model.transform(test)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Square Error (RMSE): {rmse}")

# Step 6: Show predictions for a specific user (User ID = 53)
user53_df = df.filter(df['user_id'] == 53)
user53_predictions = model.transform(user53_df)
user53_predictions.orderBy("prediction", ascending=False).show(truncate=False)

# Step 7: Show 5 recommended books for all users
user_recommendations = model.recommendForAllUsers(5)
user_recommendations.show(truncate=False)

# Step 8: Show 5 recommended users for all books
book_recommendations = model.recommendForAllItems(5)
book_recommendations.show(truncate=False)

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)

Root Mean Square Error (RMSE): 0.5955300490198375
+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|8946   |53     |5     |4.3580976 |
|8882   |53     |2     |2.0867662 |
|8336   |53     |1     |1.2003254 |
|8336   |53     |1     |1.2003254 |
+-------+-------+------+----------+

+-------+-----------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                |
+-------+-----------------------------------------------------------------------------------------------+
|1      |[{9842, 4.268772}, {4344, 4.1868205}, {5701, 4.132938}, {5545, 4.1296}, {8028, 4.054927}]      |
|3      |[{7832, 1.0985683}, {7254, 1.0868514}, {6809, 1.0866787}, {4541, 1.081345}, {9566, 1.0794958}] |
|5    