In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode

In [2]:
# Read data
spark = SparkSession.builder.appName("NMF Amazon-data recommenders").getOrCreate()
product_rating = spark.read.csv("./data/amazon.csv", header=True, inferSchema=True)

product_rating.show()

+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|        product_name|            category|discounted_price|actual_price|discount_percentage|rating|rating_count|        about_product|             user_id|           user_name|           review_id|        review_title|      review_content|            img_link|        product_link|
+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B07JW9H4J1|Wayona Nylon Brai...|Computers&Accesso...|            ₹399|      ₹1,099|                64%|   4.2|      2

In [3]:
columns_to_drop = ["category", "discounted_price", "actual_price", "discount_percentage", "rating_count","about_product", "review_id", "review_title", "review_content", "img_link", "product_link"]
merged_df = product_rating.drop(*columns_to_drop)
merged_df.show()

+----------+--------------------+------+--------------------+--------------------+
|product_id|        product_name|rating|             user_id|           user_name|
+----------+--------------------+------+--------------------+--------------------+
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|Manav,Adarsh gupt...|
|B098NS6PVG|Ambrane Unbreakab...|     4|AECPFYFQVRUWC3KGN...|ArdKn,Nirbhay kum...|
|B096MSW6CT|Sounce Fast Phone...|   3.9|AGU3BBQ2V2DDAMOAK...|Kunal,Himanshu,vi...|
|B08HDJ86NZ|boAt Deuce USB 30...|   4.2|AEWAZDZZJLQUYVOVG...|Omkar dhale,JD,HE...|
|B08CF3B7N1|Portronics Konnec...|   4.2|AE3Q6KSUK5P75D5HF...|rahuls6099,Swasat...|
|B08Y1TFSP6|pTron Solero TB30...|   3.9|AEQ2YMXSZWEOHK2EH...|Jayesh,Rajesh k.,...|
|B08WRWPM22|boAt Micro USB 55...|   4.1|AG7C6DAADCTRQJG2B...|Vivek kumar,Amazo...|
|B08DDRGWTJ|MI Usb Type-C Cab...|   4.3|AHW6E5LQ2BDYOIVLA...|Pavan A H,Jayesh ...|
|B008IFXQFU|TP-Link USB WiFi ...|   4.2|AGV3IEFANZCKECFGU...|Azhar JuMan,Aniru...|
|B08

In [4]:
merged_df = merged_df.withColumn("user_id", explode(split(col("user_id"), ","))) \
                     .withColumn("user_name", explode(split(col("user_name"), ",")))

In [5]:
merged_df.show()

+----------+--------------------+------+--------------------+--------------+
|product_id|        product_name|rating|             user_id|     user_name|
+----------+--------------------+------+--------------------+--------------+
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|         Manav|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|  Adarsh gupta|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|       Sundeep|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|S.Sayeed Ahmed|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|jaspreet singh|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|    Khaja moin|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|         Anand|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|    S.ARUMUGAM|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AHMY5CWJMMK5BJRBB...|         Manav|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AHMY5CWJMMK5BJRBB...|  Adarsh gupta|

In [6]:
merged_df = merged_df.na.drop(subset=["rating"])

In [7]:
# check null for all cols:
from pyspark.sql.functions import *
merged_df.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in merged_df.columns]
    ).show()

merged_df.show(5)
merged_df.printSchema()

+----------+------------+------+-------+---------+
|product_id|product_name|rating|user_id|user_name|
+----------+------------+------+-------+---------+
|         0|           0|     0|      0|        0|
+----------+------------+------+-------+---------+

+----------+--------------------+------+--------------------+--------------+
|product_id|        product_name|rating|             user_id|     user_name|
+----------+--------------------+------+--------------------+--------------+
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|         Manav|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|  Adarsh gupta|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|       Sundeep|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|S.Sayeed Ahmed|
|B07JW9H4J1|Wayona Nylon Brai...|   4.2|AG3D6O4STAQKAY2UV...|jaspreet singh|
+----------+--------------------+------+--------------------+--------------+
only showing top 5 rows

root
 |-- product_id: stri

In [8]:
from pyspark.sql.types import FloatType
merged_df = merged_df.withColumn("rating", col("rating").cast(FloatType()))

In [9]:
# Calculate the mean rating
mean_rating = merged_df.select(mean("rating")).collect()[0][0]

In [10]:
# Fill NaN values with the mean rating
merged_df = merged_df.na.fill(mean_rating, subset=["rating"])

In [11]:
df_train, df_test = merged_df.randomSplit([0.7, 0.3], seed = 96)

In [12]:
df_train_pandas = df_train.toPandas()
df_test_pandas = df_test.toPandas()

from surprise import Reader
reader = Reader(rating_scale=(1, 5))

from surprise import Dataset
data_train = Dataset.load_from_df(df_train_pandas[['user_id', 'product_id', 'rating']], reader)
data_test = Dataset.load_from_df(df_test_pandas[['user_id', 'product_id', 'rating']], reader)

trainset = data_train.build_full_trainset()
testset = data_test.build_full_trainset().build_testset()
from surprise import NMF
algo = NMF(n_epochs=32)
algo.fit(trainset)
knn_predictions = algo.test(testset)

In [13]:
from collections import defaultdict
def get_top_n(predictions, n):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
        
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

top_n = get_top_n(knn_predictions, n=8)
print(top_n)

defaultdict(<class 'list'>, {'AEHQQTEDMSXRGSBDDEIH3JF4AOMQ': [('B002PD61Y4', 3.50145856127323), ('B002PD61Y4', 3.50145856127323), ('B002PD61Y4', 3.50145856127323), ('B002PD61Y4', 3.50145856127323), ('B002PD61Y4', 3.50145856127323)], 'AETRIARSUFSMNG5LFJZMW6CBJMMQ': [('B002PD61Y4', 3.4979545708443522), ('B002PD61Y4', 3.4979545708443522), ('B002PD61Y4', 3.4979545708443522), ('B002PD61Y4', 3.4979545708443522), ('B002PD61Y4', 3.4979545708443522)], 'AEXAFY7V2ZRZI2GD2J6KDOWBZUBQ': [('B002PD61Y4', 3.5010107566283457), ('B002PD61Y4', 3.5010107566283457), ('B002PD61Y4', 3.5010107566283457), ('B002PD61Y4', 3.5010107566283457), ('B002PD61Y4', 3.5010107566283457)], 'AG7QMBEFFY2LJJKKEVWMJU2BMNRQ': [('B002PD61Y4', 3.505128395515185), ('B002PD61Y4', 3.505128395515185), ('B002PD61Y4', 3.505128395515185), ('B002PD61Y4', 3.505128395515185)], 'AGA2PZGWMQIRA46VYOTICFE7KCBA': [('B002PD61Y4', 3.500038110883674), ('B002PD61Y4', 3.500038110883674), ('B002PD61Y4', 3.500038110883674), ('B002PD61Y4', 3.5000381108

In [14]:
def get_top_n_product_titles(user_id, top_n, df):
    recommended_product_ids = [item[0] for item in top_n.get(user_id, [])]
    product_titles = []

    for product_id in recommended_product_ids:
        product_row = df.filter(df.product_id == product_id).first()
        if product_row is not None:
            product_titles.append(product_row['product_name'])
        else:
            product_titles.append(None)

    return product_titles

In [15]:
# Assuming `user_id` is the ID of the user you're interested in
user_id = 'AHJDB2E42D2O4IUV5IV5HDN75O3Q'
recommended_titles = get_top_n_product_titles(user_id, top_n, merged_df)

# Use a set to keep track of seen product titles
seen_titles = set()

for idx, title in enumerate(recommended_titles, start=1):
    if title is not None and title not in seen_titles:
        print(f"Recommendation {idx}: {title}")
        seen_titles.add(title)
    elif title is None:
        print(f"Recommendation {idx}: No product found")


Recommendation 1: PHILIPS Drip Coffee Maker HD7432/20, 0.6 L, Ideal for 2-7 cups, Black, Medium


In [16]:
# import json
# json.dumps(recommended_titles)

In [17]:
from surprise import accuracy

# Calculate RMSE (Root Mean Squared Error)
svd_rmse = accuracy.rmse(knn_predictions)

# Calculate MAE (Mean Absolute Error)
svd_mae = accuracy.mae(knn_predictions)

# Calculate R-squared
svd_rsquared = accuracy.mse(knn_predictions)

RMSE: 0.2494
MAE:  0.1958
MSE: 0.0622
