#### Q1: Demonstrate how to load a dataset suitable for recommendation systems into a PySpark DataFrame.

In [1]:
# findspark has to run before pyspark is imported: its job is to locate the
# Spark installation and make the pyspark package importable.
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()

# Upper bound on the number of ratings used in this notebook.
MAX_RATINGS = 10000

# Keep only the three columns needed for collaborative filtering and cap the
# sample with limit(), which stays inside Spark instead of collecting the rows
# to the driver and rebuilding a DataFrame from them. Only the sample that is
# reused by later cells is cached.
ratings = (
    spark.read.json("movies 1.json")
    .select("user_id", "product_id", "score")
    .limit(MAX_RATINGS)
    .cache()
)

ratings.show(5)



+--------------+----------+-----+
|       user_id|product_id|score|
+--------------+----------+-----+
|A141HP4LYPWMSR|B003AI2VGA|  3.0|
|A328S9RN3U5M68|B003AI2VGA|  3.0|
|A1I7QGUDP043DG|B003AI2VGA|  5.0|
|A1M5405JH9THP9|B003AI2VGA|  3.0|
| ATXL536YX71TR|B003AI2VGA|  3.0|
+--------------+----------+-----+
only showing top 5 rows



#### Q2: Implement a PySpark script that splits the data and trains a recommendation model.

In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Fixed seed shared by the train/validation split and by ALS so that re-running
# the cell on the same data is intended to give the same split and model.
SEED = 42

# ALS is given the "<column>_index" columns, so each string id column is mapped
# to a numeric index column by a StringIndexer. The indexers are left unfitted
# here; Pipeline.fit() below fits each of them exactly once.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in ["user_id", "product_id"]
]

pipeline = Pipeline(stages=indexers)
ratings_indexed = pipeline.fit(ratings).transform(ratings)

# 80/20 split (randomSplit normalises the weights).
training_data, validation_data = ratings_indexed.randomSplit([8.0, 2.0], seed=SEED)

# NOTE(review): the predictions recorded in this notebook's output fall far
# outside the range of the displayed scores, which suggests that rank/regParam
# need tuning -- confirm before relying on the model.
als = ALS(
    userCol="user_id_index",
    itemCol="product_id_index",
    ratingCol="score",
    rank=10,
    maxIter=5,
    regParam=0.01,
    coldStartStrategy="drop",
    seed=SEED,
)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="score", predictionCol="prediction")

model = als.fit(training_data)
predictions = model.transform(validation_data)
predictions.show(10, False)

+--------------+----------+-----+-------------+----------------+-----------+
|user_id       |product_id|score|user_id_index|product_id_index|prediction |
+--------------+----------+-----+-------------+----------------+-----------+
|A2FRKEXDXDN1KI|B000063W1R|4.0  |31.0         |7.0             |10.497304  |
|ADX5JX5LKLC22 |B000063W1R|5.0  |580.0        |7.0             |4.989737   |
|A2FEGRJQNU51P9|B000063W1R|4.0  |384.0        |7.0             |-1.6689513 |
|A328S9RN3U5M68|B003AI2VGA|3.0  |6.0          |144.0           |-23.98982  |
|A13TO1ZFAH9SVN|B000063W1R|5.0  |235.0        |7.0             |-3.2107456 |
|A1J03J0HZ7KU5T|B008FPU7AA|2.0  |127.0        |112.0           |-0.73620385|
|A3OI841P5R6FCH|B000063W1R|4.0  |523.0        |7.0             |1.8565565  |
|AQ01Q3070LT29 |B000063W1R|1.0  |38.0         |7.0             |3.3005426  |
|A27RJ30RN5K9MX|B000063W1R|5.0  |145.0        |7.0             |-3.7193491 |
|A9Q28YTLYREO7 |B004BH1TN0|4.0  |200.0        |127.0           |0.9626541  |
+--------------+----------+-----+-------------+----------------+-----------+

#### Q3: Implement a PySpark script using the ALS algorithm for collaborative filtering.

In [3]:
# Validation rows that belong to the user whose indexed id is 1.0, reduced to
# the id columns.
user1 = validation_data.where(validation_data.user_id_index == 1.0).select(
    "product_id", "user_id", "user_id_index", "product_id_index"
)
user1.show()

# Score those user/product pairs with the trained model and list them from the
# highest predicted score to the lowest.
recommendations = model.transform(user1)
recommendations.sort(recommendations["prediction"].desc()).show()

+----------+-------------+-------------+----------------+
|product_id|      user_id|user_id_index|product_id_index|
+----------+-------------+-------------+----------------+
|B0001EYSQC|ANCOMAI0I7LVG|          1.0|            31.0|
|B00004RXMK|ANCOMAI0I7LVG|          1.0|            62.0|
|B001AQT0VI|ANCOMAI0I7LVG|          1.0|            15.0|
|B005UYF7KY|ANCOMAI0I7LVG|          1.0|            50.0|
+----------+-------------+-------------+----------------+

+----------+-------------+-------------+----------------+-----------+
|product_id|      user_id|user_id_index|product_id_index| prediction|
+----------+-------------+-------------+----------------+-----------+
|B00004RXMK|ANCOMAI0I7LVG|          1.0|            62.0|   8.902909|
|B0001EYSQC|ANCOMAI0I7LVG|          1.0|            31.0|-0.68762636|
|B001AQT0VI|ANCOMAI0I7LVG|          1.0|            15.0| -10.798508|
|B005UYF7KY|ANCOMAI0I7LVG|          1.0|            50.0|  -14.21423|
+----------+-------------+-------------+----------------+-----------+

#### Q4: Implement code to evaluate the performance of the recommendation model using appropriate metrics.

In [4]:
# Second evaluator on the same label/prediction columns, reporting mean
# absolute error alongside the RMSE evaluator created earlier.
evaluator_mae = RegressionEvaluator(
    labelCol="score", predictionCol="prediction"
).setMetricName("mae")

# Root mean squared error of the validation predictions.
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")

# Mean absolute error of the same predictions.
mae = evaluator_mae.evaluate(predictions)
print(f"Mean Absolute Error (MAE) = {mae}")

Root Mean Squared Error (RMSE) = 5.7522237109984875
Mean Absolute Error (MAE) = 4.264309283829248
