In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.ml.recommendation import ALS 
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

## Q1

In [19]:
spark = SparkSession.builder.appName('Recommender').getOrCreate() 

In [20]:
df = spark.read.json('movies.json')

df.show(5,10)

+-----------+----------+------------+----------+-----+----------+----------+----------+
|helpfulness|product_id|profile_name|    review|score|   summary|      time|   user_id|
+-----------+----------+------------+----------+-----+----------+----------+----------+
|        7/7|B003AI2VGA|  Brian E...|Synopsi...|  3.0|"There ...|1182729600|A141HP4...|
|        4/4|B003AI2VGA|  Grady Harp|THE VIR...|  3.0|Worthwh...|1181952000|A328S9R...|
|       8/10|B003AI2VGA|  Chrissy...|The sce...|  5.0|This mo...|1164844800|A1I7QGU...|
|        1/1|B003AI2VGA|  golgoth...|THE VIR...|  3.0|distant...|1197158400|A1M5405...|
|        1/1|B003AI2VGA|  KerrLin...|Informa...|  3.0|"What's...|1188345600|ATXL536...|
+-----------+----------+------------+----------+-----+----------+----------+----------+
only showing top 5 rows



In [21]:
df = df.select("user_id", "product_id", "score")
df.show()

+--------------+----------+-----+
|       user_id|product_id|score|
+--------------+----------+-----+
|A141HP4LYPWMSR|B003AI2VGA|  3.0|
|A328S9RN3U5M68|B003AI2VGA|  3.0|
|A1I7QGUDP043DG|B003AI2VGA|  5.0|
|A1M5405JH9THP9|B003AI2VGA|  3.0|
| ATXL536YX71TR|B003AI2VGA|  3.0|
|A3QYDL5CDNYN66|B003AI2VGA|  2.0|
| AQJVNDW6YZFQS|B003AI2VGA|  1.0|
| AD4CDZK7D31XP|B00006HAXW|  5.0|
|A3Q4S5DFVPB70D|B00006HAXW|  5.0|
|A2P7UB02HAVEPB|B00006HAXW|  5.0|
|A2TX99AZKDK0V7|B00006HAXW|  4.0|
| AFC8IKR407HSK|B00006HAXW|  5.0|
|A1FRPGQYQTAOR1|B00006HAXW|  5.0|
|A1RSDE90N6RSZF|B00006HAXW|  5.0|
|A1OUBOGB5970AO|B00006HAXW|  4.0|
|A3NPHQVIY59Y0Y|B00006HAXW|  5.0|
| AFKMBAY28XO8A|B00006HAXW|  5.0|
| A66KMXH9V7OGU|B00006HAXW|  5.0|
| AFJ27ZV9183B8|B00006HAXW|  5.0|
| AXMKAXC0TR9AW|B00006HAXW|  5.0|
+--------------+----------+-----+
only showing top 20 rows



In [22]:
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_1", handleInvalid="keep")
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_1", handleInvalid="keep")

als = ALS(maxIter=5, regParam=0.01, userCol="user_1", itemCol="product_1", ratingCol="score")

pipeline = Pipeline(stages=[user_indexer, product_indexer, als])
                                                                        
#splitting data
train_data, test_data = df.randomSplit([0.8, 0.2]) 
  
#Fitting the model 
model = pipeline.fit(train_data)

In [23]:
# Make predictions on the test set
predictions = model.transform(test_data)

predictions = predictions.dropna(subset=["prediction"])

# Show the predictions
predictions.select("user_id", "product_id", "prediction").show()

+--------------+----------+-----------+
|       user_id|product_id| prediction|
+--------------+----------+-----------+
|A1EXJJQMOWNAUY|B00005LKL6|  1.8641249|
|A1MD6MXBHCRC0V|B000067J2J| 0.14754686|
|A1UWNII9AWMZKG|B00006JU7U|  -8.991828|
|A1VCLTAGM5RLND|B000ID1Q02| -21.448586|
|A1VCLTAGM5RLND|B000SVZIJO|  5.6296945|
|A244CRJ2QSVLZ4|6304675771|  1.7157378|
|A244CRJ2QSVLZ4|B000067J2K|  0.8544126|
|A244CRJ2QSVLZ4|B0002J4ZW8| -0.5896113|
|A27PSZX2SE0B51|B00005B7DN|  3.0089607|
|A27PSZX2SE0B51|B000UGBOT0|  3.5650785|
|A27PSZX2SE0B51|B00151QYU8|    4.92058|
|A2J6MMNWUJQUXS|B004HRDMHK|  4.0839305|
|A2NUP9HUH7ZKZW|B00005LL26|   5.084874|
|A2R9J5LULVKF6T|B000GETWF4|  0.9182242|
|A2U4E28ACJ2QPV|B000EQ5V86| 0.92266953|
|A37FUJC2L7DSPT|B00005U8ER| -4.6460977|
|A3E2UG7HM83I40|B00005O439|  5.3166056|
|A3MWMO2IOQDWBJ|B00015HX90|  4.1200333|
|A3TAX19LPEUVEO|B001ILFUD2|  4.0006857|
| AEE3ZMRRP0U4G|B005LXWYKU|-0.15376663|
+--------------+----------+-----------+
only showing top 20 rows



In [24]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="score", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 4.752898619163251


In [25]:
spark.stop()

## Q2

In [26]:
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALSModel
from pyspark.context import SparkContext
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics


sc = SparkContext('local') 
spark = SparkSession(sc)




In [27]:
df1 = spark.read.json('movies.json')

df1.show(5,10)

+-----------+----------+------------+----------+-----+----------+----------+----------+
|helpfulness|product_id|profile_name|    review|score|   summary|      time|   user_id|
+-----------+----------+------------+----------+-----+----------+----------+----------+
|        7/7|B003AI2VGA|  Brian E...|Synopsi...|  3.0|"There ...|1182729600|A141HP4...|
|        4/4|B003AI2VGA|  Grady Harp|THE VIR...|  3.0|Worthwh...|1181952000|A328S9R...|
|       8/10|B003AI2VGA|  Chrissy...|The sce...|  5.0|This mo...|1164844800|A1I7QGU...|
|        1/1|B003AI2VGA|  golgoth...|THE VIR...|  3.0|distant...|1197158400|A1M5405...|
|        1/1|B003AI2VGA|  KerrLin...|Informa...|  3.0|"What's...|1188345600|ATXL536...|
+-----------+----------+------------+----------+-----+----------+----------+----------+
only showing top 5 rows



In [28]:
ratings = (df1.select(
        'user_id',
        'product_id',
        'score',
    )
).cache()

In [30]:
(training, test) = ratings.randomSplit([0.8, 0.2])

In [33]:
als = ALS(maxIter=2, regParam=0.01, 
          userCol="user_1", itemCol="product_1", ratingCol="score",
          coldStartStrategy="drop",
          implicitPrefs=True)

user_indexer = StringIndexer(inputCol="user_id", outputCol="user_1", handleInvalid="keep")
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_1", handleInvalid="keep")

pipeline = Pipeline(stages=[user_indexer, product_indexer, als])
model = pipeline.fit(training)

predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="score",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 4.119579268697591
