In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.estointernet.in/apache/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf /content/spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"
 
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


In [None]:
!curl -L http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz -o data.json.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2402k  100 2402k    0     0   561k      0  0:00:04  0:00:04 --:--:--  561k


In [None]:
# %fs ls "file:/databricks/driver"

In [None]:
# Load the gzipped review file (as JSON) into a Spark DataFrame.
df = spark.read.json(path='data.json.gz')

# Preview the first 20 rows with long cell values truncated (the show() defaults).
df.show(n=20, truncate=True)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [None]:
# Keep only the columns needed from here on.
df = df.select(['asin', 'overall', 'reviewText', 'reviewerID'])

# Map each distinct product id string to a numeric index stored in "itemId".
asin_indexer = StringIndexer(inputCol="asin", outputCol="itemId")
asin_index_model = asin_indexer.fit(df)
df = asin_index_model.transform(df)

# Same treatment for reviewer ids, stored in "userId".
reviewerID_indexer = StringIndexer(inputCol="reviewerID", outputCol="userId")
reviewer_index_model = reviewerID_indexer.fit(df)
df = reviewer_index_model.transform(df)

In [None]:
# Switch to the indexed id columns and expose the "overall" score under the
# name "rating"; the rename is done with an alias inside the projection.
df = df.select(
    'userId',
    'itemId',
    'reviewText',
    col('overall').alias('rating'),
)

df.show()

+------+------+--------------------+------+
|userId|itemId|          reviewText|rating|
+------+------+--------------------+------+
|  66.0| 703.0|Not much to write...|   5.0|
| 266.0| 703.0|The product does ...|   5.0|
| 395.0| 703.0|The primary job o...|   5.0|
|1048.0| 703.0|Nice windscreen p...|   5.0|
|1311.0| 703.0|This pop filter i...|   5.0|
|  51.0| 562.0|So good that I bo...|   5.0|
| 290.0| 562.0|I have used monst...|   5.0|
| 374.0| 562.0|I now use this ca...|   3.0|
|  13.0| 562.0|Perfect for my Ep...|   5.0|
| 183.0| 562.0|Monster makes the...|   5.0|
|   4.0| 562.0|Monster makes a w...|   5.0|
| 488.0| 704.0|I got it to have ...|   4.0|
| 699.0| 704.0|If you are not us...|   3.0|
|  49.0| 704.0|I love it, I used...|   5.0|
| 594.0| 704.0|I bought this to ...|   5.0|
| 317.0| 704.0|I bought this to ...|   2.0|
| 104.0| 455.0|This Fender cable...|   4.0|
| 250.0| 455.0|wanted it just on...|   5.0|
|   3.0| 455.0|I've been using t...|   5.0|
|  29.0| 455.0|Fender cords look

In [None]:
# Project the three modelling columns as integers. Leaving reviewText out of the
# projection is what removes it from the frame.
df = df.select(
    col('userId').cast('integer').alias('userId'),
    col('itemId').cast('integer').alias('itemId'),
    col('rating').cast('integer').alias('rating'),
)

df.show()

+------+------+------+
|userId|itemId|rating|
+------+------+------+
|    66|   703|     5|
|   266|   703|     5|
|   395|   703|     5|
|  1048|   703|     5|
|  1311|   703|     5|
|    51|   562|     5|
|   290|   562|     5|
|   374|   562|     3|
|    13|   562|     5|
|   183|   562|     5|
|     4|   562|     5|
|   488|   704|     4|
|   699|   704|     3|
|    49|   704|     5|
|   594|   704|     5|
|   317|   704|     2|
|   104|   455|     4|
|   250|   455|     5|
|     3|   455|     5|
|    29|   455|     5|
+------+------+------+
only showing top 20 rows



In [None]:
# Hold out 20% of the ratings for the final evaluation; the seed keeps the split repeatable.
(train, test) = df.randomSplit([0.8, 0.2], seed=1234)

# Explicit-feedback ALS recommender.
# - coldStartStrategy="drop" discards rows whose prediction is NaN (users/items
#   not seen during fitting) so that RMSE stays a finite number.
# - seed is set explicitly: without it the factor initialisation is not
#   guaranteed to repeat between sessions, so the selected model and the
#   reported RMSE could change from run to run.
als = ALS(
    userCol="userId",
    itemCol="itemId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
    seed=1234,
)

# Hyperparameter grid searched by cross-validation: 4 ranks x 3 regParams x 1 maxIter.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 10, 25, 50])
    .addGrid(als.regParam, [0.1, 0.01, 0.001])
    .addGrid(als.maxIter, [20])
    .build()
)

print(f"Num models to be tested: {len(param_grid)}")

# Score candidate models by RMSE between the "rating" and "prediction" columns.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

Num models to be tested: 12


In [None]:
# 5-fold cross-validation over every parameter combination in param_grid.
# The seed fixes how rows are assigned to folds, so the per-combination metrics
# and the selected best model are repeatable between runs.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=1234,
)

# Fit every candidate on the training split only; the test split stays unseen.
model = cv.fit(train)

# Keep the model that cross-validation selected as best.
best_model = model.bestModel

In [None]:
# Show the string form of the fitted cross-validator model.
print(str(model))

CrossValidatorModel_cc987197393c


In [None]:
# Show the Python class of the selected model.
print(type(best_model))

print("**Best Model**")

# The hyperparameters are read from the object returned by parent() on the
# underlying Java model; it is looked up once and reused for all three getters.
best_parent = best_model._java_obj.parent()

print("  Rank:", best_parent.getRank())
print("  MaxIter:", best_parent.getMaxIter())
print("  RegParam:", best_parent.getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 50
  MaxIter: 20
  RegParam: 0.1


In [None]:
# Generate predictions for the held-out split with the selected model,
# then score them with the evaluator and print the resulting metric.
test_predictions = best_model.transform(dataset=test)
RMSE = evaluator.evaluate(dataset=test_predictions)
print(RMSE)

1.0832932031755984


In [None]:
# Preview the first 20 prediction rows (the show() defaults, spelled out).
test_predictions.show(n=20, truncate=True)

+------+------+------+----------+
|userId|itemId|rating|prediction|
+------+------+------+----------+
|   650|   148|     3|  4.117756|
|  1181|   148|     5| 3.5417786|
|   835|   148|     5| 5.0145617|
|  1117|   148|     5| 4.2076006|
|  1164|   463|     5| 4.7153826|
|   391|   463|     5| 4.0762634|
|    25|   471|     4| 3.8581061|
|  1208|   496|     4| 3.1904829|
|   663|   496|     5| 4.9089046|
|    24|   496|     5|  5.194099|
|    61|   833|     5|  4.258645|
|  1128|   243|     5|  3.939932|
|   525|   243|     4| 1.2749126|
|   277|   392|     5| 4.3395176|
|   300|   540|     5|  3.255546|
|   164|   540|     4| 3.1406229|
|   967|   540|     5| 3.4332495|
|   675|   623|     2|  3.422467|
|    63|   623|     5|  4.987586|
|   383|   858|     4| 2.4083784|
+------+------+------+----------+
only showing top 20 rows

