In [281]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.mllib.util import Saveable
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator

In [121]:
spark = SparkSession.builder\
                    .appName("Recommend")\
                    .config("spark.driver.memory", "10g") \
                    .config("spark.executor.cores", "4")  \
                    .getOrCreate()

In [124]:
df = spark.read.csv('./data/user_item_rating.csv', header=True, inferSchema=True)

In [125]:
df.show(5)

+---+-------+----------+------+--------------------+-----------------+----+--------------------+--------------------+-----------------+
|_c0|user_id|      ISBN|rating|               title|           author|year|           publisher|             img_url|number_of_ratings|
+---+-------+----------+------+--------------------+-----------------+----+--------------------+--------------------+-----------------+
|  0| 277427|002542730X|    10|Politically Corre...|James Finn Garner|1994|John Wiley &amp; ...|http://images.ama...|               82|
|  1|   3363|002542730X|     0|Politically Corre...|James Finn Garner|1994|John Wiley &amp; ...|http://images.ama...|               82|
|  2|  11676|002542730X|     6|Politically Corre...|James Finn Garner|1994|John Wiley &amp; ...|http://images.ama...|               82|
|  3|  12538|002542730X|    10|Politically Corre...|James Finn Garner|1994|John Wiley &amp; ...|http://images.ama...|               82|
|  4|  13552|002542730X|     0|Politically Corre

In [126]:
df_1 = df['user_id','ISBN', 'rating']

In [127]:
df_1.show(3)

+-------+----------+------+
|user_id|      ISBN|rating|
+-------+----------+------+
| 277427|002542730X|    10|
|   3363|002542730X|     0|
|  11676|002542730X|     6|
+-------+----------+------+
only showing top 3 rows



In [128]:
indexer = StringIndexer(inputCol="ISBN", outputCol="item_id")
indexed_df = indexer.fit(df_1).transform(df_1)

In [129]:
indexed_df.show(4)

+-------+----------+------+-------+
|user_id|      ISBN|rating|item_id|
+-------+----------+------+-------+
| 277427|002542730X|    10|  167.0|
|   3363|002542730X|     0|  167.0|
|  11676|002542730X|     6|  167.0|
|  12538|002542730X|    10|  167.0|
+-------+----------+------+-------+
only showing top 4 rows



In [130]:
print("{} rows and {} columns".format(indexed_df.count(), len(indexed_df.columns)))

59850 rows and 4 columns


In [131]:
indexed_df.select("ISBN").distinct().count()

2144

In [132]:
indexed_df.select("item_id").distinct().count()

2144

In [133]:
from pyspark.sql.functions import max, min
indexed_df.agg(max("item_id")).collect()[0][0]

2143.0

In [134]:
indexed_df.agg(min("item_id")).collect()[0][0]


0.0

In [135]:
(training, test) = indexed_df.randomSplit([0.8, 0.2], 12)
training.show(4)
print(type(training))

+-------+----------+------+-------+
|user_id|      ISBN|rating|item_id|
+-------+----------+------+-------+
|    254|0060930535|     0|   40.0|
|    254|014028009X|     0|   76.0|
|    254|0140298479|     0|  225.0|
|    254|014100018X|     0|  354.0|
+-------+----------+------+-------+
only showing top 4 rows

<class 'pyspark.sql.dataframe.DataFrame'>


## Base Model

In [165]:
sc = spark.sparkContext
sc.setCheckpointDir('checkpoint')

In [166]:
base = ALS(maxIter=15, regParam=0.05, rank=10, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)
model = base.fit(training)

In [175]:
# Information about base model
print("Model rank: ", model.rank)
print("Max iterations: ", model._java_obj.parent().getMaxIter())
print("Regularization: ", model._java_obj.parent().getRegParam())

Model rank:  10
Max iterations:  15
Regularization:  0.05


In [169]:
# Defind evaluator as RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [176]:
# RMSE for training and test set
predictions_train = model.transform(training)
rmse_train = evaluator.evaluate(predictions_train)

predictions_test = model.transform(test)
rmse_test = evaluator.evaluate(predictions_test)

print("Training RMSE: {}".format(rmse_train))
print("Test RMSE: {}".format(rmse_test))

Training RMSE: 2.2000412469887647
Test RMSE: 3.994471876214622


In [177]:
predictions_test.show()

+-------+----------+------+-------+-----------+
|user_id|      ISBN|rating|item_id| prediction|
+-------+----------+------+-------+-----------+
|  15957|042516098X|     0|  171.0|  0.6356151|
|  69042|0440207622|     0|  486.0|   2.387381|
|  69042|0440211727|     0|    6.0|  1.2596714|
|  69042|0440222656|     0|    8.0| 0.23687434|
|  69042|0446359866|     0|  183.0|   1.249915|
|  69042|0451172817|     0|  249.0| 0.60790545|
|  69042|0451176464|     0|  363.0|  2.0440311|
|  69042|0515107476|     0| 1134.0|        0.0|
|  69042|0553250531|     0|  506.0| 0.39798325|
|  69042|055357230X|     0|  471.0|0.121275134|
|  69042|0553579606|     0|  177.0|  0.7368578|
|  69042|0743412028|     0|  251.0| 0.19793731|
| 178199|014028009X|    10|   76.0|  6.8059416|
| 178199|0312422156|    10|  671.0|   5.079855|
| 178199|0440241073|     7|   58.0|  4.1862593|
| 178199|0553572997|     0|   75.0|    1.24648|
| 178199|0743467523|     6|  687.0|   3.086973|
| 178199|0971880107|     0|    0.0| 0.85

### In base model:
Model rank:  10

Max iterations:  15

Regularizati: 0.05

Training RMSE: 2.2000412469887647

Test RMSE: 3.99447187621462205

## Tuning model

In this tuning model, we will use ParamGridBuilder with a set in setting param likes rank, maxIter, regParam to find best model in these setup

In [382]:
# Create ALS model
model = ALS(userCol="user_id", itemCol="item_id", ratingCol="rating",
            coldStartStrategy="drop", nonnegative=True)

In [383]:
# Defind evaluator as RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [384]:
# tuning model using ParamGridBuilder
param_grid = ParamGridBuilder()\
             .addGrid(model.rank, [25])\
             .addGrid(model.maxIter, [75])\
             .addGrid(model.regParam, [.2])\
             .build()

In [385]:
cv = CrossValidator(estimator=model,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=5, seed=12)

In [386]:
#training.cache()
indexed_df.cache()

DataFrame[user_id: int, ISBN: string, rating: int, item_id: double]

In [387]:
model = cv.fit(indexed_df)

In [388]:
best_model = model.bestModel

In [389]:
# Information about best model in paramGrid
print(best_model.rank)
print(best_model._java_obj.parent().getMaxIter())
print(best_model._java_obj.parent().getRegParam())

25
75
0.2


In [390]:
# Assuming 'model' is your ALS model and 'test_data' is your test dataset
predictions_train = model.transform(training)
rmse_train = evaluator.evaluate(predictions_train)

predictions_test = model.transform(test)
rmse_test = evaluator.evaluate(predictions_test)

whole_data = model.transform(indexed_df)
rmse_data = evaluator.evaluate(whole_data)

print("Training RMSE: {}".format(rmse_train))
print("Test RMSE: {}".format(rmse_test))
print("Whole data RMSE: {}".format(rmse_data))

Training RMSE: 1.9962611320572226
Test RMSE: 2.0279782768253387
Whole data RMSE: 2.0025643456156415


In [391]:
whole_data.agg(max("prediction"), min("prediction")).collect()

[Row(max(prediction)=11.84021282196045, min(prediction)=0.0)]

In [392]:
whole_data.show(20)

+-------+----------+------+-------+----------+
|user_id|      ISBN|rating|item_id|prediction|
+-------+----------+------+-------+----------+
| 268622|0060930535|     0|   40.0|  1.877364|
| 268622|0140067477|     0|  253.0| 1.3679516|
| 268622|014029628X|    10|  197.0| 5.5658565|
| 178199|0142001740|    10|   18.0| 5.9912796|
| 268622|0142001740|     0|   18.0|  2.314445|
| 178199|0316776963|     0|   73.0| 1.3170755|
| 268622|0316776963|     7|   73.0| 2.8950043|
|  69042|0345361792|     0|   33.0|0.47691157|
| 178199|0345361792|     0|   33.0| 1.7482406|
| 178199|0345452534|     8|  851.0| 7.5790524|
| 178199|0312422156|    10|  671.0| 7.0181437|
| 178199|0380710218|     0|  242.0|  3.354216|
|  69042|044022165X|     0|   41.0| 0.1382077|
| 178199|0965739228|     0| 1398.0|       0.0|
|  15957|0385504209|     0|    4.0| 1.1461346|
| 178199|0385504209|     8|    4.0| 7.7976418|
| 268622|0385504209|     7|    4.0| 4.3794026|
|  69042|0425113884|     0|  676.0|0.24661899|
|  26583|0440

In [393]:
train_rmse = []
for param, value in zip(model.getEstimatorParamMaps(), model.avgMetrics):
    print(f"Parameters: {param}, RMSE: {value}")
    train_rmse.append(value)

Parameters: {Param(parent='ALS_011767cde641', name='rank', doc='rank of the factorization'): 25, Param(parent='ALS_011767cde641', name='maxIter', doc='max number of iterations (>= 0).'): 75, Param(parent='ALS_011767cde641', name='regParam', doc='regularization parameter (>= 0).'): 0.2}, RMSE: 3.4770753440457987


In [394]:

#best_model.save("./model/model_1")

In [395]:
# 25 50 1
userRecs = best_model.recommendForAllUsers(10)

In [396]:
userRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|   3757|[{1393, 8.9770355...|
|   4017|[{1119, 19.08024}...|
|   6323|[{799, 8.468501},...|
|   6543|[{1464, 9.208646}...|
|   6563|[{343, 10.4078865...|
|   7158|[{1649, 11.315558...|
|   7286|[{30, 10.410205},...|
|   7346|[{1393, 11.627357...|
|   8067|[{1464, 15.991763...|
|   8681|[{1649, 11.151564...|
|  10447|[{1464, 10.841313...|
|  11601|[{1290, 2.8491478...|
|  12538|[{1464, 10.066278...|
|  12824|[{1506, 13.687327...|
|  13552|[{1690, 16.76156}...|
|  13850|[{1464, 21.081884...|
|  14422|[{944, 10.133333}...|
|  15408|[{1229, 5.6796255...|
|  15957|[{1671, 12.341031...|
|  16634|[{1764, 14.174701...|
+-------+--------------------+
only showing top 20 rows



In [397]:
usr = indexed_df.select('user_id').distinct().count()

In [398]:
item = indexed_df.select('item_id').distinct().count()

In [399]:
rating = indexed_df.select('rating').count()

In [400]:
sparity = (1 - rating/(usr * item))*100
sparity

96.85640631302945