In [67]:
from pyspark.sql import SparkSession
from surprise import Reader
from surprise import Dataset

In [68]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Best Model for Rec")
    .getOrCreate()
)

In [69]:
# Read the two JSON inputs into Spark DataFrames: product records and rating records.
df_product = spark.read.json(path='data/product.json')
product_rating = spark.read.json(path='data/rating.json')

In [70]:
# Inner join on productId: only ratings with a matching product row are kept.
merged_df = product_rating.join(df_product, on='productId', how='inner')

In [71]:
# Split the joined data 70/30 with a fixed seed so the split is repeatable.
df_train, df_test = merged_df.randomSplit(weights=[0.7, 0.3], seed=96)

# Collect both parts to pandas (train first, then test); the Surprise loader
# used below takes pandas DataFrames.
df_train_pandas, df_test_pandas = (
    part.toPandas() for part in (df_train, df_test)
)

In [72]:
# Declare the (lowest, highest) rating scale that Surprise should assume.
# NOTE(review): confirm the ratings in data/rating.json really span 1-5.
reader = Reader(
    rating_scale=(1, 5),
)

In [73]:
# Surprise reads the columns positionally as user id, item id, rating,
# so both splits are projected to the same three columns in that order.
rating_columns = ['userId', 'productId', 'rating']
data_train = Dataset.load_from_df(df_train_pandas[rating_columns], reader)
data_test = Dataset.load_from_df(df_test_pandas[rating_columns], reader)

In [74]:
# Use every row of the training split as the trainset (no folds).
trainset = data_train.build_full_trainset()

# build_testset() is a Trainset method, so the test split is first wrapped in
# a full trainset and then turned into the list that algo.test() consumes.
test_as_trainset = data_test.build_full_trainset()
testset = test_as_trainset.build_testset()

In [75]:
from surprise.dump import load, dump

In [76]:
# surprise.dump.load returns a (predictions, algo) tuple; index 1 is the algorithm.
# NOTE(review): Surprise dumps are pickle-based; only load files from a trusted source.
dump_path = 'model/knn_model.pkl'
knn_algo = load(file_name=dump_path)[1]

In [77]:
# Restore the algorithm stored in the SVD dump (index 1 of the loaded tuple).
# NOTE(review): Surprise dumps are pickle-based; only load files from a trusted source.
dump_path = 'model/svd_model.pkl'
svd_algo = load(file_name=dump_path)[1]

In [78]:
# Restore the algorithm stored in the SVD++ dump (index 1 of the loaded tuple).
# NOTE(review): Surprise dumps are pickle-based; only load files from a trusted source.
dump_path = 'model/svdpp_model.pkl'
svdpp_algo = load(file_name=dump_path)[1]

In [79]:
# Restore the algorithm stored in the NMF dump (index 1 of the loaded tuple).
# NOTE(review): Surprise dumps are pickle-based; only load files from a trusted source.
dump_path = 'model/nmf_model.pkl'
nmf_algo = load(file_name=dump_path)[1]

In [80]:
# Restore the algorithm stored in the clustering dump (index 1 of the loaded tuple).
# NOTE(review): Surprise dumps are pickle-based; only load files from a trusted source.
dump_path = 'model/clustering_model.pkl'
clustering_algo = load(file_name=dump_path)[1]

In [81]:
# Run the model loaded from knn_model.pkl over the test set.
knn_predictions = knn_algo.test(testset=testset)

In [82]:
# Run the model loaded from svd_model.pkl over the test set.
svd_predictions = svd_algo.test(testset=testset)

In [83]:
# Run the model loaded from svdpp_model.pkl over the test set.
svdpp_predictions = svdpp_algo.test(testset=testset)

In [84]:
# Run the model loaded from nmf_model.pkl over the test set.
nmf_predictions = nmf_algo.test(testset=testset)

In [85]:
# Run the model loaded from clustering_model.pkl over the test set.
clustering_predictions = clustering_algo.test(testset=testset)

In [86]:
from surprise import accuracy

In [87]:
def evaluate_predictions(label, predictions):
    """Print a labelled accuracy report and return the three error metrics.

    Args:
        label: Heading printed before the metrics.
        predictions: Predictions as returned by an algorithm's ``test()``.

    Returns:
        Tuple ``(rmse, mae, mse)`` as computed by ``surprise.accuracy``.
    """
    print(label)
    # Tuple elements are evaluated left to right, so surprise.accuracy prints
    # the metrics in RMSE, MAE, MSE order (it prints each value by default).
    return (
        accuracy.rmse(predictions),
        accuracy.mae(predictions),
        accuracy.mse(predictions),
    )


# Report RMSE, MAE and MSE for every model on the same test set.
knn_rmse, knn_mae, knn_mse = evaluate_predictions('KNN', knn_predictions)
svd_rmse, svd_mae, svd_mse = evaluate_predictions('SVD', svd_predictions)
svdpp_rmse, svdpp_mae, svdpp_mse = evaluate_predictions('SVDpp', svdpp_predictions)
nmf_rmse, nmf_mae, nmf_mse = evaluate_predictions('NMF', nmf_predictions)
clustering_rmse, clustering_mae, clustering_mse = evaluate_predictions(
    'Clustering', clustering_predictions
)

# Deprecated aliases: these values are mean squared errors, NOT R-squared.
# They are kept only so code that still uses the old names keeps working.
knn_rsquared = knn_mse
svd_rsquared = svd_mse
svdpp_rsquared = svdpp_mse
nmf_rsquared = nmf_mse
clustering_rsquared = clustering_mse

KNN
RMSE: 1.4219
MAE:  1.2215
MSE: 2.0217
SVD
RMSE: 1.4199
MAE:  1.2188
MSE: 2.0160
SVDpp
RMSE: 1.4205
MAE:  1.2206
MSE: 2.0178
NMF
RMSE: 1.5053
MAE:  1.2905
MSE: 2.2659
Clustering
RMSE: 1.4288
MAE:  1.2316
MSE: 2.0414


In [88]:
# Persist the best model. SVD has the lowest RMSE, MAE and MSE in the recorded
# evaluation output, so its fitted algorithm is the one saved.
# The `algo` slot must receive the algorithm object (svd_algo), not the list of
# predictions; otherwise load('model/best_model.pkl')[1] returns predictions
# instead of a usable model.
dump_path_save = 'model/best_model.pkl'
dump(dump_path_save, algo=svd_algo)