In [1]:
from pyspark.sql import SparkSession
from surprise import Reader
from surprise import Dataset
from surprise import SVD

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SVD rec sys")
    .getOrCreate()
)

In [3]:
# Load the two JSON inputs as Spark DataFrames; the reads are independent of each other.
product_rating = spark.read.json('data/rating.json')
df_product = spark.read.json('data/product.json')

In [4]:
# Inner join on productId: only ratings with a matching product row are kept.
merged_df = product_rating.join(df_product, on='productId', how='inner')

In [5]:
# Seeded 70/30 random split of the joined data into train and test parts.
df_train, df_test = merged_df.randomSplit(weights=[0.7, 0.3], seed=96)

# Collect both parts to the driver as pandas DataFrames (Surprise consumes pandas).
# NOTE(review): every joined column is collected although only three are used
# later in this notebook — confirm whether a narrower select would do.
df_train_pandas, df_test_pandas = df_train.toPandas(), df_test.toPandas()

In [6]:
# Declare the rating scale Surprise should assume when parsing the data.
reader = Reader(
    rating_scale=(1, 5),
)

In [7]:
# Wrap each pandas frame as a Surprise Dataset. The columns are passed in
# (user, item, rating) order, which is what load_from_df expects.
data_train = Dataset.load_from_df(
    df_train_pandas.loc[:, ['userId', 'productId', 'rating']],
    reader,
)
data_test = Dataset.load_from_df(
    df_test_pandas.loc[:, ['userId', 'productId', 'rating']],
    reader,
)

In [8]:
# Use every training rating for fitting (no internal hold-out).
trainset = data_train.build_full_trainset()

# Turn the held-out ratings into a testset by first building a full trainset
# from them and then converting it with build_testset().
testset = (
    data_test
    .build_full_trainset()
    .build_testset()
)

In [9]:
# Train the SVD matrix-factorisation model on the training ratings.
#
# n_epochs must be positive: with n_epochs=0 the SGD loop never executes, so
# the "fitted" model keeps its initial (untrained) parameters. 20 is the
# Surprise default for SVD.
# random_state pins the random factor initialisation so that re-running the
# notebook reproduces the same model; 96 matches the seed used for the split.
algo = SVD(n_epochs=20, lr_all=0.005, reg_all=0.1, random_state=96)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22c6bd96700>

In [10]:
import os

from surprise.dump import dump

# Persist the fitted model so it can be reloaded later without retraining.
dump_path = 'model/svd_model.pkl'

# dump() opens the target path directly, so it fails if the parent directory
# does not exist; create it first (no-op when it is already there).
os.makedirs(os.path.dirname(dump_path), exist_ok=True)
dump(dump_path, algo=algo)