In [21]:
from pyspark.sql import SparkSession
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

In [22]:
# Create the Spark session for this notebook, or reuse one that is already
# running in this kernel (getOrCreate).
spark = (
    SparkSession.builder
    .appName("KNN rec sys")
    .getOrCreate()
)

In [23]:
# Load the two raw inputs as Spark DataFrames; paths are relative to the
# notebook's working directory.
df_product = spark.read.json(path='data/product.json')      # product records
product_rating = spark.read.json(path='data/rating.json')   # rating records

In [24]:
# Inner join on productId: only ratings whose productId also appears in the
# product data survive, and each surviving row carries the product columns.
merged_df = product_rating.join(
    other=df_product,
    on='productId',
    how='inner',
)

In [25]:
# Random 70/30 train/test split; the fixed seed keeps the split repeatable.
df_train, df_test = merged_df.randomSplit(weights=[0.7, 0.3], seed=96)

# Collect both splits to the driver as pandas DataFrames (train first, then
# test), since the Surprise loaders below take pandas input.
df_train_pandas, df_test_pandas = (
    split.toPandas() for split in (df_train, df_test)
)

In [26]:
# Tell Surprise that ratings lie on a 1-to-5 scale.
reader = Reader(
    rating_scale=(1, 5),
)

In [27]:
# Wrap each pandas split in a Surprise Dataset. load_from_df is given the
# columns in the order (user, item, rating), so the selection order matters.
data_train, data_test = (
    Dataset.load_from_df(frame[['userId', 'productId', 'rating']], reader)
    for frame in (df_train_pandas, df_test_pandas)
)

In [28]:
# Use every training rating to fit the model (no internal hold-out).
trainset = data_train.build_full_trainset()

# build_testset() is called on a Trainset, so the held-out data is first turned
# into a full Trainset and then converted into a testset.
testset = (
    data_test
    .build_full_trainset()
    .build_testset()
)

In [29]:
# User-based KNN with k=230 neighbours. No similarity name is given in
# sim_options, so Surprise falls back to its default (MSD, as the fit log shows).
algo = KNNBasic(
    k=230,
    sim_options={'user_based': True},
)
# Kept as the last expression so the cell still displays the fitted estimator.
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1216affa280>

In [30]:
import os

from surprise.dump import dump

# Persist the fitted KNN model so it can be reloaded without refitting.
dump_path = 'model/knn_model.pkl'

# dump() opens the target file for writing and does not create missing
# directories, so a fresh checkout without `model/` would raise
# FileNotFoundError. Create the parent directory first (no-op if it exists).
dump_dir = os.path.dirname(dump_path)
if dump_dir:
    os.makedirs(dump_dir, exist_ok=True)

# NOTE: Surprise's dump is a pickle under the hood -- only load this file back
# from a trusted location.
dump(dump_path, algo=algo)