In [1]:
import pandas as pd
import numpy as np
# from surprise import Reader, Dataset, SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, CoClustering, BaselineOnly
#from surprise import *
from surprise.model_selection.validation import cross_validate

from surprise import Dataset, Reader, KNNBasic, SVD
from surprise.model_selection import train_test_split

In [2]:
df = pd.read_csv("../DATA/clean_details.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024482 entries, 0 to 1024481
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype
---  ------      --------------    -----
 0   product_id  1024482 non-null  int64
 1   user_id     1024482 non-null  int64
 2   rating      1024482 non-null  int64
dtypes: int64(3)
memory usage: 23.4 MB


In [4]:
#df['product_id'] = df['product_id'].astype('int8')
#df['user_id'] = df['user_id'].astype('int8')
#df['rating'] = df['rating'].astype('float16')

In [5]:
df.head()

Unnamed: 0,product_id,user_id,rating
0,190,1,5
1,190,2,5
2,190,3,5
3,190,4,5
4,190,5,5


In [6]:
n_ratings = len(df)
n_movies = len(df['product_id'].unique())
n_users = len(df['user_id'].unique())

In [7]:
display(n_ratings, n_movies, n_users)

1024482

31267

650636

In [8]:
df['user_id'].value_counts()

user_id
199       19615
159        2585
831        2541
324        2415
860        2088
          ...  
252672        1
252671        1
252670        1
252669        1
650636        1
Name: count, Length: 650636, dtype: int64

In [9]:
df['product_id'].value_counts()

product_id
1731      412
177       395
231       391
17194     389
2359      387
         ... 
121331      1
121330      1
121328      1
121326      1
26899       1
Name: count, Length: 31267, dtype: int64

In [10]:
# https://surprise.readthedocs.io/en/stable/reader.html
reader = Reader()
data = Dataset.load_from_df(df[['user_id', 'product_id', 'rating']], reader)

In [11]:
# Define similarity measure
sim_options = {'name': 'cosine', 'user_based': True}

# Split dataset into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Build and train model
#model = KNNBasic(sim_options=sim_options)
model = SVD()
model.fit(trainset)

# Make recommendations for a specific user (replace 'user_id' with the desired user ID)
user_id = 199
# Get items the user hasn't rated yet
items_to_predict = [item_id for item_id in data.build_full_trainset().all_items() if item_id not in trainset.ur[trainset.to_inner_uid(user_id)]]
# Predict ratings for the items
predictions = [model.predict(user_id, item_id) for item_id in items_to_predict]
# Sort predictions by estimated rating
top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:10]

# Print top recommendations
for prediction in top_n:
    print('Item ID:', prediction.iid, 'Estimated rating:', prediction.est)


Item ID: 128 Estimated rating: 5
Item ID: 268 Estimated rating: 5
Item ID: 1017 Estimated rating: 5
Item ID: 1029 Estimated rating: 5
Item ID: 1092 Estimated rating: 5
Item ID: 1230 Estimated rating: 5
Item ID: 1287 Estimated rating: 5
Item ID: 1342 Estimated rating: 5
Item ID: 1614 Estimated rating: 5
Item ID: 1635 Estimated rating: 5


In [12]:
# https://www.youtube.com/watch?v=8wLKuscyO9I
# Singular value decomposition
algorithm = SVD()
#algorithm = KNNBasic()
#Run 5-fold cross-validation and print results
results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8846  0.8768  0.8789  0.8806  0.8785  0.8798  0.0027  
MAE (testset)     0.5606  0.5566  0.5573  0.5584  0.5572  0.5580  0.0014  
Fit time          8.70    8.55    8.94    8.59    8.79    8.71    0.14    
Test time         1.03    0.56    1.03    1.00    1.01    0.93    0.18    


In [13]:
results

{'test_rmse': array([0.88457469, 0.87675962, 0.87885906, 0.88058919, 0.87846046]),
 'test_mae': array([0.56060421, 0.55655786, 0.55727482, 0.55844335, 0.5572296 ]),
 'fit_time': (8.70039701461792,
  8.548640727996826,
  8.93690824508667,
  8.590626001358032,
  8.786329507827759),
 'test_time': (1.0321078300476074,
  0.564140796661377,
  1.0273747444152832,
  1.0025465488433838,
  1.0110280513763428)}

In [14]:
# If the results are OK => getting full dataset => fit model
trainset = data.build_full_trainset()
algorithm.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20d423c8a90>

### Recommender for a specific user

In [15]:
# userId = 27 has seen some movies:
userId = 27
df_select = df[(df['user_id'] == userId) & (df['rating'] >=3)]
df_select = df_select.set_index('product_id')
#df_select = df_select.join(df_title)['Name']
df_select.head(df_select.shape[0])

Unnamed: 0_level_0,user_id,rating
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
190,27,5


In [16]:
df_select.shape

(1, 2)

In [17]:
df_score = df[["product_id"]]

In [18]:
df_score

Unnamed: 0,product_id
0,190
1,190
2,190
3,190
4,190
...,...
1024477,171107
1024478,171107
1024479,171107
1024480,171107


- Once the model has been evaluated to our satisfaction, then we can re-train the model using the entire training dataset

In [19]:
# The following are the top 5 movies to be recommended to the user with userId
# To recommend products (i.e., movies) to the given user,
# we can sort the list of movies in decreasing order of predicted ratings
# and take the top N movies as recommendations:
df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algorithm.predict(userId, x).est) # est: get EstimateScore
df_score = df_score.sort_values(by=['EstimateScore'], ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_score['EstimateScore'] = df_score['product_id'].apply(lambda x: algorithm.predict(userId, x).est) # est: get EstimateScore


In [20]:
df_score = df_score.drop_duplicates()

In [21]:
df_score.head()

Unnamed: 0,product_id,EstimateScore
512241,25183,5.0
616431,11147,5.0
847882,1681,5.0
438300,23546,5.0
616359,11146,5.0


In [22]:
df_score[df_score.EstimateScore>=3]

Unnamed: 0,product_id,EstimateScore
512241,25183,5.000000
616431,11147,5.000000
847882,1681,5.000000
438300,23546,5.000000
616359,11146,5.000000
...,...,...
75409,2154,3.033624
951961,17334,3.030076
238190,18553,3.024821
743170,15185,3.012023


In [None]:
!