In [1]:
import pandas as pd
df1 = pd.read_csv('ratings_Office_Products.csv', header=None)
df1.head()

Unnamed: 0,0,1,2,3
0,A2UESEUCI73CBO,0078800242,5.0,1374192000
1,A3BBNK2R5TUYGV,0113000316,5.0,1359417600
2,A5J78T14FJ5DU,0113000316,3.0,1318723200
3,A2P462UH5L6T57,043928631X,5.0,1356912000
4,A2E0X1MWNRTQF4,0439340039,1.0,1379721600


In [2]:
df1.columns = ['reviewer_ID', 'product_ID', 'rating', 'time']
df1.head()

Unnamed: 0,reviewer_ID,product_ID,rating,time
0,A2UESEUCI73CBO,0078800242,5.0,1374192000
1,A3BBNK2R5TUYGV,0113000316,5.0,1359417600
2,A5J78T14FJ5DU,0113000316,3.0,1318723200
3,A2P462UH5L6T57,043928631X,5.0,1356912000
4,A2E0X1MWNRTQF4,0439340039,1.0,1379721600


In [3]:
df_new = df1.drop(columns=['time'])
df_new = df_new[:100000]
df_new.head()

Unnamed: 0,reviewer_ID,product_ID,rating
0,A2UESEUCI73CBO,0078800242,5.0
1,A3BBNK2R5TUYGV,0113000316,5.0
2,A5J78T14FJ5DU,0113000316,3.0
3,A2P462UH5L6T57,043928631X,5.0
4,A2E0X1MWNRTQF4,0439340039,1.0


In [4]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   reviewer_ID  100000 non-null  object 
 1   product_ID   100000 non-null  object 
 2   rating       100000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.3+ MB


In [5]:
from surprise import Reader, Dataset
reader = Reader()
data = Dataset.load_from_df(df_new, reader)

In [6]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  89228 

Number of items:  6948


In [7]:
# Because we have fewer items than users, it will be more efficient to calculate ite-item similarity.
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [8]:
## Perform a gridsearch with SVD
# ⏰ This cell may take several minutes to run
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(data)

In [9]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 1.2732907313911717, 'mae': 0.9990535955039508}
{'rmse': {'n_factors': 20, 'reg_all': 0.05}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


In [10]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':False})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [11]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([1.36988262, 1.38574236, 1.37586112, 1.39506934, 1.38785726]))
('test_mae', array([1.09564739, 1.10920847, 1.09866688, 1.10987819, 1.10511247]))
('fit_time', (3.2491660118103027, 3.3087921142578125, 3.234184980392456, 3.1654438972473145, 2.8686866760253906))
('test_time', (0.20161890983581543, 0.14344191551208496, 0.1315009593963623, 0.11571192741394043, 0.12722301483154297))
-----------------------
1.382882539001401


In [12]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':False})
cv_knn_baseline = cross_validate(knn_baseline,data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [13]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([1.29001441, 1.28135614, 1.2693518 , 1.2793492 , 1.26611521]))
('test_mae', array([1.01559608, 1.00977509, 1.00156299, 1.00570272, 0.99954036]))
('fit_time', (1.6098499298095703, 1.5372469425201416, 1.42421293258667, 1.4328482151031494, 1.4387290477752686))
('test_time', (0.09584784507751465, 0.0866689682006836, 0.08576202392578125, 0.08427095413208008, 0.08653998374938965))


1.277237349676632

In [14]:
knn_means = KNNWithMeans(sim_options={'name':'pearson', 'user_based':False})
cv_knn_means = cross_validate(knn_means, data, n_jobs=-1)

In [15]:
for i in cv_knn_means.items():
    print(i)

np.mean(cv_knn_means['test_rmse'])

('test_rmse', array([1.37550515, 1.38052887, 1.37192064, 1.36886352, 1.37964756]))
('test_mae', array([1.0876106 , 1.08931592, 1.0858797 , 1.0832587 , 1.0899445 ]))
('fit_time', (3.4811301231384277, 3.360893964767456, 3.2297019958496094, 3.1215338706970215, 2.9693851470947266))
('test_time', (0.1408097743988037, 0.13854408264160156, 0.14691710472106934, 0.1538712978363037, 0.12755608558654785))


1.3752931490808362

In [36]:
df_meta = pd.read_csv('Office_Products.csv')
df_meta.head()

Unnamed: 0,title,main_cat,asin
0,Sequential Spelling Level 1 Bundle with Studen...,Office Products,12624861
1,"Mathematics, Applications and Concepts, Course...",Books,78652669
2,Pearson MyHistoryLab Online Access Code for Am...,Office Products,136039847
3,A Pocket for Corduroy,Office Products,140503528
4,Social Entrepreneurship: What Everyone Needs t...,Books,195396332


In [37]:
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315458 entries, 0 to 315457
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   title     315453 non-null  object
 1   main_cat  313475 non-null  object
 2   asin      315458 non-null  object
dtypes: object(3)
memory usage: 7.2+ MB


In [38]:
meta_df = df_meta.rename(columns={"asin" : "product_ID"}, index=False)
meta_df

TypeError: Index(...) must be called with a collection of some kind, False was passed

In [19]:
df_meta.isna().sum()

Unnamed: 0       0
title            5
main_cat      1983
asin             0
dtype: int64

In [20]:
df_meta.dropna(inplace=True)
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 313470 entries, 0 to 315457
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  313470 non-null  int64 
 1   title       313470 non-null  object
 2   main_cat    313470 non-null  object
 3   asin        313470 non-null  object
dtypes: int64(1), object(3)
memory usage: 12.0+ MB


In [21]:
svd = SVD(n_factors=20, reg_all=0.05)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd1e0dd5e20>

In [22]:
def product_rater(df_meta, num, main_cat=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if main_cat:
            product = df_meta[df_meta['main_cat'].str.contains(main_cat)].sample(1)
        else:
            product = df_meta.sample(1)
        print(product)
        rating = input('How do you rate this product on a scale of 1-5, press n if you have not used :\n')
        if rating == 'n':
            continue
        else:
            rating_one_product = {'reviewer_ID':reviewerID,'product_ID':product['product_ID'].values[0],'rating':rating}
            rating_list.append(rating_one_product) 
            num -= 1
    return rating_list      


In [31]:
user_rating = product_rater(df_meta, 4, 'Office Products')

ValueError: Cannot mask with non-boolean array containing NA / NaN values