In [1]:
import pandas as pd
df1 = pd.read_csv('ratings_Office_Products.csv', header=None)
df1.head()

Unnamed: 0,0,1,2,3
0,A2UESEUCI73CBO,0078800242,5.0,1374192000
1,A3BBNK2R5TUYGV,0113000316,5.0,1359417600
2,A5J78T14FJ5DU,0113000316,3.0,1318723200
3,A2P462UH5L6T57,043928631X,5.0,1356912000
4,A2E0X1MWNRTQF4,0439340039,1.0,1379721600


In [2]:
df1.columns = ['reviewer_ID', 'product_ID', 'rating', 'time']
df1.head()

Unnamed: 0,reviewer_ID,product_ID,rating,time
0,A2UESEUCI73CBO,0078800242,5.0,1374192000
1,A3BBNK2R5TUYGV,0113000316,5.0,1359417600
2,A5J78T14FJ5DU,0113000316,3.0,1318723200
3,A2P462UH5L6T57,043928631X,5.0,1356912000
4,A2E0X1MWNRTQF4,0439340039,1.0,1379721600


In [3]:
df_new = df1.drop(columns=['time'])
df_new = df_new[:100000]
df_new.head()

Unnamed: 0,reviewer_ID,product_ID,rating
0,A2UESEUCI73CBO,0078800242,5.0
1,A3BBNK2R5TUYGV,0113000316,5.0
2,A5J78T14FJ5DU,0113000316,3.0
3,A2P462UH5L6T57,043928631X,5.0
4,A2E0X1MWNRTQF4,0439340039,1.0


In [4]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   reviewer_ID  100000 non-null  object 
 1   product_ID   100000 non-null  object 
 2   rating       100000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.3+ MB


In [5]:
from surprise import Reader, Dataset
reader = Reader()
data = Dataset.load_from_df(df_new, reader)

In [6]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  89228 

Number of items:  6948


In [7]:
# Because we have fewer items than users, it will be more efficient to calculate ite-item similarity.
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [8]:
## Perform a gridsearch with SVD
# ‚è∞ This cell may take several minutes to run
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(data)

In [9]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 1.2723914073926528, 'mae': 0.9988470936660072}
{'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


In [10]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':False})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [11]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([1.38650604, 1.38910161, 1.37891394, 1.3826045 , 1.37715018]))
('test_mae', array([1.10469865, 1.11250995, 1.09947618, 1.10464319, 1.09712713]))
('fit_time', (2.8952009677886963, 2.9715821743011475, 2.9205470085144043, 2.745098114013672, 2.522273063659668))
('test_time', (0.20351123809814453, 0.13600587844848633, 0.12415170669555664, 0.11664199829101562, 0.11822390556335449))
-----------------------
1.3828552540785588


In [12]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':False})
cv_knn_baseline = cross_validate(knn_baseline,data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [13]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([1.27247097, 1.2728537 , 1.28247722, 1.27329835, 1.28919569]))
('test_mae', array([1.00378524, 1.00139192, 1.0116631 , 1.00608363, 1.01364357]))
('fit_time', (1.3219177722930908, 1.4558830261230469, 1.3914039134979248, 1.300567865371704, 1.357342004776001))
('test_time', (0.08756709098815918, 0.09607481956481934, 0.08521485328674316, 0.08442115783691406, 0.08548402786254883))


1.2780591838674868

In [14]:
knn_means = KNNWithMeans(sim_options={'name':'pearson', 'user_based':False})
cv_knn_means = cross_validate(knn_means, data, n_jobs=-1)

In [15]:
for i in cv_knn_means.items():
    print(i)

np.mean(cv_knn_means['test_rmse'])

('test_rmse', array([1.37269639, 1.37391482, 1.37312433, 1.38046254, 1.37461704]))
('test_mae', array([1.08679301, 1.08748221, 1.0843079 , 1.09101506, 1.08681062]))
('fit_time', (2.694962978363037, 3.059847831726074, 2.972233772277832, 2.7834360599517822, 2.6024398803710938))
('test_time', (0.21309471130371094, 0.13517475128173828, 0.11961674690246582, 0.12171697616577148, 0.11696696281433105))


1.374963024004142

In [74]:
df_meta = pd.read_csv('Office_Products.csv')
df_meta.head()

Unnamed: 0,title,main_cat,asin
0,Sequential Spelling Level 1 Bundle with Studen...,Office Products,12624861
1,"Mathematics, Applications and Concepts, Course...",Books,78652669
2,Pearson MyHistoryLab Online Access Code for Am...,Office Products,136039847
3,A Pocket for Corduroy,Office Products,140503528
4,Social Entrepreneurship: What Everyone Needs t...,Books,195396332


In [75]:
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315458 entries, 0 to 315457
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   title     315453 non-null  object
 1   main_cat  313475 non-null  object
 2   asin      315458 non-null  object
dtypes: object(3)
memory usage: 7.2+ MB


In [76]:
meta_df = df_meta.rename(columns={"asin" : "product_ID"})
meta_df.head()

Unnamed: 0,title,main_cat,product_ID
0,Sequential Spelling Level 1 Bundle with Studen...,Office Products,12624861
1,"Mathematics, Applications and Concepts, Course...",Books,78652669
2,Pearson MyHistoryLab Online Access Code for Am...,Office Products,136039847
3,A Pocket for Corduroy,Office Products,140503528
4,Social Entrepreneurship: What Everyone Needs t...,Books,195396332


In [77]:
meta_df.isnull().sum()

title            5
main_cat      1983
product_ID       0
dtype: int64

In [78]:
meta_df.dropna(inplace=True)
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 313470 entries, 0 to 315457
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   title       313470 non-null  object
 1   main_cat    313470 non-null  object
 2   product_ID  313470 non-null  object
dtypes: object(3)
memory usage: 9.6+ MB


In [69]:
svd = SVD(n_factors=20, reg_all=0.02)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f7cf8f4c100>

In [70]:
svd.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=4.05509, details={'was_impossible': False})

In [82]:
def product_rater(df_meta, num, main_cat=None):
    reviewer_ID = 1000
    rating_list = []
    while num > 0:
        if main_cat:
            product = df_meta[df_meta['main_cat'].str.contains(main_cat)].sample(1)
        else:
            product = df_meta.sample(1)
        print(product)
        rating = input('How do you rate this product on a scale of 1-5, press n if you have not used :\n')
        if rating == 'n':
            continue
        else:
            rating_one_product = {'reviewer_Id':reviewer_ID,
                                  'product_ID':product['product_ID'].values[0],
                                  'rating':rating}
            rating_list.append(rating_one_product) 
            num -= 1
    return rating_list      

In [83]:
user_rating = product_rater(meta_df, 4, 'Books')

                                                  title main_cat  product_ID
3669  Stuck In The 80's Trivia Challenge 2018 Boxed/...    Books  1531902650
How do you rate this product on a scale of 1-5, press n if you have not used :
3
                                                    title main_cat  product_ID
101986  Kaplan Review 2011 DAT Course Book, Note Cards...    Books  B005EY5U0C
How do you rate this product on a scale of 1-5, press n if you have not used :
4
                                                  title main_cat  product_ID
2724  Austin Mahone 2015 Square 12x12 (Multilingual ...    Books  1465029923
How do you rate this product on a scale of 1-5, press n if you have not used :
5
                                                  title main_cat  product_ID
6285  Gary Patterson's Funny Business 2017 Wall Cale...    Books  168234097X
How do you rate this product on a scale of 1-5, press n if you have not used :
n
                                 title main_cat  product

In [85]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = df_new.append(user_rating, ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df, reader)

ValueError: too many values to unpack (expected 3)