In [6]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import PredefinedKFold
from collections import defaultdict
from surprise.model_selection.split import LeaveOneOut
import pandas as pd
import numpy as np
import os

In [3]:
df = pd.read_csv("all_33_date.csv")
df = df[['user_id','business_id','stars']]

In [4]:
df.head()

Unnamed: 0,user_id,business_id,stars
0,Ud72j_rglYmXrL_O8zCdoA,LkMtMHVetws5_7QfRjPtlg,4.0
1,Ud72j_rglYmXrL_O8zCdoA,ii8sAGBexBOJoYRFafF9XQ,5.0
2,Ud72j_rglYmXrL_O8zCdoA,ZkGDCVKSdf8m76cnnalL-A,5.0
3,NkVYcoaThHjOfTTgq6OdHQ,Wxxvi3LZbHNIDwJ-ZimtnA,5.0
4,G5LXEaxhQMF_BVjNHkVr7g,6xgcHeLad-VaoTIQewK84A,5.0


In [5]:
df = df.dropna()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 834551 entries, 0 to 834551
Data columns (total 3 columns):
user_id        834551 non-null object
business_id    834551 non-null object
stars          834551 non-null float64
dtypes: float64(1), object(2)
memory usage: 25.5+ MB


In [4]:
df_train = pd.read_csv("train_33_selected.csv")

In [5]:
df_test = pd.read_csv("test_33_selected.csv")

In [6]:
df_train.head()

Unnamed: 0,user_id,business_id,stars
0,Ud72j_rglYmXrL_O8zCdoA,LkMtMHVetws5_7QfRjPtlg,4.0
1,Ud72j_rglYmXrL_O8zCdoA,ii8sAGBexBOJoYRFafF9XQ,5.0
2,Ud72j_rglYmXrL_O8zCdoA,ZkGDCVKSdf8m76cnnalL-A,5.0
3,NkVYcoaThHjOfTTgq6OdHQ,Wxxvi3LZbHNIDwJ-ZimtnA,5.0
4,G5LXEaxhQMF_BVjNHkVr7g,6xgcHeLad-VaoTIQewK84A,5.0


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667646 entries, 0 to 667645
Data columns (total 3 columns):
user_id        667646 non-null object
business_id    667646 non-null object
stars          667646 non-null float64
dtypes: float64(1), object(2)
memory usage: 15.3+ MB


In [11]:
len(df_train.user_id.unique())

104156

In [9]:
len(df_train.business_id.unique())

6199

In [10]:
df_test.head()

Unnamed: 0,user_id,business_id,stars
0,Q5L8xuEqGIzfCnHA4FVjmA,YRiQtFNteLUUEiGkdQ23vg,5.0
1,djb61X-vkg5PF16qM_wI5Q,cUpdeZJawIwOl6G21z7CzA,4.0
2,aI9MXPqQm6nJAvrp1dp8AA,tro-hrljuY1cUAVFYauTlQ,5.0
3,dTWh3151qDqnOb_67iGI6w,gtB-1QxTscdPj78UE5sWuQ,5.0
4,baGqmwgON4QcKmjqeQUyMw,WEeMwRLhgCyO1b4kikVcuQ,4.0


In [11]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166905 entries, 0 to 166904
Data columns (total 3 columns):
user_id        166905 non-null object
business_id    166905 non-null object
stars          166905 non-null float64
dtypes: float64(1), object(2)
memory usage: 3.8+ MB


In [12]:
len(df_test.user_id.unique())

104147

In [13]:
len(df_test.business_id.unique())

6193

In [14]:
reader = Reader(rating_scale=(1, 5))
data_train = Dataset.load_from_df(df_train, reader)
data_test = Dataset.load_from_df(df_test, reader)
data_all = Dataset.load_from_df(df, reader)

In [15]:
df_trainset_train = data_train.build_full_trainset()
df_trainset_test = df_trainset_train.build_testset()

In [16]:
df_testset = data_test.build_full_trainset().build_testset()

# Modeling

#### BaselineOnly

In [42]:
from surprise import BaselineOnly

bsl_options = {'method': 'sgd',
               'reg': 0.001,
               'learning_rate': .005,
               }

algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(df_trainset_train)

train_pred = algo.test(df_trainset_test)
print("BaselineOnly train biased RMSE", accuracy.rmse(train_pred))
test_pred = algo.test(df_testset)
print("BaselineOnly test unbiased RMSE", accuracy.rmse(test_pred))

Estimating biases using sgd...
RMSE: 1.0615
BaselineOnly train biased RMSE 1.0615374793532382
RMSE: 1.2057
BaselineOnly test unbiased RMSE 1.2057112268351573


In [50]:
from surprise import BaselineOnly

bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 8,
               'reg_i': 15
               }

algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(df_trainset_train)

train_pred = algo.test(df_trainset_test)
print("BaselineOnly train biased RMSE", accuracy.rmse(train_pred))
test_pred = algo.test(df_testset)
print("BaselineOnly test unbiased RMSE", accuracy.rmse(test_pred))

Estimating biases using als...
RMSE: 1.0665
BaselineOnly train biased RMSE 1.0665105660993361
RMSE: 1.2108
BaselineOnly test unbiased RMSE 1.2107896707675672


In [69]:
cross_validate(algo, data_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1418  1.1417  1.1469  1.1448  1.1427  1.1436  0.0020  
MAE (testset)     0.9056  0.9049  0.9095  0.9063  0.9063  0.9065  0.0016  
Fit time          4.12    4.26    4.40    4.40    4.28    4.29    0.10    
Test time         1.76    0.82    0.83    0.81    0.81    1.01    0.38    


{'fit_time': (4.122597932815552,
  4.2642810344696045,
  4.399949789047241,
  4.399378061294556,
  4.276902914047241),
 'test_mae': array([0.90558792, 0.90488165, 0.90954911, 0.90627441, 0.90634098]),
 'test_rmse': array([1.14175314, 1.14171772, 1.14693228, 1.14478315, 1.14269284]),
 'test_time': (1.7621209621429443,
  0.8171501159667969,
  0.8267278671264648,
  0.8119058609008789,
  0.8108701705932617)}

In [24]:
loo = LeaveOneOut()

algo = BaselineOnly()

for trainset, testset in loo.split(data_all):
    
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
RMSE: 1.2156
Estimating biases using als...
RMSE: 1.2125
Estimating biases using als...
RMSE: 1.2211
Estimating biases using als...
RMSE: 1.2176
Estimating biases using als...
RMSE: 1.2145


#### KNN

In [1]:
from surprise import KNNBaseline
algo_knn = KNNBaseline(k=500, user_based=False)
algo_knn.fit(df_trainset_train)

NameError: name 'df_trainset_train' is not defined

In [None]:
train_pred_knn = algo_knn.test(df_trainset_test)
print("KNN train biased RMSE", accuracy.rmse(train_pred_knn))
test_pred_knn = algo_knn.test(df_testset)
print("KNN test unbiased RMSE", accuracy.rmse(test_pred_knn))

In [None]:
#cross_validate(algo_knn, data_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)

#### PMF

In [38]:
from surprise import SVD
algo_pmf = SVD(n_factors=12, n_epochs=20, biased=False)
algo_pmf.fit(df_trainset_train)

train_pred_pmf = algo_pmf.test(df_trainset_test)
print("PMF train biased RMSE", accuracy.rmse(train_pred_pmf))
test_pred_pmf = algo_pmf.test(df_testset)
print("PMF test unbiased RMSE", accuracy.rmse(test_pred_pmf))

RMSE: 0.9462
PMF train biased RMSE 0.9462048849220677
RMSE: 1.5357
PMF test unbiased RMSE 1.5356683095858437


In [39]:
cross_validate(algo_pmf, data_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3664  1.3728  1.3646  1.3655  1.3725  1.3684  0.0036  
MAE (testset)     1.0549  1.0607  1.0534  1.0550  1.0594  1.0567  0.0028  
Fit time          15.57   15.76   15.75   15.73   15.74   15.71   0.07    
Test time         1.30    1.28    0.89    0.86    0.85    1.04    0.21    


{'fit_time': (15.566722869873047,
  15.762358903884888,
  15.749221086502075,
  15.731513023376465,
  15.742002964019775),
 'test_mae': array([1.05493793, 1.06067975, 1.05337667, 1.05501464, 1.05939546]),
 'test_rmse': array([1.36638166, 1.37284885, 1.36459553, 1.3655026 , 1.3724832 ]),
 'test_time': (1.299198865890503,
  1.2790918350219727,
  0.8946449756622314,
  0.8617708683013916,
  0.8522119522094727)}

In [26]:
from surprise import SVD
algo_pmf = SVD(biased=False)

for trainset, testset in loo.split(data_all):
    
    algo_pmf.fit(trainset)
    predictions = algo_pmf.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.5246
RMSE: 1.5259
RMSE: 1.5192
RMSE: 1.5190
RMSE: 1.5192


#### SVD

In [33]:
from surprise import SVD
algo_svd_1 = SVD(n_factors=20, n_epochs=15)
algo_svd_1.fit(df_trainset_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x115560c88>

In [34]:
train_pred_svd_1 = algo_svd_1.test(df_trainset_test)
print("SVD train biased RMSE", accuracy.rmse(train_pred_svd_1))
test_pred_svd_1 = algo_svd_1.test(df_testset)
print("SVD test unbiased RMSE", accuracy.rmse(test_pred_svd_1))

RMSE: 1.0249
SVD train biased RMSE 1.024856211461352
RMSE: 1.2118
SVD test unbiased RMSE 1.2117590546524613


In [23]:
from surprise import SVD
algo_svd_2 = SVD(n_factors=50, n_epochs=10)
algo_svd_2.fit(df_trainset_train)

train_pred_svd_2 = algo_svd_2.test(df_trainset_test)
print("SVD train biased RMSE", accuracy.rmse(train_pred_svd_2))
test_pred_svd_2 = algo_svd_2.test(df_testset)
print("SVD test unbiased RMSE", accuracy.rmse(test_pred_svd_2))

RMSE: 1.0388
SVD train biased RMSE 1.0387571058602665
RMSE: 1.2199
SVD test unbiased RMSE 1.2199389887459124


In [28]:
from surprise import SVD
algo_svd_3 = SVD(n_factors=30, n_epochs=10)
algo_svd_3.fit(df_trainset_train)

train_pred_svd_3 = algo_svd_3.test(df_trainset_test)
print("SVD train biased RMSE", accuracy.rmse(train_pred_svd_3))
test_pred_svd_3 = algo_svd_3.test(df_testset)
print("SVD test unbiased RMSE", accuracy.rmse(test_pred_svd_3))

RMSE: 1.0608
SVD train biased RMSE 1.0608437830508364
RMSE: 1.2187
SVD test unbiased RMSE 1.2186500127989746


In [30]:
from surprise import SVD
algo_svd_4 = SVD(n_factors=30, n_epochs=15)
algo_svd_4.fit(df_trainset_train)

train_pred_svd_4 = algo_svd_4.test(df_trainset_test)
print("SVD train biased RMSE", accuracy.rmse(train_pred_svd_4))
test_pred_svd_4 = algo_svd_4.test(df_testset)
print("SVD test unbiased RMSE", accuracy.rmse(test_pred_svd_4))

RMSE: 0.9611
SVD train biased RMSE 0.9610708138026886
RMSE: 1.2151
SVD test unbiased RMSE 1.2150919274110736


In [35]:
cross_validate(algo_svd_1, data_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1482  1.1487  1.1477  1.1494  1.1472  1.1482  0.0008  
MAE (testset)     0.9133  0.9121  0.9130  0.9142  0.9124  0.9130  0.0007  
Fit time          14.69   16.53   15.14   14.76   15.44   15.31   0.67    
Test time         1.25    1.12    1.15    1.12    1.12    1.15    0.05    


{'fit_time': (14.690735816955566,
  16.53309202194214,
  15.13542103767395,
  14.755640983581543,
  15.436825037002563),
 'test_mae': array([0.91330799, 0.91208184, 0.91299981, 0.91418933, 0.91238658]),
 'test_rmse': array([1.14818215, 1.1487161 , 1.14769818, 1.14935646, 1.14718293]),
 'test_time': (1.246934175491333,
  1.125,
  1.1499409675598145,
  1.116776943206787,
  1.117764949798584)}

In [None]:
algo_svd = SVD()

for trainset, testset in loo.split(data_all):
    
    algo_svd.fit(trainset)
    predictions = algo_svd.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

#### SVDpp

In [56]:
from surprise import SVDpp
algo_svdpp_1 = SVDpp(n_factors=10, n_epochs=10)
algo_svdpp_1.fit(df_trainset_train)

train_pred_svdpp_1 = algo_svdpp_1.test(df_trainset_test)
print("SVDpp train biased RMSE", accuracy.rmse(train_pred_svdpp_1))
test_pred_svdpp_1 = algo_svdpp_1.test(df_testset)
print("SVDpp test unbiased RMSE", accuracy.rmse(test_pred_svdpp_1))

RMSE: 1.0514
SVDpp train biased RMSE 1.051354400669383
RMSE: 1.2118
SVDpp test unbiased RMSE 1.2117677323392348


In [58]:
from surprise import SVDpp
algo_svdpp_2 = SVDpp(n_factors=5, n_epochs=5)
algo_svdpp_2.fit(df_trainset_train)

train_pred_svdpp_2 = algo_svdpp_2.test(df_trainset_test)
print("SVDpp train biased RMSE", accuracy.rmse(train_pred_svdpp_2))
test_pred_svdpp_2 = algo_svdpp_2.test(df_testset)
print("SVDpp test unbiased RMSE", accuracy.rmse(test_pred_svdpp_2))

RMSE: 1.1077
SVDpp train biased RMSE 1.1076843252153576
RMSE: 1.2254
SVDpp test unbiased RMSE 1.2254464718146858


In [60]:
from surprise import SVDpp
algo_svdpp = SVDpp()
algo_svdpp.fit(df_trainset_train)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1033e8ef0>

In [61]:
train_pred_svdpp = algo_svdpp.test(df_trainset_test)
print("SVDpp train biased RMSE", accuracy.rmse(train_pred_svdpp))
test_pred_svdpp = algo_svdpp.test(df_testset)
print("SVDpp test unbiased RMSE", accuracy.rmse(test_pred_svdpp))

RMSE: 0.8178
SVDpp train biased RMSE 0.8177596675789183
RMSE: 1.2224
SVDpp test unbiased RMSE 1.222432032057506


In [59]:
cross_validate(algo_svdpp_1, data_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1457  1.1449  1.1483  1.1488  1.1483  1.1472  0.0016  
MAE (testset)     0.9121  0.9110  0.9132  0.9139  0.9136  0.9128  0.0011  
Fit time          137.76  154.74  164.27  160.81  159.68  155.45  9.36    
Test time         6.79    8.01    7.91    10.56   7.72    8.20    1.26    


{'fit_time': (137.75702691078186,
  154.73514199256897,
  164.268061876297,
  160.80781507492065,
  159.68211698532104),
 'test_mae': array([0.91207069, 0.91103457, 0.91324639, 0.91389616, 0.91358074]),
 'test_rmse': array([1.14566271, 1.14489693, 1.14832405, 1.14878058, 1.1483142 ]),
 'test_time': (6.7922258377075195,
  8.010616779327393,
  7.907413005828857,
  10.5589120388031,
  7.715319871902466)}

In [30]:
from surprise import SVDpp

algo_svdpp = SVDpp()

for trainset, testset in loo.split(data_all):
    
    algo_svdpp.fit(trainset)
    predictions = algo_svdpp.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.2270


KeyboardInterrupt: 

#### NMF

In [54]:
from surprise import NMF

algo_nmf_1 = NMF(n_factors=500,n_epochs=10,biased=True)
algo_nmf_1.fit(df_trainset_train)

train_pred_nmf_1 = algo_nmf_1.test(df_trainset_test)
print("NMF train biased RMSE", accuracy.rmse(train_pred_nmf_1))
test_pred_nmf_1 = algo_nmf_1.test(df_testset)
print("NMF test unbiased RMSE", accuracy.rmse(test_pred_nmf_1))

RMSE: 1.8887
NMF train biased RMSE 1.8886954324179221
RMSE: 2.0244
NMF test unbiased RMSE 2.024350988002533


In [53]:
from surprise import NMF

algo_nmf_2 = NMF(n_factors=3,n_epochs=5,biased=True)
algo_nmf_2.fit(df_trainset_train)

train_pred_nmf_2 = algo_nmf_2.test(df_trainset_test)
print("NMF train biased RMSE", accuracy.rmse(train_pred_nmf_2))
test_pred_nmf_2 = algo_nmf_2.test(df_testset)
print("NMF test unbiased RMSE", accuracy.rmse(test_pred_nmf_2))

RMSE: 1.0423
NMF train biased RMSE 1.0422730231469668
RMSE: 1.2333
NMF test unbiased RMSE 1.2332936708221804


In [52]:
from surprise import NMF

algo_nmf_3 = NMF(n_factors=2,n_epochs=5,biased=True)
algo_nmf_3.fit(df_trainset_train)

train_pred_nmf_3 = algo_nmf_3.test(df_trainset_test)
print("NMF train biased RMSE", accuracy.rmse(train_pred_nmf_3))
test_pred_nmf_3 = algo_nmf_3.test(df_testset)
print("NMF test unbiased RMSE", accuracy.rmse(test_pred_nmf_3))

RMSE: 1.0171
NMF train biased RMSE 1.0171494921695154
RMSE: 1.2401
NMF test unbiased RMSE 1.2401192122252256


In [58]:
from surprise import NMF

algo_nmf = NMF()
algo_nmf.fit(df_trainset_train)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1033e5748>

In [59]:
train_pred_nmf = algo_nmf.test(df_trainset_test)
print("NMF train biased RMSE", accuracy.rmse(train_pred_nmf))
test_pred_nmf = algo_nmf.test(df_testset)
print("NMF test unbiased RMSE", accuracy.rmse(test_pred_nmf))

RMSE: 0.6135
NMF train biased RMSE 0.6135013087584554
RMSE: 1.3930
NMF test unbiased RMSE 1.3929995422178636


In [55]:
cross_validate(algo_nmf_2, data_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1725  1.1729  1.1715  1.1734  1.1682  1.1717  0.0018  
MAE (testset)     0.9237  0.9249  0.9243  0.9249  0.9206  0.9237  0.0016  
Fit time          4.38    6.95    8.13    6.42    10.26   7.23    1.94    
Test time         1.48    1.75    1.68    2.13    1.73    1.75    0.21    


{'fit_time': (4.378977060317993,
  6.947607040405273,
  8.1286461353302,
  6.4235310554504395,
  10.257351160049438),
 'test_mae': array([0.92372323, 0.92493659, 0.92429811, 0.92490481, 0.92059365]),
 'test_rmse': array([1.17247172, 1.17287832, 1.17151232, 1.17341319, 1.16824885]),
 'test_time': (1.4765958786010742,
  1.7464051246643066,
  1.6839540004730225,
  2.1256210803985596,
  1.7344229221343994)}

In [29]:
from surprise import NMF

algo_nmf = NMF()

for trainset, testset in loo.split(data_all):
    
    algo_nmf.fit(trainset)
    predictions = algo_nmf.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.3859
RMSE: 1.3855
RMSE: 1.3893
RMSE: 1.3844
RMSE: 1.3880


#### CoClustering

In [44]:
from surprise import CoClustering

algo_clu_2 = CoClustering(n_cltr_u=3, n_cltr_i=2, n_epochs=5)
algo_clu_2.fit(df_trainset_train)

train_pred_clu_2 = algo_clu_2.test(df_trainset_test)
print("CoClustering train biased RMSE", accuracy.rmse(train_pred_clu_2))
test_pred_clu_2 = algo_clu_2.test(df_testset)
print("CoClustering test unbiased RMSE", accuracy.rmse(test_pred_clu_2))

RMSE: 0.9696
CoClustering train biased RMSE 0.969627115190611
RMSE: 1.3367
CoClustering test unbiased RMSE 1.3367309507289586


In [45]:
from surprise import CoClustering

algo_clu_1 = CoClustering(n_cltr_u=2, n_cltr_i=2, n_epochs=5)
algo_clu_1.fit(df_trainset_train)

train_pred_clu_1 = algo_clu_1.test(df_trainset_test)
print("CoClustering train biased RMSE", accuracy.rmse(train_pred_clu_1))
test_pred_clu_1 = algo_clu_1.test(df_testset)
print("CoClustering test unbiased RMSE", accuracy.rmse(test_pred_clu_1))

RMSE: 0.9823
CoClustering train biased RMSE 0.9822603928968215
RMSE: 1.3331
CoClustering test unbiased RMSE 1.3330527088008828


In [41]:
from surprise import CoClustering

algo_clu_3 = CoClustering(n_cltr_u=2, n_cltr_i=3, n_epochs=10)
algo_clu_3.fit(df_trainset_train)

train_pred_clu_3 = algo_clu_3.test(df_trainset_test)
print("CoClustering train biased RMSE", accuracy.rmse(train_pred_clu_3))
test_pred_clu_3 = algo_clu_3.test(df_testset)
print("CoClustering test unbiased RMSE", accuracy.rmse(test_pred_clu_3))

RMSE: 0.9813
CoClustering train biased RMSE 0.9813254915818097
RMSE: 1.3329
CoClustering test unbiased RMSE 1.3328625920689292


In [55]:
from surprise import CoClustering

algo_clu = CoClustering()
algo_clu.fit(df_trainset_train)

<surprise.prediction_algorithms.co_clustering.CoClustering at 0x1451aa1d0>

In [56]:
train_pred_clu = algo_clu.test(df_trainset_test)
print("CoClustering train biased RMSE", accuracy.rmse(train_pred_clu))
test_pred_clu = algo_clu.test(df_testset)
print("CoClustering test unbiased RMSE", accuracy.rmse(test_pred_clu))

RMSE: 0.9677
CoClustering train biased RMSE 0.9677336945010659
RMSE: 1.3402
CoClustering test unbiased RMSE 1.3401557418150414


In [47]:
cross_validate(algo_clu_3, data_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2743  1.2712  1.2698  1.2734  1.2712  1.2720  0.0016  
MAE (testset)     0.9498  0.9486  0.9482  0.9507  0.9490  0.9492  0.0009  
Fit time          15.92   15.75   21.61   15.73   13.82   16.57   2.64    
Test time         3.08    1.01    2.13    0.90    0.89    1.60    0.88    


{'fit_time': (15.916110038757324,
  15.751596212387085,
  21.609081983566284,
  15.73489499092102,
  13.816364049911499),
 'test_mae': array([0.94981774, 0.9485647 , 0.9481505 , 0.95067871, 0.94895787]),
 'test_rmse': array([1.27425011, 1.27124372, 1.26976499, 1.27342349, 1.27121491]),
 'test_time': (3.0831899642944336,
  1.0070009231567383,
  2.1267809867858887,
  0.8989031314849854,
  0.8862819671630859)}

In [28]:
from surprise import CoClustering

algo_clu = CoClustering()

for trainset, testset in loo.split(data_all):
    
    algo_clu.fit(trainset)
    predictions = algo_clu.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.3439
RMSE: 1.3483
RMSE: 1.3454
RMSE: 1.3469
RMSE: 1.3460


#### Ensemble

In [61]:
test = pd.DataFrame(test_pred)

In [62]:
test['est_cluster'] = pd.DataFrame(test_pred_clu_3)['est']
test['est_svd'] = pd.DataFrame(test_pred_svd_1)['est']
test['est_svdpp'] = pd.DataFrame(test_pred_svdpp_1)['est']
test['est_nmf'] = pd.DataFrame(test_pred_nmf_2)['est']
#test['est_knn'] = pd.DataFrame(test_pred_knn)['est']

In [63]:
test['est_mean'] = test[['est','est_cluster','est_svd','est_svdpp','est_nmf']].mean(axis=1)

In [68]:
test.to_csv("test_est.csv")

In [64]:
train = pd.DataFrame(train_pred)

In [67]:
train.to_csv("train_est.csv")

In [65]:
train['est_cluster'] = pd.DataFrame(train_pred_clu_3)['est']
train['est_svd'] = pd.DataFrame(train_pred_svd_1)['est']
train['est_svdpp'] = pd.DataFrame(train_pred_svdpp_1)['est']
train['est_nmf'] = pd.DataFrame(train_pred_nmf_2)['est']
#train['est_knn'] = pd.DataFrame(train_pred_knn)['est']

In [66]:
train['est_mean'] = train[['est','est_cluster','est_svd','est_svdpp','est_nmf']].mean(axis=1)

In [69]:
from sklearn.metrics import mean_squared_error
from math import sqrt

train_rmse = sqrt(mean_squared_error(train['r_ui'], train['est_mean']))
test_rmse = sqrt(mean_squared_error(test['r_ui'], test['est_mean']))

In [70]:
print("ensemble train rmse", train_rmse)
print("ensemble test rmse", test_rmse)

ensemble train rmse 1.0115591563326694
ensemble test rmse 1.2078211477272622


### Blending

In [3]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def train_test_model_performance(clf, X_train, y_train, X_test, y_test):
    # Fit a model by providing X and y from training set
    clf.fit(X_train, y_train)

    # Make prediction on the training data
    y_train_pred = clf.predict(X_train)

    # Make predictions on test data
    y_test_pred = clf.predict(X_test)
    
    train_rmse = sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = sqrt(mean_squared_error(y_test, y_test_pred))

    return train_rmse, test_rmse

In [83]:
#X_train = train[['est','est_cluster','est_svd','est_svdpp','est_nmf']]
#y_train = train['r_ui']
from sklearn.model_selection import train_test_split

X = test[['est','est_cluster','est_svd','est_svdpp','est_nmf']]
y = test['r_ui']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

In [84]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()

train_rmse, test_rmse = train_test_model_performance(reg, X_train, y_train, X_test, y_test)

print('train rmse', train_rmse)
print('test rmse', test_rmse)

train rmse 1.2040537215312836
test rmse 1.2046226769340245


In [87]:
from sklearn.linear_model import Ridge

rig = Ridge(alpha=10)

train_rmse, test_rmse = train_test_model_performance(rig, X_train, y_train, X_test, y_test)

print('train rmse', train_rmse)
print('test rmse', test_rmse)

train rmse 1.2040540469888974
test rmse 1.2046244824045316


In [86]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=20, max_depth=7, min_samples_leaf=10, max_features=3)

train_rmse, test_rmse = train_test_model_performance(rf, X_train, y_train, X_test, y_test)

print('train rmse', train_rmse)
print('test rmse', test_rmse)

train rmse 1.1945114072313194
test rmse 1.2022756054693744


#### LightGBM

In [41]:
import datetime
import lightgbm as lgb
import numpy as np
import os
import pandas as pd
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import haversine

ImportError: No module named 'haversine'