# SVD++ on the MovieLens Dataset

## Preparation

In [1]:
# Install the packages listed in ../requirements.txt via the %pip line magic.
# As the captured output notes, a kernel restart may be needed afterwards
# before newly installed or updated packages can be imported.
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import sys, os
sys.path.append(os.path.abspath("../"))

import pandas as pd

from core.utils import load_algo, load_cv, load_gscv
from api.model import recommend_top_k

# Lift pandas' display limits for this notebook: no cap on the number of rows
# or columns shown, and no wrapping of wide frames across several lines.
for option_name, option_value in (
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
):
    pd.set_option(option_name, option_value)

## Model Training

In [4]:
# Fetch the grid-search object registered under "gs_svdpp"
# (presumably a previously persisted search result -- confirm in core.utils).
gs = load_gscv("gs_svdpp")

# Tabulate the search results; the displayed output has one row per
# (n_factors, n_epochs) combination.
gs_df = pd.DataFrame(gs.cv_results)

# Strip the timing statistics and the raw "params" column so only the
# per-split scores, their aggregates, the ranks and the param_* columns remain.
abbr_gs_df = gs_df.drop(
    columns=[
        "mean_fit_time",
        "std_fit_time",
        "mean_test_time",
        "std_test_time",
        "params",
    ]
)

# Best RMSE rank first; show at most the ten top-ranked rows.
abbr_gs_df.sort_values(by="rank_test_rmse").head(10)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,split3_test_mae,split4_test_mae,split5_test_mae,split6_test_mae,split7_test_mae,split8_test_mae,split9_test_mae,mean_test_mae,std_test_mae,rank_test_mae,param_n_factors,param_n_epochs
0,0.864887,0.880981,0.86052,0.853487,0.874007,0.853962,0.867508,0.867223,0.863026,0.855989,0.864159,0.008368,1,0.661112,0.67226,0.655735,0.655025,0.667694,0.652707,0.658314,0.660849,0.664103,0.65369,0.660149,0.006057,1,10,30
1,0.861799,0.884222,0.868145,0.854583,0.873559,0.851737,0.874506,0.868129,0.861344,0.857809,0.865583,0.009535,2,0.656073,0.676955,0.659415,0.654254,0.667645,0.6504,0.665916,0.661189,0.663205,0.6536,0.660865,0.00753,2,15,30
2,0.872465,0.876223,0.867043,0.857584,0.873722,0.855937,0.869811,0.87174,0.86116,0.8601,0.866579,0.006939,3,0.662949,0.669957,0.661512,0.657612,0.669772,0.651354,0.659055,0.663175,0.663239,0.656472,0.66151,0.005451,3,20,30
3,0.86307,0.883448,0.862809,0.855871,0.882036,0.859367,0.873543,0.875195,0.862917,0.864983,0.868324,0.009083,4,0.659232,0.675414,0.657151,0.656962,0.674913,0.656777,0.662114,0.664593,0.663211,0.66047,0.663084,0.006552,4,25,30
4,0.870796,0.884083,0.871962,0.861554,0.87709,0.852597,0.883196,0.872537,0.862723,0.861579,0.869812,0.009651,5,0.666742,0.674809,0.665369,0.661747,0.669499,0.652385,0.670769,0.664449,0.663707,0.659002,0.664848,0.006003,5,30,30


In [4]:
# Fetch the grid-search object registered under "gs_svdpp_full"
# (presumably a previously persisted search result -- confirm in core.utils).
gs = load_gscv("gs_svdpp_full")

# Tabulate the search results; the displayed output has one row per
# (n_factors, n_epochs) combination.
gs_df = pd.DataFrame(gs.cv_results)

# Strip the timing statistics and the raw "params" column so only the
# per-split scores, their aggregates, the ranks and the param_* columns remain.
abbr_gs_df = gs_df.drop(
    columns=[
        "mean_fit_time",
        "std_fit_time",
        "mean_test_time",
        "std_test_time",
        "params",
    ]
)

# Best RMSE rank first; show at most the ten top-ranked rows.
abbr_gs_df.sort_values(by="rank_test_rmse").head(10)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,split3_test_mae,split4_test_mae,split5_test_mae,split6_test_mae,split7_test_mae,split8_test_mae,split9_test_mae,mean_test_mae,std_test_mae,rank_test_mae,param_n_factors,param_n_epochs
1,0.861361,0.880067,0.861574,0.852406,0.868493,0.850579,0.868326,0.864367,0.857889,0.853524,0.861859,0.008519,1,0.658336,0.67356,0.656493,0.651789,0.665667,0.648189,0.660739,0.657286,0.662005,0.651905,0.658597,0.007041,1,11,30
3,0.858438,0.882853,0.863508,0.854463,0.871949,0.847198,0.867896,0.863455,0.856123,0.854399,0.862028,0.009767,2,0.656093,0.675463,0.656378,0.658306,0.66674,0.646033,0.662361,0.655968,0.659574,0.65376,0.659068,0.007504,2,13,30
0,0.866854,0.876573,0.863618,0.853099,0.867666,0.849814,0.871666,0.870364,0.851586,0.859638,0.863088,0.008741,3,0.6621,0.670113,0.659809,0.654781,0.664181,0.648522,0.662678,0.661606,0.657207,0.656348,0.659734,0.005598,3,10,30
2,0.863191,0.878442,0.867387,0.856615,0.871561,0.848661,0.871397,0.864252,0.854714,0.862226,0.863845,0.008436,4,0.658731,0.671304,0.660126,0.657122,0.666413,0.65019,0.661949,0.657024,0.658098,0.658559,0.659952,0.005416,4,12,30
4,0.866449,0.879427,0.864143,0.859821,0.871437,0.848155,0.873839,0.867988,0.865091,0.861222,0.865757,0.008115,5,0.65828,0.671234,0.656577,0.657622,0.666089,0.648652,0.66374,0.658522,0.665234,0.656676,0.660262,0.006052,5,14,30


In [4]:
# Fetch the grid-search object registered under "gs_svdpp_reg"
# (presumably a previously persisted search result -- confirm in core.utils).
gs = load_gscv("gs_svdpp_reg")

# Tabulate the search results; the displayed output has one row per
# (n_factors, n_epochs, reg_all) combination.
gs_df = pd.DataFrame(gs.cv_results)

# Strip the timing statistics and the raw "params" column so only the
# per-split scores, their aggregates, the ranks and the param_* columns remain.
abbr_gs_df = gs_df.drop(
    columns=[
        "mean_fit_time",
        "std_fit_time",
        "mean_test_time",
        "std_test_time",
        "params",
    ]
)

# Best RMSE rank first; show at most the ten top-ranked rows.
abbr_gs_df.sort_values(by="rank_test_rmse").head(10)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,split3_test_mae,split4_test_mae,split5_test_mae,split6_test_mae,split7_test_mae,split8_test_mae,split9_test_mae,mean_test_mae,std_test_mae,rank_test_mae,param_n_factors,param_n_epochs,param_reg_all
2,0.850419,0.868655,0.847337,0.843734,0.861193,0.835849,0.861719,0.852287,0.845969,0.84747,0.851463,0.009294,1,0.651454,0.666362,0.646932,0.647867,0.661674,0.638964,0.657459,0.64967,0.652295,0.647895,0.652057,0.007517,1,11,30,0.05
3,0.859141,0.872553,0.859322,0.846489,0.865926,0.846742,0.865597,0.862149,0.850779,0.849678,0.857838,0.008561,2,0.659857,0.670474,0.658105,0.651148,0.666042,0.649386,0.662905,0.657033,0.656921,0.652303,0.658417,0.006338,2,11,30,0.1
1,0.864552,0.877304,0.866668,0.851474,0.874105,0.850279,0.868808,0.861276,0.858302,0.858468,0.863124,0.008471,3,0.65955,0.669437,0.661768,0.653385,0.668544,0.649526,0.660743,0.656033,0.661504,0.654148,0.659464,0.006073,3,11,30,0.02
0,0.87247,0.892705,0.886563,0.861089,0.884948,0.858936,0.882002,0.879771,0.874112,0.877776,0.877037,0.010212,4,0.666735,0.680572,0.673252,0.657593,0.675972,0.656155,0.67245,0.669029,0.672474,0.668868,0.66931,0.007238,4,11,30,0.01


In [None]:
# Fetch the grid-search object registered under "gs_svdpp_reg_full"
# (presumably a previously persisted search result -- confirm in core.utils).
# NOTE(review): this cell has no execution count or output in the saved notebook.
gs = load_gscv("gs_svdpp_reg_full")

# Tabulate the search results as a DataFrame.
gs_df = pd.DataFrame(gs.cv_results)

# Strip the timing statistics and the raw "params" column so only the
# score, rank and param_* columns remain.
abbr_gs_df = gs_df.drop(
    columns=[
        "mean_fit_time",
        "std_fit_time",
        "mean_test_time",
        "std_test_time",
        "params",
    ]
)

# Best RMSE rank first; show at most the ten top-ranked rows.
abbr_gs_df.sort_values(by="rank_test_rmse").head(10)

## Model Evaluation

In [3]:
# Load the cross-validation results stored under "cv_svdpp"
# (presumably a dict mapping each measure name to one value per fold -- confirm in core.utils).
cv_dict = load_cv("cv_svdpp")

# Build the per-fold table first, then label the rows from its actual length
# instead of assuming exactly 10 folds.
cv_df = pd.DataFrame(cv_dict)
cv_df.index = [f"Fold {k}" for k in range(1, len(cv_df) + 1)]

# Compute BOTH summary statistics from the fold rows before appending either.
# Previously "Mean" was appended first, so the following std() ran over
# 11 rows (the folds plus their own mean) rather than over the folds.
# ddof=0 (population standard deviation over the folds) is numerically what
# that 11-row sample std amounted to, so the reported figures are unchanged;
# use ddof=1 here if the sample standard deviation is wanted instead.
fold_mean = cv_df.mean(axis=0)
fold_std = cv_df.std(axis=0, ddof=0)

cv_df.loc["Mean"] = fold_mean
cv_df.loc["Std"] = fold_std

# Hide the timing columns and put measures on rows, folds/summary on columns.
cv_df.drop(columns=["fit_time", "test_time"]).transpose()

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Fold 6,Fold 7,Fold 8,Fold 9,Fold 10,Mean,Std
test_rmse,0.860447,0.872668,0.861618,0.85679,0.87082,0.849782,0.872113,0.866461,0.855228,0.862088,0.862802,0.007295
test_mae,0.658655,0.66936,0.657299,0.656406,0.666814,0.647455,0.663625,0.654969,0.658269,0.659039,0.659189,0.00591
test_prec,0.712614,0.698475,0.696766,0.698909,0.69982,0.70405,0.70889,0.705502,0.683477,0.701791,0.701029,0.007513
test_rec,0.698362,0.680976,0.679601,0.696713,0.685081,0.684686,0.693309,0.68101,0.673372,0.688546,0.686166,0.007618
test_f1,0.68343,0.664729,0.664685,0.675148,0.667424,0.667985,0.676611,0.668511,0.655419,0.670279,0.669422,0.007272
