In [1]:
import os

In [2]:
os.chdir('/Users/dimitrikestenbaum/Desktop/RecSys')

In [3]:
import pandas as pd 
import numpy as np 
from IESEGRecSys import eval
from IESEGRecSys.model import ContentBased
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, KNNBasic, SVD
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

## Hybrid Recommender Systems

### Load Data

In [6]:
user_artist_categorized = pd.read_csv('user_artists_categorized.csv')

In [7]:
user_artist_categorized.head()

Unnamed: 0,userID,artistID,weight_quantiles
0,2,51,4
1,2,52,4
2,2,53,4
3,2,54,4
4,2,55,4


### `Train` / `Test` Split

In [33]:
#create train and test sets 
UA_train, UA_test = train_test_split(user_artist_categorized,test_size=0.3,random_state=123)

UA_train = UA_train.reset_index(drop=True)
UA_test = UA_test.reset_index(drop=True)

In [34]:
#create reader object 
reader = Reader(rating_scale=(1,5)) #1:5 scale 

#create surprise train and test set objects
data = Dataset.load_from_df(user_artist_categorized[["userID","artistID","weight_quantiles"]], reader)
UA_train = Dataset.load_from_df(UA_train, reader).build_full_trainset()
UA_test = list(UA_test.itertuples(index=False, name=None))

## Hybrid Recommender #1: Simple Average Predictions of `SVD` and `BaselineOnly` Algos

The reasoning for averaging the predictions of these two methods lies in that they were the two best performing Collaborative Filtering algorithms. This can be seen in the `Collaborative Filtering Models` section of this notebook.

For the SVD model we can use the hyperparameters 

In [37]:
#get baseline KNN score 
from surprise import KNNBasic


svd = SVD(n_factors=20, n_epochs=20, biased=True,random_state=123)

#create cosine similarity matrix
svd.fit(UA_train)\
.compute_similarities()

array([[1.   , 0.5  , 0.2  , ..., 0.   , 0.   , 0.   ],
       [0.5  , 1.   , 0.375, ..., 0.   , 0.   , 1.   ],
       [0.2  , 0.375, 1.   , ..., 0.   , 0.   , 0.5  ],
       ...,
       [0.   , 0.   , 0.   , ..., 1.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , ..., 0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.5  , ..., 0.   , 0.   , 1.   ]])

In [38]:
svd_preds = svd.test(UA_test)

In [39]:
models = {"UB_KNN":ub_preds, "SVD":svd_preds}
overview = pd.concat([eval.evaluate(mod, topn=5, rating_cutoff=2) for mod in models.values()], axis=1)
overview.columns = list(models.keys())
overview

Unnamed: 0,UB_KNN,SVD
RMSE,1.394226,0.907108
MAE,1.184893,0.737216
Recall,0.615016,0.784936
Precision,0.708707,0.914353
F1,0.658546,0.844716
NDCG@5,0.866164,0.871942


In [40]:
# Combine predictions (mean)

# extract predictions content-based and item-based
df_pred_ub_KNN, df_pred_svd = pd.DataFrame(ub_preds), pd.DataFrame(svd_preds)

df_hybrid = df_pred_ub_KNN.copy()
df_hybrid['est'] = (np.array(df_pred_ub_KNN['est']) + np.array(df_pred_svd['est'])) / 2

df_hybrid.head()

Unnamed: 0,uid,iid,r_ui,est,details
0,843,10570,0,1.498138,"{'was_impossible': True, 'reason': 'User and/o..."
1,189,492,3,2.57333,"{'actual_k': 15, 'was_impossible': False}"
2,227,543,1,2.129309,"{'actual_k': 20, 'was_impossible': False}"
3,1340,10662,3,2.410101,"{'was_impossible': True, 'reason': 'User and/o..."
4,517,703,2,1.952635,"{'actual_k': 19, 'was_impossible': False}"


In [41]:
eval.evaluate(df_hybrid,topn=5,rating_cutoff=2)

Unnamed: 0,value
RMSE,1.088568
MAE,0.929834
Recall,0.842234
Precision,0.853321
F1,0.847742
NDCG@5,0.864574


In [42]:
df_pred_ub_KNN

Unnamed: 0,uid,iid,r_ui,est,details
0,843,10570,0,1.996276,"{'was_impossible': True, 'reason': 'User and/o..."
1,189,492,3,2.458521,"{'actual_k': 15, 'was_impossible': False}"
2,227,543,1,2.038612,"{'actual_k': 20, 'was_impossible': False}"
3,1340,10662,3,1.996276,"{'was_impossible': True, 'reason': 'User and/o..."
4,517,703,2,2.528017,"{'actual_k': 19, 'was_impossible': False}"
...,...,...,...,...,...
27846,1579,1109,4,2.410597,"{'actual_k': 17, 'was_impossible': False}"
27847,1795,485,2,2.899309,"{'actual_k': 20, 'was_impossible': False}"
27848,1003,159,3,2.947368,"{'actual_k': 19, 'was_impossible': False}"
27849,1323,1378,3,2.181689,"{'actual_k': 16, 'was_impossible': False}"


In [43]:
df_pred_svd.rename(columns={'est':'svd_est','r_ui':'target'},inplace=True)

In [45]:
df_pred_svd = df_pred_svd[['target','svd_est']]

In [46]:
df_pred_svd

Unnamed: 0,target,svd_est
0,0,1.000000
1,3,2.688139
2,1,2.220006
3,3,2.823927
4,2,1.377254
...,...,...
27846,4,3.133586
27847,2,1.810834
27848,3,2.578435
27849,3,2.231235


In [48]:
df_pred_ub_KNN = df_pred_ub_KNN.rename(columns={'est':'ub_KNN'})

In [49]:
df_pred_ub_KNN = df_pred_ub_KNN[['ub_KNN']]

In [57]:
data = pd.concat([df_pred_svd,df_pred_ub_KNN],axis=1)

In [58]:
rf_train, rf_test = train_test_split(data,test_size=0.2,random_state=123)

In [59]:
from sklearn.ensemble import RandomForestRegressor

X, y = rf_train.loc[:,rf_train.columns != 'target'], np.array(rf_train['target'])

# fit random forest model
rf_model = RandomForestRegressor(max_depth=4, n_estimators=100).fit(X,y)

In [63]:
rf_test.drop(columns=['target'],inplace=True)
rf_test.reset_index(drop=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [67]:
# predict
rf_pred = rf_model.predict(rf_test)

# transform in surprise format
rf_pred

array([3.87586898, 3.08027227, 1.0969231 , ..., 3.05394607, 2.13223222,
       2.81834731])