In [2]:
import os

In [3]:
os.chdir('../')

In [4]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import surprise 
from sklearn.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import SVD
from IESEGRecSys import eval
from surprise import KNNBasic, SVD, CoClustering, Dataset, Reader

read tables

In [5]:
artist_df = pd.read_table('Data-20220224/Data Group Assignment/Artists.dat')
tags_df = pd.read_table('Data-20220224/Data Group Assignment/tags.dat',encoding='latin-1')
user_artists_df = pd.read_table('Data-20220224/Data Group Assignment/user_artists.dat')
user_taggedartists_df = pd.read_table('Data-20220224/Data Group Assignment/user_taggedartists.dat')


In [7]:
user_taggedartists_df.head()

Unnamed: 0,userID_x,artistID,weight,userID_y,tagID,day,month,year
0,2,51,13883,4.0,16.0,1.0,12.0,2007.0
1,2,51,13883,4.0,25.0,1.0,3.0,2007.0
2,2,51,13883,43.0,16.0,1.0,10.0,2010.0
3,2,51,13883,43.0,25.0,1.0,10.0,2010.0
4,2,51,13883,62.0,16.0,1.0,11.0,2009.0


In [6]:
 # merge together to get the full list of artist IDs
user_taggedartists_df=pd.merge(user_artists_df,user_taggedartists_df, on="artistID",how='left').fillna(0)

In [120]:
# drop uneeded columns
user_taggedartists_df = user_taggedartists_df[["userID_x","artistID","tagID"]]
user_taggedartists_df.columns =["userID","artistID","tagID"]

## working on dataset for content based rec sys

In [121]:
#merging the dataset for content-based 
df_content_based = pd.merge(user_taggedartists_df,tags_df,on="tagID",how="left").fillna(0)

In [115]:
#pivoting the table (creating a dummies table)
df_content_based_1 = df_content_based.pivot_table(index="artistID", columns = "tagValue", aggfunc={'tagValue':np.count_nonzero}).fillna(0)

In [135]:
cb= pd.DataFrame(df_content_based_1)

## collaborative filtering data prep

In [138]:
#discretize weights using qcuts 
user_artists_df['weight_quantiles'] = pd.qcut(user_artists_df['weight'],
                           q=[0,.2,.4,.6,.8,1],
                           labels=False,
                           precision=0)
user_artists_df=user_artists_df.drop(["weight"], axis = 1)

## Train test split

In [139]:
# train-test split
train, test = train_test_split(user_artists_df, test_size=0.3, random_state=123)

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(train.shape)
print(test.shape)

(64983, 3)
(27851, 3)


In [140]:
# check whether the artisit IDs are all included in the content based matrix 
user_artists_df[-user_artists_df["artistID"].isin(cb.index.values)]

Unnamed: 0,userID,artistID,weight_quantiles


## creating surprise datasets

In [142]:
reader = surprise.Reader(rating_scale=(1, 5))
df_train = surprise.Dataset.load_from_df(train, reader).build_full_trainset()
df_test = list(test.itertuples(index=False, name=None))

## Applying collaborative filtering models

In [129]:
# user-based
options = {'name':'cosine', 'user_based':True}
ub = KNNBasic(k=15, min_k=5, sim_options=options, random_state=42)

# item-based
options = {'name':'cosine', 'user_based':False}
ib = KNNBasic(k=15, min_k=5, sim_options=options, random_state=42)

# svd
mf = SVD(n_factors=20, biased=False, random_state=42)

# clustering
clust = CoClustering(n_cltr_u=10, n_cltr_i=10, n_epochs=50, random_state=42)


In [130]:
#evaluating the models
models = {"UB_15":ub, "IB_15":ib, "SVD_20":mf, "Clust_10_10":clust}
overview = pd.concat([eval.evaluate(mod.fit(df_train).test(df_test), topn=5, rating_cutoff=4) for mod in models.values()], axis=1)
overview.columns = list(models.keys())
overview

Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  overview = pd.concat([eval.evaluate(mod.fit(df_train).test(df_test), topn=5, rating_cutoff=4) for mod in models.values()], axis=1)


Unnamed: 0,UB_15,IB_15,SVD_20,Clust_10_10
RMSE,1.393631,1.11718,1.217409,1.07496
MAE,1.18393,0.897713,0.968265,0.864092
Recall,0.00018,0.134892,0.080216,0.156115
Precision,0.25,0.976562,0.867704,0.745704
F1,0.000359,0.237042,0.146855,0.25818
NDCG@5,0.866736,0.861017,0.868794,0.860111


In [123]:
print(len(df_content_based_1.columns))
print(len(pd.unique(df_content_based_1.columns)))

9749
9749


## Applying content based rec sys

In [143]:
from sklearn.metrics.pairwise import cosine_similarity
import surprise

# Content Based as a Class
class ContentBased:

    def __init__(self, NN):
        self.NN = NN
        self.fitted = {"content":False, "ratings":False}
        
    def fit(self, content_data):

        self.items = content_data.index.values
        self.item_dim = len(self.items)
        # check for duplicate items
        assert (len(self.items) == len(set(self.items))), "Duplicate items in content data!"

        # compute similarity
        self.matrix = cosine_similarity(content_data.values)
        np.fill_diagonal(self.matrix, 0)
        
        self.matrixNN = self.matrix.copy()

        # filter similarity matrix for NN nearest neighbors (constraint: non-negative similarity)
        for i in range(self.item_dim):
            crit_val = max(-np.sort(-self.matrix[i])[self.NN-1], 0)
            self.matrixNN[i][self.matrixNN[i] < crit_val] = 0.0
        
        self.fitted["content"] = True

    # helper -> transform surprise.trainset.Trainset to pd.DataFrame
    def _trainset2list(self, trainset):
        return pd.DataFrame([(trainset.to_raw_uid(u), trainset.to_raw_iid(i), r) for (u, i, r) in trainset.all_ratings()], columns=["user", "item", "rating"])

    def fit_ratings(self, df):

        if not self.fitted["content"]:
            raise Exception("Fit model on content data!")

        if isinstance(df, surprise.trainset.Trainset):
            df = self._trainset2list(df)
        
        # fix unknown items
        unknown_items = list(set(df["item"]) - set(self.items))
        if len(unknown_items) > 0:
            print(f"Warning {len(unknown_items)} items are not included in content data: {unknown_items}")
        df = df[df["item"].isin(self.items)].reset_index(drop=True)

        # store user data
        self.users = np.unique(df["user"])
        self.user_dim = len(self.users)

        # fix missing items
        missing_items = list(set(self.items) - set(df["item"]))
        if len(missing_items) > 0: 
            fix_df = pd.DataFrame([{"user":np.nan, "item":i, "rating":np.nan} for i in missing_items])
            df = df.append(fix_df).reset_index(drop=True)

        # pivot 
        df_pivot = df.pivot_table(index='user', values='rating', columns='item', dropna=False).reindex(self.users)

        # row-wise (user) average
        self.user_avg = np.array(np.mean(df_pivot, axis=1))
        self.global_mean = np.mean(self.user_avg)

        # center ratings
        df_pivot = df_pivot.sub(self.user_avg, axis=0).fillna(0)

        # predict ratings for each item 
        denom = self.matrixNN.sum(axis=0) # column sums
        self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]

        # replace NA values with mean
        # prediction[np.isnan(prediction)] = self.global_mean

        self.fitted["ratings"] = True
    
    # get predicted value for user-item combination
    def predict(self, user, item, r_ui=None):
        details = {"was_impossible":False}

        # check whether user and item are unknown -> default = global average rating
        if self.knows_user(user) & self.knows_item(item):

            # convert user & item in internal ids
            iid = np.where(self.items == item)[0].item()
            uid = np.where(self.users == user)[0].item()

            # inference prediction
            est = self.prediction[uid, iid]
            
            if np.isnan(est): 
                est = self.global_mean
                details["was_impossible"] = True
            return surprise.Prediction(user, item, r_ui, est, details)
        
        else:
            details["was_impossible"] = True
            details["reason"] = "User or item unknown"
            return surprise.Prediction(user, item, r_ui, self.global_mean, details)

    # predict entire testset
    def test(self, testset):
        if not self.fitted["ratings"]:
            raise Exception("Fit model on ratings data!")
        return [self.predict(user=u,item=i,r_ui=r) for (u,i,r) in testset]

    def knows_user(self, user):
        return user in self.users   

    def knows_item(self, item):
        return item in self.items        

    # get topn most similar items 
    def get_most_similar(self, item, topn=5):

        # get iid
        if self.knows_item(item):
            iid = np.where(self.items == item)[0].item()
        else:
            raise Exception(f"Item {item} unknown ...")
        
        list_iids = (-self.matrix[iid]).argsort()[:topn]
        return self.items[list_iids]

    def get_similarities(self):
        print('Cosine similarities shape: ({}, {}) items x items'.format(self.item_dim, self.item_dim))
        return self.matrix

In [144]:
# init content-based
cb = ContentBased(NN=10)

# fit on content
cb.fit(df_content_based_1)

# fit on train_ratings
cb.fit_ratings(df_train)
    
# predict test ratings
cb_pred = cb.test(df_test)

  self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]


In [145]:
# compute metrics for CB RS
cb_res = eval.evaluate(cb_pred, topn=5, rating_cutoff=4).rename(columns={'value':'Content_based_10'})
cb_res

Unnamed: 0,Content_based_10
RMSE,0.942734
MAE,0.712091
Recall,0.147482
Precision,0.991536
F1,0.256772
NDCG@5,0.869522


In [127]:
#get summary for all models applied
overview = pd.concat([overview, cb_res], axis=1)
overview

Unnamed: 0,UB_15,IB_15,SVD_20,Clust_10_10,Content_based_10
RMSE,1.393631,1.11718,1.217409,1.07496,0.963876
MAE,1.18393,0.897713,0.968265,0.864092,0.730288
Recall,0.00018,0.134892,0.080216,0.156115,0.151079
Precision,0.25,0.976562,0.867704,0.745704,0.927152
F1,0.000359,0.237042,0.146855,0.25818,0.259821
NDCG@5,0.866736,0.861017,0.868794,0.860111,0.870427
