In [125]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os
import surprise 
from sklearn.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import SVD
from IESEGRecSys import eval
from surprise import KNNBasic, SVD, CoClustering, Dataset, Reader

In [83]:
artist_df = pd.read_table('Data/Data Group Assignment/Artists.dat')
tags_df = pd.read_table('Data/Data Group Assignment/tags.dat',encoding='latin-1')
user_artists_df = pd.read_table('Data/Data Group Assignment/user_artists.dat')
user_taggedartists_df = pd.read_table('Data/Data Group Assignment/user_taggedartists.dat')


In [84]:
user_artists_df.head()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [86]:
tags_df.head()

Unnamed: 0,tagID,tagValue
0,1,metal
1,2,alternative metal
2,3,goth rock
3,4,black metal
4,5,death metal


In [87]:
user_taggedartists_df

Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,41,1,4,2009
...,...,...,...,...,...,...
186474,2100,16437,4,1,7,2010
186475,2100,16437,292,1,5,2010
186476,2100,16437,2087,1,7,2010
186477,2100,16437,2801,1,5,2010


In [88]:
#merging the dataset for content-based 
df_content_based = pd.merge(user_taggedartists_df,tags_df,on="tagID",how="left")

In [89]:
#number of genres
df_content_based["tagValue"].nunique()

9749

In [90]:
# Creating dummy variables:
df_content_based_1 = pd.get_dummies(df_content_based,prefix='', prefix_sep="" , columns=['tagValue'])
df_content_based_1 = df_content_based_1.drop(['userID', 'artistID','day','month','year'], axis=1)
df_content_based_1 = df_content_based_1.set_index("tagid")
df_content_based_1.head()

Unnamed: 0,userID,artistID,tagID,day,month,year,'80s,-pearl fashion music,0 play yet,00,...,zikirli,zmiel pierogi,zmierzch,zombie,zombie rave,zombieland,zoocore,zornish,ztt,zu
0,2,52,13,1,4,2009,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,52,15,1,4,2009,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,52,18,1,4,2009,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,52,21,1,4,2009,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,52,41,1,4,2009,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
#discretize weights using qcuts 
user_artists_df['weight_quantiles'] = pd.qcut(user_artists_df['weight'],
                           q=[0,.2,.4,.6,.8,1],
                           labels=False,
                           precision=0)
user_artists_df=user_artists_df.drop(["weight"], axis = 1)

In [93]:
# train-test split
train, test = train_test_split(user_artists_df, test_size=0.3, random_state=123)

# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

print(train.shape)
print(test.shape)

(64983, 3)
(27851, 3)


In [94]:
reader = surprise.Reader(rating_scale=(1, 5))
df_train = surprise.Dataset.load_from_df(train, reader).build_full_trainset()
df_test = list(test.itertuples(index=False, name=None))

In [95]:
# user-based
options = {'name':'cosine', 'user_based':True}
ub = KNNBasic(k=15, min_k=5, sim_options=options, random_state=42)

# item-based
options = {'name':'cosine', 'user_based':False}
ib = KNNBasic(k=15, min_k=5, sim_options=options, random_state=42)

# svd
mf = SVD(n_factors=20, biased=False, random_state=42)

# clustering
clust = CoClustering(n_cltr_u=10, n_cltr_i=10, n_epochs=50, random_state=42)

models = {"UB_15":ub, "IB_15":ib, "SVD_20":mf, "Clust_10_10":clust}
overview = pd.concat([eval.evaluate(mod.fit(df_train).test(df_test), topn=5, rating_cutoff=4) for mod in models.values()], axis=1)
overview.columns = list(models.keys())
overview

Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  overview = pd.concat([eval.evaluate(mod.fit(df_train).test(df_test), topn=5, rating_cutoff=4) for mod in models.values()], axis=1)


Unnamed: 0,UB_15,IB_15,SVD_20,Clust_10_10
RMSE,1.393631,1.11718,1.217409,1.07496
MAE,1.18393,0.897713,0.968265,0.864092
Recall,0.00018,0.134892,0.080216,0.156115
Precision,0.25,0.976562,0.867704,0.745704
F1,0.000359,0.237042,0.146855,0.25818
NDCG@5,0.866736,0.861017,0.868794,0.860111


In [132]:
print(len(df_content_based_1.columns))
print(len(pd.unique(df_content_based_1.columns)))

9748
9748


In [139]:
df_content_based_1

Unnamed: 0_level_0,'80s,-pearl fashion music,0 play yet,00,00's,007,00s,00s rock,1,1008,...,zikirli,zmiel pierogi,zmierzch,zombie,zombie rave,zombieland,zoocore,zornish,ztt,zu
tagID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2087,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
df_content_based_1.groupby('tagID').count().

Unnamed: 0_level_0,'80s,-pearl fashion music,0 play yet,00,00's,007,00s,00s rock,1,1008,...,zikirli,zmiel pierogi,zmierzch,zombie,zombie rave,zombieland,zoocore,zornish,ztt,zu
tagID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1729,1729,1729,1729,1729,1729,1729,1729,1729,1729,...,1729,1729,1729,1729,1729,1729,1729,1729,1729,1729
2,212,212,212,212,212,212,212,212,212,212,...,212,212,212,212,212,212,212,212,212,212
3,22,22,22,22,22,22,22,22,22,22,...,22,22,22,22,22,22,22,22,22,22
4,301,301,301,301,301,301,301,301,301,301,...,301,301,301,301,301,301,301,301,301,301
5,582,582,582,582,582,582,582,582,582,582,...,582,582,582,582,582,582,582,582,582,582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12643,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
12644,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
12645,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
12646,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6


In [116]:
# init content-based
cb = ContentBased(NN=10)

# fit on content
cb.fit(df_content_based_1)

# fit on train_ratings
cb.fit_ratings(df_train)
    
# predict test ratings
cb_pred = cb.test(df_test)

AssertionError: Duplicate items in content data!

In [None]:
# compute metrics
cb_res = eval.evaluate(cb_pred, topn=5, rating_cutoff=4).rename(columns={'value':'Content_based_10'})
cb_res