![](https://www.ieseg.fr/wp-content/uploads/IESEG-Logo-2012-rgb.jpg)


# Content Based

In [61]:
import pandas as pd 
import numpy as np 
from surprise import KNNBasic, SVD, CoClustering, Dataset, Reader, accuracy
from IESEGRecSys.eval import evaluate
from IESEGRecSys import eval
from IESEGRecSys.model import ContentBased
from sklearn.model_selection import train_test_split
import datetime

In [27]:
# Import additional movie data
movie = pd.read_csv('u.item', sep='|', header=None, encoding='latin-1')
movie.columns = ['id','title','release_date','video_release_date','IMDb_URL','unknown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film_Noir','Horror','Musical','Mystery','Romance','Sci_Fi','Thriller','War','Western']
movie.head()

Unnamed: 0,id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [47]:
movie.columns

Index(['id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
       'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
       'very_old', 'old', 'new', 'very_new'],
      dtype='object')

In [28]:
# Import user-rating matrix
data = pd.read_csv('u.data', sep='\t', header=None)
data.columns = ['user', 'item', 'rating', 'timestamp']
data = data[['user', 'item', 'rating']]
data.head()

Unnamed: 0,user,item,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


# Exercise 1

- Create a feature matrix for all movies in the dataset
- Matrix should contain:
    - binary features indicating the genre
    - binary features indicating the release date (i.e. "very old", "old", "new", "very new")

In [29]:

movie['release_date'] = pd.to_datetime(movie['release_date'])
thresholdMin = pd.to_datetime('1922-01-01')
thresholdVeryOld = pd.to_datetime('1941-01-01') # represents the range from 1922 - 1941 
thresholdOld = pd.to_datetime('1960-01-01')# represents the range from 1941 - 1960 
thresholdNew = pd.to_datetime('1979-01-01')# represents the range from 1960 - 1979 
thresholdVeryNew = pd.to_datetime('1998-01-01')# represents the range from 1979 - 1998 

In [30]:
movie['very_old'] = ((movie.release_date >= thresholdMin) & (movie.release_date < thresholdVeryOld)).astype(int)
movie['old'] = ((movie.release_date >= thresholdVeryOld) & (movie.release_date < thresholdOld)).astype(int)
movie['new'] = ((movie.release_date >= thresholdOld) & (movie.release_date < thresholdNew)).astype(int)
movie['very_new'] = ((movie.release_date >= thresholdNew) & (movie.release_date < thresholdVeryNew)).astype(int)

In [31]:
movie.head()

Unnamed: 0,id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Mystery,Romance,Sci_Fi,Thriller,War,Western,very_old,old,new,very_new
0,1,Toy Story (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,2,GoldenEye (1995),1995-01-01,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,1,0,0,0,0,0,1
2,3,Four Rooms (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,4,Get Shorty (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,Copycat (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


# Exercise 2

- split the user-item rating matrix in train (70%) and test (30%)
- fit and evaluate user-based CF, item-based CF, matrix factorization and co-clustering models

In [75]:
userTrain, userTest = train_test_split(data,test_size=0.3,random_state=123)

userTrain = userTrain.reset_index(drop=True)
userTest = userTest.reset_index(drop=True)


In [76]:
reader = Reader(rating_scale=(1,5))

df_train = Dataset.load_from_df(userTrain,reader).build_full_trainset()
df_test = list(userTest.itertuples(index=False, name=None))

In [34]:

def KNN_CF(k,min_k,measure,variation,rando_seed,train,test):
    options = {'name':measure,'user_based':variation}
    global KNNBasic
    global accuracy
    KNN = KNNBasic(k=k,min_k=min_k,options=options,random_state=rando_seed)
    KNN.fit(train)
    predsKNN = KNN.test(test)
    Testaccuracy = accuracy.rmse(predsKNN)
    print(f"KNN RMSE of: ",Testaccuracy)
    return(predsKNN)

## KNN UB

In [35]:
KNN_UB = KNN_CF(k=10,min_k=2,measure='pearson',variation=False,rando_seed=123,train=df_train,test=df_test)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0018
KNN RMSE of:  1.0017854230664607


## KNN UI

In [36]:
KNN_UI = KNN_CF(k=10,min_k=2,measure='cosine',variation=True,rando_seed=123,train=df_train,test=df_test)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0018
KNN RMSE of:  1.0017854230664607


In [37]:
evalKNN_Ub = evaluate(prediction=KNN_UB, topn=5, rating_cutoff=4, excl_impossible=True)
evalKNN_Ub

Excluded 133 (30000) samples. 29867 remaining ...
Excluded 133 (30000) samples. 29867 remaining ...


Unnamed: 0,value
RMSE,0.997612
MAE,0.786492
Recall,0.395534
Precision,0.794934
F1,0.528235
NDCG@5,0.882152


In [38]:
evalKNN_Ui = evaluate(prediction=KNN_UI, topn=5, rating_cutoff=4, excl_impossible=True)
evalKNN_Ui

Excluded 133 (30000) samples. 29867 remaining ...
Excluded 133 (30000) samples. 29867 remaining ...


Unnamed: 0,value
RMSE,0.997612
MAE,0.786492
Recall,0.395534
Precision,0.794934
F1,0.528235
NDCG@5,0.882152


## SVD

In [39]:
#SVD
#initialize and train SVD model
svdMod = SVD(n_factors=200,n_epochs=25,biased=False,random_state=123).fit(df_train)
#predict
svdModPreds = svdMod.test(df_test)
#eval
accuracySVD = accuracy.rmse(svdModPreds)

print(f"SVD RMSE: ",accuracySVD)

RMSE: 0.9798
SVD RMSE:  0.979814783041079


In [40]:
evalSVD = evaluate(prediction=svdModPreds, topn=5, rating_cutoff=4, excl_impossible=True)
evalSVD

Excluded 58 (30000) samples. 29942 remaining ...
Excluded 58 (30000) samples. 29942 remaining ...


Unnamed: 0,value
RMSE,0.977868
MAE,0.774175
Recall,0.295186
Precision,0.873359
F1,0.441238
NDCG@5,0.89212


## Cocluster

In [41]:
#cocluster
#initialize and fit
CoClusterMod = CoClustering(n_cltr_u=10,n_cltr_i=10,n_epochs=50,random_state=123).fit(df_train)
#predict
clustModPreds = CoClusterMod.test(df_test)
#eval
accuracyClust = accuracy.rmse(clustModPreds)

print(f"Cocluster RMSE: ",accuracyClust)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  CoClusterMod = CoClustering(n_cltr_u=10,n_cltr_i=10,n_epochs=50,random_state=123).fit(df_train)


RMSE: 0.9872
Cocluster RMSE:  0.9871584952366663


In [42]:
evalCocluster = evaluate(prediction=clustModPreds, topn=5, rating_cutoff=4, excl_impossible=True)
evalCocluster

Excluded 0 (30000) samples. 30000 remaining ...
Excluded 0 (30000) samples. 30000 remaining ...


Unnamed: 0,value
RMSE,0.987158
MAE,0.769135
Recall,0.439207
Precision,0.811801
F1,0.570018
NDCG@5,0.884549


# Exercise 3

- fit and evaluate the content-based model using <font color="blue">IESEGRecSys.model.ContentBased</font>
    - Hint: 
        - Setup the model with a "NN" (nearest neighbors) parameter i.e. NN=10
        - Fit the model on the movie content (.fit() method).
        - Fit the model on the ratings in the train set (.fit_ratings() method)
        - Predict ratings for the test set.
- compare evaluation performance

```python
# Content Based as a Class
from sklearn.metrics.pairwise import cosine_similarity
import surprise

# Content Based as a Class
class ContentBased:

    def __init__(self, NN):
        self.NN = NN
        self.fitted = {"content":False, "ratings":False}
        
    def fit(self, content_data):

        self.items = content_data.index.values
        self.item_dim = len(self.items)
        # check for duplicate items
        assert (len(self.items) == len(set(self.items))), "Duplicate items in content data!"

        # compute similarity
        self.matrix = cosine_similarity(content_data.values)
        np.fill_diagonal(self.matrix, 0)
        
        self.matrixNN = self.matrix.copy()

        # filter similarity matrix for NN nearest neighbors (constraint: non-negative similarity)
        for i in range(self.item_dim):
            crit_val = max(-np.sort(-self.matrix[i])[self.NN-1], 0)
            self.matrixNN[i][self.matrixNN[i] < crit_val] = 0.0
        
        self.fitted["content"] = True

    # helper -> transform surprise.trainset.Trainset to pd.DataFrame
    def _trainset2list(self, trainset):
        return pd.DataFrame([(trainset.to_raw_uid(u), trainset.to_raw_iid(i), r) for (u, i, r) in trainset.all_ratings()], columns=["user", "item", "rating"])

    def fit_ratings(self, df):

        if not self.fitted["content"]:
            raise Exception("Fit model on content data!")

        if isinstance(df, surprise.trainset.Trainset):
            df = self._trainset2list(df)
        
        # fix unknown items
        unknown_items = list(set(df["item"]) - set(self.items))
        if len(unknown_items) > 0:
            print(f"Warning {len(unknown_items)} items are not included in content data: {unknown_items}")
        df = df[df["item"].isin(self.items)].reset_index(drop=True)

        # store user data
        self.users = np.unique(df["user"])
        self.user_dim = len(self.users)

        # fix missing items
        missing_items = list(set(self.items) - set(df["item"]))
        if len(missing_items) > 0: 
            fix_df = pd.DataFrame([{"user":np.nan, "item":i, "rating":np.nan} for i in missing_items])
            df = df.append(fix_df).reset_index(drop=True)

        # pivot 
        df_pivot = df.pivot_table(index='user', values='rating', columns='item', dropna=False).reindex(self.users)

        # row-wise (user) average
        self.user_avg = np.array(np.mean(df_pivot, axis=1))
        self.global_mean = np.mean(self.user_avg)

        # center ratings
        df_pivot = df_pivot.sub(self.user_avg, axis=0).fillna(0)

        # predict ratings for each item 
        denom = self.matrixNN.sum(axis=0) # column sums
        self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]

        # replace NA values with mean
        # prediction[np.isnan(prediction)] = self.global_mean

        self.fitted["ratings"] = True
    
    # get predicted value for user-item combination
    def predict(self, user, item, r_ui=None):
        details = {"was_impossible":False}

        # check whether user and item are unknown -> default = global average rating
        if self.knows_user(user) & self.knows_item(item):

            # convert user & item in internal ids
            iid = np.where(self.items == item)[0].item()
            uid = np.where(self.users == user)[0].item()

            # inference prediction
            est = self.prediction[uid, iid]
            
            if np.isnan(est): 
                est = self.global_mean
                details["was_impossible"] = True
            return surprise.Prediction(user, item, r_ui, est, details)
        
        else:
            details["was_impossible"] = True
            details["reason"] = "User or item unknown"
            return surprise.Prediction(user, item, r_ui, self.global_mean, details)

    # predict entire testset
    def test(self, testset):
        if not self.fitted["ratings"]:
            raise Exception("Fit model on ratings data!")
        return [self.predict(user=u,item=i,r_ui=r) for (u,i,r) in testset]

    def knows_user(self, user):
        return user in self.users   

    def knows_item(self, item):
        return item in self.items        

    # get topn most similar items 
    def get_most_similar(self, item, topn=5):

        # get iid
        if self.knows_item(item):
            iid = np.where(self.items == item)[0].item()
        else:
            raise Exception(f"Item {item} unknown ...")
        
        list_iids = (-self.matrix[iid]).argsort()[:topn]
        return self.items[list_iids]

    def get_similarities(self):
        print('Cosine similarities shape: ({}, {}) items x items'.format(self.item_dim, self.item_dim))
        return self.matrix
```

In [80]:
#keep everything besides title and released which is a string and timestamp
movie_features = movie[['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
       'very_old', 'old', 'new', 'very_new']]

cbMod = ContentBased(NN=10)
cbMod.fit(movie_features)
cbMod._trainset2list(df_train)
cbMod.fit_ratings(df_train)
cbPreds = cbMod.test(df_test)

  self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]


In [82]:
evalCb = evaluate(prediction=cbPreds, topn=5, rating_cutoff=4, excl_impossible=True)
evalCb

Excluded 121 (30000) samples. 29879 remaining ...
Excluded 121 (30000) samples. 29879 remaining ...


Unnamed: 0,value
RMSE,1.054653
MAE,0.844821
Recall,0.188797
Precision,0.781569
F1,0.304129
NDCG@5,0.846478
