# Content-based recommenders

Content-based recommenders in their recommendations rely purely on the features of users and items. Conceptually it can be expressed as a model of the form (personalized):

<center>
$$
    score \sim (user\_feature\_1, user\_feature\_2, ..., user\_feature\_k, item\_feature_1, item\_feature_2, ..., item\_feature_n)
$$
</center>

or (not personalized)

<center>
$$
    score \sim (item\_feature_1, item\_feature_2, ..., item\_feature_n)
$$
</center>


    + Content-based recommenders do not suffer from the cold-start problem for new items.
    - They do not use information about complex patterns of user-item interactions - what other similar users have already
    discovered and liked.

In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold
import time

pd.set_option('display.max_columns', 50)

from evaluation_and_testing.testing import evaluate_train_test_split_explicit
from evaluation_and_testing.testing import evaluate_leave_one_out_explicit
from evaluation_and_testing.testing import evaluate_train_test_split_implicit
from evaluation_and_testing.testing import evaluate_leave_one_out_implicit

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Load the data

In [4]:
ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

display(ml_movies_df.head(10))

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=1000, replace=False)

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

print("Number of chosen interactions: {}".format(len(ml_ratings_df)))

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Number of chosen interactions: 9692


# Linear Regression Recommender

For every movie we transform its genres into one-hot encoded features and we normalize them, for every user we count percentages for all genres how often do they appear among films watched by the user, we multiply both vectors (for the item and the user) to obtain explanaytory variables, and then we fit a linear regression model to those features and actual ratings.

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MultiLabelBinarizer

from recommenders.recommender import Recommender

class LinearRegressionRecommender(Recommender):
    """
    Linear regression recommender class.
    """
    
    def __init__(self):
        """
        Initialize recommender params and variables.
        """
        self.model = None
        self.mlb = None
        self.users_dict = None
        self.user_features = None
        
        self.uses_dot_product = True
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        
        # Transform genres to a more code-friendly form
        
        interactions_df = pd.merge(interactions_df, items_df, on='item_id')
        interactions_df = self._transform_genres(interactions_df)
        
        # Prepare user features
        
        print("preparing user features")
        
        users_df = interactions_df[['user_id', 'genres']].copy()
        print("creating users_df")
        display(users_df.head(10))
        
        users_df = users_df.explode('genres')
        print("exploding genres")
        display(users_df.head(10))
        
        users_df['val'] = 1
        print("creating users column 'val' and setting it to 1")
        display(users_df.head(10))
        
        users_df = users_df.pivot_table(index='user_id', columns='genres', values='val', aggfunc='count')
        print("pivoting table chuj wie co to")
        display(users_df.head(10))
        
        users_df = users_df / users_df.sum(axis=1).values.reshape(-1, 1)
        print("users_df/users)df.sum(axis=1).values.reshape(-1, 1)")
        display(users_df.head(10))
        
        users_df = users_df.rename_axis(None, axis=1).fillna(0)
        print("renaming axis")
        display(users_df.head(10))
        
        users_df = users_df.add_prefix('user_')
        print("adding prefix _user")
        display(users_df.head(10))
        
#         display(users_df.head(10))
        
        self.users_dict = users_df.to_dict('index')
        
        self.user_features = users_df.columns.tolist()
        
        interactions_df = interactions_df.merge(users_df, on='user_id')
#         display(interactions_df.head(10))
                
        # Prepare item features
        
        # Transform genres into binary values
        
        self.mlb = MultiLabelBinarizer()
        interactions_df = interactions_df.join(
            pd.DataFrame(self.mlb.fit_transform(interactions_df.pop('genres')),
                         columns=self.mlb.classes_,
                         index=interactions_df.index))
        
        # Normalize the values so that each movie's genres sum up to 1
        
        interactions_df[self.mlb.classes_] = interactions_df[self.mlb.classes_] \
            / interactions_df[self.mlb.classes_].sum(axis=1).values.reshape(-1, 1)
        
#         display(interactions_df.loc[:, self.mlb.classes_].head(10))

        # Prepare input data and fit the model
    
        # Dot product for personalization
        if self.uses_dot_product:
            interactions_df[self.mlb.classes_] = interactions_df[self.mlb.classes_] \
                * interactions_df[self.user_features].values
    
#         display(interactions_df.head(10))
        
        if self.uses_dot_product:  # Personalized
            x = interactions_df.loc[:, self.mlb.classes_].values
        else:  # Non-personalized
            x = interactions_df.loc[:, list(self.mlb.classes_) + self.user_features].values
        
        y = interactions_df['rating'].values
    
        self.model = LinearRegression().fit(x, y)
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        # Transform the item to be scored into proper features
        
        items_df = items_df.copy()
        items_df = self._transform_genres(items_df)
        
        items_df = items_df.join(
            pd.DataFrame(self.mlb.transform(items_df.pop('genres')),
                         columns=self.mlb.classes_,
                         index=items_df.index))
        
        items_df[self.mlb.classes_] = items_df[self.mlb.classes_] \
            / items_df[self.mlb.classes_].sum(axis=1).values.reshape(-1, 1)

        # Score the item
    
        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        
        for ix, user in users_df.iterrows(): 
            if user['user_id'] in self.users_dict:
                user_df = pd.DataFrame.from_dict({user['user_id']: self.users_dict[user['user_id']]}, orient='index')
            else:
                user_df = pd.DataFrame.from_dict(
                    {user['user_id']: [1 / len(self.user_features)]*len(self.user_features)}, orient='index')
                user_df.columns = self.user_features
            display(user_df)
            display(items_df)
            input_df = items_df.copy()
            
            if self.uses_dot_product:
                input_df[self.mlb.classes_] = items_df[self.mlb.classes_] * user_df.values
                display(input_df)
                scores = self.model.predict(input_df.loc[:, self.mlb.classes_].values)
            else:
                input_df = input_df.merge(user_df, how='cross')
                display(input_df)
                scores = self.model.predict(input_df.loc[:, list(self.mlb.classes_) + self.user_features].values)
    
            chosen_pos = np.argsort(-scores)[:n_recommendations]
        
            user_recommendations = []
            for item_pos in chosen_pos:
                user_recommendations.append(
                    {
                        'user_id': user['user_id'],
                        'item_id': input_df.iloc[item_pos]['item_id'],
                        'score': scores[item_pos]
                    }
                )
                
            user_recommendations = pd.DataFrame(user_recommendations)

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations
    
    def _transform_genres(self, df):
        """
        Transforms a string with genres into a list of cleaned genre names.
        
        :param pd.DataFrame df: A DataFrame with 'genres' column.
        """
        df.loc[:, 'genres'] = df['genres'].str.replace("-", "_", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace(" ", "_", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace("(", "", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace(")", "", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.lower()
        df.loc[:, 'genres'] = df['genres'].str.split("|")
        return df

In [14]:
# Print movies watched by user 3, 5, 39

active_user_movies = ml_df.loc[
    (ml_df['user_id'] == 3) | (ml_df['user_id'] == 5) | (ml_df['user_id'] == 39)].sort_values(['user_id', 'item_id'])
print("Active users history")
display(active_user_movies)

Active users history


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres
18810,3,5181,5.0,1306463718,Hangar 18 (1980),Action|Sci-Fi|Thriller
18827,3,7991,5.0,1306463684,Death Race 2000 (1975),Action|Sci-Fi
18832,3,70946,5.0,1306463815,Troll 2 (1990),Fantasy|Horror
573,5,50,4.0,847434881,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
19321,5,232,4.0,847435292,Eat Drink Man Woman (Yin shi nan nu) (1994),Comedy|Drama|Romance
25824,5,266,1.0,847435311,Legends of the Fall (1994),Drama|Romance|War|Western
19722,5,475,5.0,847435311,In the Name of the Father (1993),Drama
20123,5,595,5.0,847434832,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
588,39,50,5.0,974788030,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
45366,39,858,4.0,974789111,"Godfather, The (1972)",Crime|Drama


In [15]:
# Quick test of the recommender

lr_recommender = LinearRegressionRecommender()
lr_recommender.uses_dot_product = True
lr_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = lr_recommender.recommend(pd.DataFrame([[3], [5], [39]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
display(recommendations)

preparing user features
creating users_df


Unnamed: 0,user_id,genres
0,1,"[crime, mystery, thriller]"
1,5,"[crime, mystery, thriller]"
2,6,"[crime, mystery, thriller]"
3,7,"[crime, mystery, thriller]"
4,8,"[crime, mystery, thriller]"
5,16,"[crime, mystery, thriller]"
6,17,"[crime, mystery, thriller]"
7,18,"[crime, mystery, thriller]"
8,23,"[crime, mystery, thriller]"
9,24,"[crime, mystery, thriller]"


exploding genres


Unnamed: 0,user_id,genres
0,1,crime
0,1,mystery
0,1,thriller
1,5,crime
1,5,mystery
1,5,thriller
2,6,crime
2,6,mystery
2,6,thriller
3,7,crime


creating users column 'val' and setting it to 1


Unnamed: 0,user_id,genres,val
0,1,crime,1
0,1,mystery,1
0,1,thriller,1
1,5,crime,1
1,5,mystery,1
1,5,thriller,1
2,6,crime,1
2,6,mystery,1
2,6,thriller,1
3,7,crime,1


pivoting table chuj wie co to


genres,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,film_noir,horror,imax,musical,mystery,no_genres_listed,romance,sci_fi,thriller,war,western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,10.0,8.0,3.0,5.0,8.0,6.0,,4.0,3.0,,,,1.0,1.0,,5.0,4.0,6.0,2.0,1.0
2,2.0,,,,,2.0,,6.0,,,,,,,,1.0,1.0,3.0,,1.0
3,2.0,,,,,,,,1.0,,1.0,,,,,,2.0,1.0,,
4,1.0,1.0,2.0,3.0,11.0,5.0,,15.0,2.0,1.0,,1.0,3.0,,,7.0,,2.0,1.0,1.0
5,,,1.0,1.0,1.0,1.0,,3.0,1.0,,,1.0,1.0,1.0,,3.0,,1.0,1.0,1.0
6,8.0,5.0,2.0,5.0,20.0,4.0,,14.0,5.0,,3.0,1.0,2.0,3.0,,8.0,5.0,8.0,2.0,1.0
7,5.0,3.0,1.0,2.0,2.0,4.0,,7.0,2.0,,1.0,2.0,1.0,1.0,,3.0,4.0,5.0,2.0,
8,1.0,1.0,,1.0,1.0,1.0,,,,,,,,1.0,,,1.0,1.0,,
9,,,,,1.0,,,2.0,,,,,,,,,,2.0,,
10,3.0,4.0,,,6.0,1.0,,3.0,1.0,,,3.0,1.0,1.0,,4.0,2.0,,,


users_df/users)df.sum(axis=1).values.reshape(-1, 1)


genres,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,film_noir,horror,imax,musical,mystery,no_genres_listed,romance,sci_fi,thriller,war,western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.149254,0.119403,0.044776,0.074627,0.119403,0.089552,,0.059701,0.044776,,,,0.014925,0.014925,,0.074627,0.059701,0.089552,0.029851,0.014925
2,0.125,,,,,0.125,,0.375,,,,,,,,0.0625,0.0625,0.1875,,0.0625
3,0.285714,,,,,,,,0.142857,,0.142857,,,,,,0.285714,0.142857,,
4,0.017857,0.017857,0.035714,0.053571,0.196429,0.089286,,0.267857,0.035714,0.017857,,0.017857,0.053571,,,0.125,,0.035714,0.017857,0.017857
5,,,0.058824,0.058824,0.058824,0.058824,,0.176471,0.058824,,,0.058824,0.058824,0.058824,,0.176471,,0.058824,0.058824,0.058824
6,0.083333,0.052083,0.020833,0.052083,0.208333,0.041667,,0.145833,0.052083,,0.03125,0.010417,0.020833,0.03125,,0.083333,0.052083,0.083333,0.020833,0.010417
7,0.111111,0.066667,0.022222,0.044444,0.044444,0.088889,,0.155556,0.044444,,0.022222,0.044444,0.022222,0.022222,,0.066667,0.088889,0.111111,0.044444,
8,0.125,0.125,,0.125,0.125,0.125,,,,,,,,0.125,,,0.125,0.125,,
9,,,,,0.2,,,0.4,,,,,,,,,,0.4,,
10,0.103448,0.137931,,,0.206897,0.034483,,0.103448,0.034483,,,0.103448,0.034483,0.034483,,0.137931,0.068966,,,


renaming axis


Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,film_noir,horror,imax,musical,mystery,no_genres_listed,romance,sci_fi,thriller,war,western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.149254,0.119403,0.044776,0.074627,0.119403,0.089552,0.0,0.059701,0.044776,0.0,0.0,0.0,0.014925,0.014925,0.0,0.074627,0.059701,0.089552,0.029851,0.014925
2,0.125,0.0,0.0,0.0,0.0,0.125,0.0,0.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.1875,0.0,0.0625
3,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.285714,0.142857,0.0,0.0
4,0.017857,0.017857,0.035714,0.053571,0.196429,0.089286,0.0,0.267857,0.035714,0.017857,0.0,0.017857,0.053571,0.0,0.0,0.125,0.0,0.035714,0.017857,0.017857
5,0.0,0.0,0.058824,0.058824,0.058824,0.058824,0.0,0.176471,0.058824,0.0,0.0,0.058824,0.058824,0.058824,0.0,0.176471,0.0,0.058824,0.058824,0.058824
6,0.083333,0.052083,0.020833,0.052083,0.208333,0.041667,0.0,0.145833,0.052083,0.0,0.03125,0.010417,0.020833,0.03125,0.0,0.083333,0.052083,0.083333,0.020833,0.010417
7,0.111111,0.066667,0.022222,0.044444,0.044444,0.088889,0.0,0.155556,0.044444,0.0,0.022222,0.044444,0.022222,0.022222,0.0,0.066667,0.088889,0.111111,0.044444,0.0
8,0.125,0.125,0.0,0.125,0.125,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.125,0.125,0.0,0.0
9,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0
10,0.103448,0.137931,0.0,0.0,0.206897,0.034483,0.0,0.103448,0.034483,0.0,0.0,0.103448,0.034483,0.034483,0.0,0.137931,0.068966,0.0,0.0,0.0


adding prefix _user


Unnamed: 0_level_0,user_action,user_adventure,user_animation,user_children,user_comedy,user_crime,user_documentary,user_drama,user_fantasy,user_film_noir,user_horror,user_imax,user_musical,user_mystery,user_no_genres_listed,user_romance,user_sci_fi,user_thriller,user_war,user_western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.149254,0.119403,0.044776,0.074627,0.119403,0.089552,0.0,0.059701,0.044776,0.0,0.0,0.0,0.014925,0.014925,0.0,0.074627,0.059701,0.089552,0.029851,0.014925
2,0.125,0.0,0.0,0.0,0.0,0.125,0.0,0.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.1875,0.0,0.0625
3,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.285714,0.142857,0.0,0.0
4,0.017857,0.017857,0.035714,0.053571,0.196429,0.089286,0.0,0.267857,0.035714,0.017857,0.0,0.017857,0.053571,0.0,0.0,0.125,0.0,0.035714,0.017857,0.017857
5,0.0,0.0,0.058824,0.058824,0.058824,0.058824,0.0,0.176471,0.058824,0.0,0.0,0.058824,0.058824,0.058824,0.0,0.176471,0.0,0.058824,0.058824,0.058824
6,0.083333,0.052083,0.020833,0.052083,0.208333,0.041667,0.0,0.145833,0.052083,0.0,0.03125,0.010417,0.020833,0.03125,0.0,0.083333,0.052083,0.083333,0.020833,0.010417
7,0.111111,0.066667,0.022222,0.044444,0.044444,0.088889,0.0,0.155556,0.044444,0.0,0.022222,0.044444,0.022222,0.022222,0.0,0.066667,0.088889,0.111111,0.044444,0.0
8,0.125,0.125,0.0,0.125,0.125,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.125,0.125,0.0,0.0
9,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0
10,0.103448,0.137931,0.0,0.0,0.206897,0.034483,0.0,0.103448,0.034483,0.0,0.0,0.103448,0.034483,0.034483,0.0,0.137931,0.068966,0.0,0.0,0.0


sum of columns


user_id
1     1.0
2     1.0
3     1.0
4     1.0
5     1.0
6     1.0
7     1.0
8     1.0
9     1.0
10    1.0
dtype: float64

TypeError: unsupported type: <class 'str'>

### Train-test split test

In [5]:
lr_recommender = LinearRegressionRecommender()

t0 = time.time()

results = [['LinearRegressionRecommender'] + list(evaluate_train_test_split_explicit(
    lr_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df, seed=seed))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(results)

print('Total evaluation time: {}'.format(time.time() - t0))

Unnamed: 0,Recommender,RMSE,MRE,TRE
0,LinearRegressionRecommender,1.016594,0.348461,0.230529


Total evaluation time: 47.25206780433655


### Leave-one-out test

In [6]:
lr_recommender = LinearRegressionRecommender()

t0 = time.time()

results = [['LinearRegressionRecommender'] + list(evaluate_leave_one_out_explicit(
    lr_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df, seed=seed))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(results)

print('Total evaluation time: {}'.format(time.time() - t0))

Unnamed: 0,Recommender,RMSE,MRE,TRE
0,LinearRegressionRecommender,1.045415,0.401399,0.242301


Total evaluation time: 54.93246841430664


# TF-IDF Recommender
TF-IDF stands for term frequencyâ€“inverse document frequency. Typically Tf-IDF method is used to assign keywords (words describing the gist of a document) to documents in a corpus of documents.

In our case we will treat users as documents and genres as words.

Term-frequency is given by the following formula:
<center>
$$
    \text{tf}(g, u) = f_{g, u}
$$
</center>
where $f_{g, i}$ is the number of times genre $g$ appear for movies watched by user $u$.

Inverse document frequency is defined as follows:
<center>
$$
    \text{idf}(g) = \log \frac{N}{n_g}
$$
</center>
where $N$ is the number of users and $n_g$ is the number of users with $g$ in their genres list.

Finally, tf-idf is defined as follows:
<center>
$$
    \text{tfidf}(g, u) = \text{tf}(g, u) \cdot \text{idf}(g)
$$
</center>

In our case we will measure how often a given genre appears for movies watched by a given user vs how often it appears for all users. To obtain a movie score we will take the average of its genres' scores for this user.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

class TFIDFRecommender(Recommender):
    """
    Recommender based on the TF-IDF method.
    """
    
    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        self.tfidf_scores = None
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        
        self.tfidf_scores = defaultdict(lambda: 0.0)

        # Prepare the corpus for tfidf calculation
        
        interactions_df = pd.merge(interactions_df, items_df, on='item_id')
#         print("Training set")
#         display(interactions_df.head(10))
#         print()
        user_genres = interactions_df.loc[:, ['user_id', 'genres']]
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("-", "_", regex=False)
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace(" ", "_", regex=False)
        user_genres = user_genres.groupby('user_id').aggregate(lambda x: "|".join(x))
        user_genres.loc[:, 'genres'] = user_genres['genres'].str.replace("|", " ", regex=False)
#         print("User genres")
#         display(user_genres.head(10))
#         print()
        user_ids = user_genres.index.tolist()
        genres_corpus = user_genres['genres'].tolist()
#         print("Genres corpus")
#         print(genres_corpus)
#         print()
        
        # Calculate tf-idf scores
        
        vectorizer = TfidfVectorizer()
        tfidf_scores = vectorizer.fit_transform(genres_corpus)
        
        # Transform results into a dict {(user_id, genre): score}
        
        for u in range(tfidf_scores.shape[0]):
            for g in range(tfidf_scores.shape[1]):
                self.tfidf_scores[(user_ids[u], vectorizer.get_feature_names()[g])] = tfidf_scores[u, g]

#         print("TF-IDF scores")
#         print(self.tfidf_scores)
#         print()
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        
        # Transform genres to a unified form used by the vectorizer
        
        items_df = items_df.copy()
        items_df.loc[:, 'genres'] = items_df['genres'].str.replace("-", "_", regex=False)
        items_df.loc[:, 'genres'] = items_df['genres'].str.replace(" ", "_", regex=False)
        items_df.loc[:, 'genres'] = items_df['genres'].str.lower()
        items_df.loc[:, 'genres'] = items_df['genres'].str.split("|")
                
        # Score items    
        
        for uix, user in users_df.iterrows():
            items = []
            for iix, item in items_df.iterrows():
                score = 0.0
                for genre in item['genres']:
                    score += self.tfidf_scores[(user['user_id'], genre)]
                score /= len(item['genres'])
                items.append((item['item_id'], score))
                
            items = sorted(items, key=lambda x: x[1], reverse=True)
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': [item[0] for item in items][:n_recommendations],
                                                 'score': [item[1] for item in items][:n_recommendations]})

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations

In [67]:
# Print movies watched by user 3, 5, 39

active_user_movies = ml_df.loc[
    (ml_df['user_id'] == 3) | (ml_df['user_id'] == 5) | (ml_df['user_id'] == 39)].sort_values(['user_id', 'item_id'])
print("Active users history")
display(active_user_movies)

Active users history


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres
18810,3,5181,5.0,1306463718,Hangar 18 (1980),Action|Sci-Fi|Thriller
18827,3,7991,5.0,1306463684,Death Race 2000 (1975),Action|Sci-Fi
18832,3,70946,5.0,1306463815,Troll 2 (1990),Fantasy|Horror
573,5,50,4.0,847434881,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
19321,5,232,4.0,847435292,Eat Drink Man Woman (Yin shi nan nu) (1994),Comedy|Drama|Romance
25824,5,266,1.0,847435311,Legends of the Fall (1994),Drama|Romance|War|Western
19722,5,475,5.0,847435311,In the Name of the Father (1993),Drama
20123,5,595,5.0,847434832,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
588,39,50,5.0,974788030,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
45366,39,858,4.0,974789111,"Godfather, The (1972)",Crime|Drama


In [68]:
# Quick test of the recommender

tfidf_recommender = TFIDFRecommender()
tfidf_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = tfidf_recommender.recommend(pd.DataFrame([[3], [5], [39]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(recommendations)

Recommendations


Unnamed: 0,user_id,item_id,score,title,genres
0,3,2311,0.571887,2010: The Year We Make Contact (1984),Sci-Fi
1,3,2661,0.571887,It Came from Outer Space (1953),Sci-Fi
2,3,2698,0.571887,Zone 39 (1997),Sci-Fi
3,3,5468,0.571887,20 Million Miles to Earth (1957),Sci-Fi
4,3,51562,0.571887,Babylon 5: The Gathering (1993),Sci-Fi
5,3,147384,0.571887,Doctor Who: The Runaway Bride (2007),Sci-Fi
6,3,176371,0.571887,Blade Runner 2049 (2017),Sci-Fi
7,3,7991,0.550571,Death Race 2000 (1975),Action|Sci-Fi
8,3,74668,0.550571,District 13: Ultimatum (Banlieue 13 - Ultimatu...,Action|Sci-Fi
9,3,168252,0.550571,Logan (2017),Action|Sci-Fi


### Train-test split test

In [10]:
tfidf_recommender = TFIDFRecommender()

t0 = time.time()

results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(results)

print('Total evaluation time: {}'.format(time.time() - t0))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,TFIDFRecommender,0.017897,0.044743,0.058166,0.09396,0.017897,0.033077,0.038368,0.049966


Total evaluation time: 67.2819995880127


### Leave-one-out test

In [11]:
tfidf_recommender = TFIDFRecommender()

t0 = time.time()

results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(results)

print('Total evaluation time: {}'.format(time.time() - t0))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,TFIDFRecommender,0.003333,0.006667,0.006667,0.02,0.003333,0.005436,0.005436,0.009773


Total evaluation time: 198.83639669418335


In [9]:
interactions_df = ml_ratings_df.copy()
users_df = interactions_df[['user_id', 'genres']].copy()
display(users_df.head(10))
users_df = users_df.explode('genres')
display(users_df.head(10))
users_df['val'] = 1
display(users_df.head(10))
users_df = users_df.pivot_table(index='user_id', columns='genres', values='val', aggfunc='count')
display(users_df.head(10))
users_df = users_df / users_df.sum(axis=1).values.reshape(-1, 1)
display(users_df.head(10))
users_df = users_df.rename_axis(None, axis=1).fillna(0)
display(users_df.head(10))
users_df = users_df.add_prefix('user_')
display(users_df.head(10))

KeyError: "['genres'] not in index"