# Packages

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import random as rd
from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible

from loaders import load_ratings
from loaders import load_items
from constants import Constant as C

from sklearn.linear_model import LinearRegression

# Explore and select content features

In [2]:
# Load the item catalogue and the ratings table used throughout the notebook.
df_items = load_items()
df_ratings = load_ratings()


# 1. Title length: number of characters in each item label.
df_features = df_items[C.LABEL_COL].map(len).to_frame('n_character_title')
display(df_features.head())

# 2. Release year: the four digits found between parentheses in the label,
#    stored with the nullable integer dtype.
df_features = (
    df_items[C.LABEL_COL]
    .str.extract(r'\((\d{4})\)')[0]
    .astype('Int64')
    .to_frame('release_year')
)
display(df_features.head())

# 3. Genre list: split the pipe-separated genre string into a Python list.
df_genre_list = (
    df_items[C.GENRES_COL]
    .str.split('|')
    .to_frame('genre_list')
)
display(df_genre_list.head())

# 4. Genre one-hot encoding.
# Step 1: explode the lists so that each (item, genre) pair gets its own row.
df_exploded = df_genre_list.explode('genre_list')
# Step 2: build the dummy (one-hot) columns, one per distinct genre.
df_dummies = pd.get_dummies(df_exploded['genre_list'])
# Step 3: sum the dummies per original index to get one row per item again,
# then realign on df_items' index, filling absent rows with 0.
df_genres = (
    df_dummies
    .groupby(df_exploded.index)
    .sum()
    .reindex(df_items.index)
    .fillna(0)
    .astype(int)
)
display(df_genres.head())
# (explore here other features)


Unnamed: 0_level_0,n_character_title
movieId,Unnamed: 1_level_1
3,23
15,23
34,11
59,44
64,20


Unnamed: 0_level_0,release_year
movieId,Unnamed: 1_level_1
3,1995
15,1995
34,1995
59,1995
64,1996


Unnamed: 0_level_0,genre_list
movieId,Unnamed: 1_level_1
3,"[Comedy, Romance]"
15,"[Action, Adventure, Romance]"
34,"[Children, Drama]"
59,"[Drama, Mystery]"
64,"[Comedy, Romance]"


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
15,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
34,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
59,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
64,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


# Build a content-based model
When ready, move the following class into the *models.py* script.

In [17]:
class ContentBased(AlgoBase):
    """Content-based rating predictor built on top of Surprise's ``AlgoBase``.

    Item features are derived from the item table returned by ``load_items()``
    and one profile per user is learned from that user's ratings.

    Parameters
    ----------
    features_method : str or None
        ``None`` (no feature columns), ``"title_length"`` or
        ``"Year_of_release"``. Any other value raises ``NotImplementedError``.
    regressor_method : str
        ``'random_score'``, ``'random_sample'`` or ``'linear'``. With any other
        value no profile is learned and predictions are the global mean.
    """

    def __init__(self, features_method, regressor_method):
        AlgoBase.__init__(self)
        self.features_method = features_method
        self.regressor_method = regressor_method
        self.content_features = self.create_content_features(features_method)

    @staticmethod
    def _min_max_scale(series):
        """Rescale ``series`` to [0, 1]; a constant or empty series maps to 0.0."""
        lowest = series.min()
        span = series.max() - lowest
        # Guard against a zero (or undefined) range, which would otherwise
        # produce NaN/inf through a division by zero.
        if pd.isna(span) or span == 0:
            return pd.Series(0.0, index=series.index, name=series.name)
        return (series - lowest) / span

    def create_content_features(self, features_method):
        """Content Analyzer: build the item-feature table.

        Returns a DataFrame indexed like ``load_items()`` with one normalised
        column for the selected method, or no column when ``features_method``
        is ``None``.

        Raises
        ------
        NotImplementedError
            If ``features_method`` is not one of the supported values.
        """
        df_items = load_items()
        df_features = pd.DataFrame(index=df_items.index)

        if features_method is None:
            return df_features

        if features_method == "title_length":
            # Naive single feature based on the length of the title.
            # ``.str.len()`` yields NaN for a missing title instead of raising,
            # so missing and empty titles can both be imputed below.
            title_length = df_items[C.LABEL_COL].str.len().replace(0, np.nan)
            mean_length = title_length.mean()
            # Impute with the (truncated) mean of the known lengths.
            fill_value = 0 if pd.isna(mean_length) else int(mean_length)
            title_length = title_length.fillna(fill_value)
            df_features['title_length'] = self._min_max_scale(title_length)
        elif features_method == "Year_of_release":
            # The year is the four-digit group between parentheses in the label.
            year = df_items[C.LABEL_COL].str.extract(r'\((\d{4})\)')[0].astype(float)
            mean_year = year.replace(0, np.nan).mean()
            # Missing years take the mean year; if no year could be parsed at
            # all, fall back to 0 so the integer cast below cannot fail.
            fill_value = 0 if pd.isna(mean_year) else mean_year
            year = year.fillna(fill_value).astype(int)
            df_features['year_of_release'] = self._min_max_scale(year)
        else:  # (implement other feature creations here)
            raise NotImplementedError(f'Feature method {features_method} not yet implemented')
        return df_features

    def _fit_linear_profile(self, user_ratings, feature_names):
        """Fit a per-user linear regression of ratings on item features."""
        if not feature_names:
            raise ValueError(
                "regressor_method 'linear' requires at least one content feature"
            )
        df_user = pd.DataFrame(user_ratings, columns=['inner_item_id', 'user_ratings'])
        # Convert Surprise inner item ids to raw item ids, which index the
        # content-feature table.
        df_user['item_id'] = df_user['inner_item_id'].map(self.trainset.to_raw_iid)
        df_user = df_user.merge(
            self.content_features, how='left', left_on='item_id', right_index=True
        )
        # Items absent from the feature table produce NaN rows after the left
        # merge; they are replaced by zeros.
        X = np.nan_to_num(df_user[feature_names].values)
        y = df_user['user_ratings'].values
        model = LinearRegression(fit_intercept=True)
        model.fit(X, y)
        return model

    def fit(self, trainset):
        """Profile Learner: learn one profile per user of ``trainset``.

        The profile is a float for ``'random_score'``, the list of the user's
        ratings for ``'random_sample'``, a fitted ``LinearRegression`` for
        ``'linear'`` and ``None`` otherwise (or when the user has no rating).

        Returns
        -------
        self
        """
        # Rebuild the features from the current ``features_method``.
        self.content_features = self.create_content_features(self.features_method)
        AlgoBase.fit(self, trainset)
        feature_names = list(self.content_features.columns)

        self.user_profile = {u: None for u in trainset.all_users()}
        for u in self.user_profile:
            user_ratings = trainset.ur[u]
            if not user_ratings:
                continue  # no rating: the profile stays None

            if self.regressor_method == 'random_score':
                self.user_profile[u] = rd.uniform(0.5, 5.0)
            elif self.regressor_method == 'random_sample':
                self.user_profile[u] = [rating for _, rating in user_ratings]
            elif self.regressor_method == 'linear':
                # The feature matrix is only built when a regressor needs it.
                self.user_profile[u] = self._fit_linear_profile(user_ratings, feature_names)
            # (implement here other regressor fittings)
        return self

    def estimate(self, u, i):
        """Scoring component used for item filtering.

        Parameters are Surprise inner ids. Returns the global mean when the
        user has no profile or the item has no content features.

        Raises
        ------
        PredictionImpossible
            If the user or item is unknown to the trainset, or if
            ``regressor_method`` is not supported.
        """
        # First, handle cases for unknown users and items
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        profile = self.user_profile[u]
        if profile is None:
            return self.trainset.global_mean

        raw_item_id = self.trainset.to_raw_iid(i)
        if raw_item_id not in self.content_features.index:
            return self.trainset.global_mean

        # The random generator is deliberately not re-seeded here, so that a
        # seed set by the caller gives reproducible predictions.
        if self.regressor_method == 'random_score':
            return rd.uniform(0.5, 5)
        if self.regressor_method == 'random_sample':
            return rd.choice(profile)
        if self.regressor_method == 'linear':
            item_features = (
                self.content_features.loc[raw_item_id]
                .to_numpy(dtype=float)
                .reshape(1, -1)
            )
            # Same NaN handling as the one applied to the training matrix.
            item_features = np.nan_to_num(item_features)
            return profile.predict(item_features)[0]

        # Returning None would break the caller; signal the failure instead.
        raise PredictionImpossible(
            f'Regressor method {self.regressor_method} not yet implemented'
        )


The following script tests the ContentBased class.

In [18]:
def test_contentbased_class(feature_method, regressor_method):
    """Smoke-test the ContentBased class.

    Fits the algorithm on the full trainset, then predicts and prints the
    rating for the first (user, item) tuple of the anti-testset.
    """
    full_trainset = load_ratings(surprise_format=True).build_full_trainset()

    algo = ContentBased(feature_method, regressor_method)
    algo.fit(full_trainset)

    # Only the two leading fields (raw user id, raw item id) are needed.
    raw_uid, raw_iid, *_ = full_trainset.build_anti_testset()[0]
    print(algo.predict(raw_uid, raw_iid))

# (call here the test functions with different regressor methods)

# Each pair is (feature method, regressor method):
#   Test 1: random prediction between 0.5 and 5
#   Test 2: random prediction drawn from the ratings given by the user
#   Test 3: linear regression on the title-length feature
for _feature_method, _regressor_method in (
    (None, 'random_score'),
    (None, 'random_sample'),
    ('title_length', 'linear'),
):
    test_contentbased_class(_feature_method, _regressor_method)


user: 11         item: 1214       r_ui = None   est = 2.72   {'was_impossible': False}
user: 11         item: 1214       r_ui = None   est = 1.00   {'was_impossible': False}
user: 11         item: 1214       r_ui = None   est = 1.33   {'was_impossible': False}
