# Packages

In [9]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import random as rd
from surprise import AlgoBase
from surprise.prediction_algorithms.predictions import PredictionImpossible

from loaders import load_ratings
from loaders import load_items
from constants import Constant as C

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Explore and select content features

In [11]:
# Load the item catalogue and the ratings (both as DataFrames).
df_items = load_items()
df_ratings = load_ratings()


# 1. Title length: number of characters in each title.
df_features = (
    df_items[C.LABEL_COL]
    .apply(len)
    .to_frame('n_character_title')
)
display(df_features.head())

# 2. Release year: the four digits found between parentheses in the title.
#    NOTE: this rebinds df_features, so the title-length frame above is discarded.
df_features = (
    df_items[C.LABEL_COL]
    .str.extract(r'\((\d{4})\)')[0]
    .astype('Int64')
    .to_frame('release_year')
)
display(df_features.head())

# 3. Genre list: split the '|'-separated genre string into a Python list.
df_genre_list = (
    df_items[C.GENRES_COL]
    .str.split('|')
    .to_frame('genre_list')
)
display(df_genre_list.head())

# 4. Genre one-hot encoding.
# Step 1: one row per (item, genre) pair.
df_exploded = df_genre_list.explode('genre_list')
# Step 2: dummy (one-hot) columns, one per genre.
df_dummies = pd.get_dummies(df_exploded['genre_list'])
# Step 3: collapse back to one row per item by summing the dummies over the item index.
df_genres = df_dummies.groupby(level=0).sum()
# Realign on the df_items index; items missing from the dummies get 0 everywhere.
df_genres = (
    df_genres
    .reindex(df_items.index)
    .fillna(0)
    .astype(int)
)
display(df_genres.head())
# (explore here other features)


Unnamed: 0_level_0,n_character_title
movieId,Unnamed: 1_level_1
4993,57
5952,45
527,23
2028,26
4308,19


Unnamed: 0_level_0,release_year
movieId,Unnamed: 1_level_1
4993,2001
5952,2002
527,1993
2028,1998
4308,2001


Unnamed: 0_level_0,genre_list
movieId,Unnamed: 1_level_1
4993,"[Adventure, Fantasy]"
5952,"[Adventure, Fantasy]"
527,"[Drama, War]"
2028,"[Action, Drama, War]"
4308,"[Drama, Musical, Romance]"


Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Drama,Fantasy,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,War
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4993,0,1,0,0,0,0,1,0,0,0,0,0,0,0
5952,0,1,0,0,0,0,1,0,0,0,0,0,0,0
527,0,0,0,0,0,1,0,0,0,0,0,0,0,1
2028,1,0,0,0,0,1,0,0,0,0,0,0,0,1
4308,0,0,0,0,0,1,0,0,0,1,0,1,0,0


# Build a content-based model
When ready, move the following class into the *models.py* script.

In [33]:
class ContentBased(AlgoBase):
    """Content-based rating predictor that trains one regressor per user.

    Item content features are built by ``create_content_features``; ``fit``
    then trains, for every user of the trainset, a scikit-learn regressor
    mapping the features of the items the user rated to the ratings given.
    ``estimate`` applies the user's regressor to the features of the target
    item and falls back to the trainset global mean when no regressor or no
    features are available.

    Parameters
    ----------
    features_methods : str, list of str or None
        Feature name(s) to build. Supported: ``"title_length"``,
        ``"Year_of_release"``, ``"average_ratings"``, ``"count_ratings"``.
        ``None`` builds no feature (every prediction is the global mean).
    regressor_method : str
        One of ``'linear'``, ``'lasso'``, ``'random_forest'``,
        ``'neural_network'``, ``'decision_tree'``, ``'ridge'``,
        ``'gradient_boosting'``, ``'knn'``, ``'elastic_net'``. Any other
        value leaves every user profile empty (global-mean predictions).
    """

    def __init__(self, features_methods, regressor_method):
        AlgoBase.__init__(self)
        self.features_method = features_methods
        self.regressor_method = regressor_method
        self.content_features = self.create_content_features(features_methods)

    @staticmethod
    def _min_max_scale(values):
        """Rescale a Series to [0, 1]; a constant (or empty) Series maps to 0."""
        span = values.max() - values.min()
        if not span > 0:
            # Covers span == 0 and span == NaN: avoid dividing by zero.
            return values * 0.0
        return (values - values.min()) / span

    @staticmethod
    def _replace_zeros_with_mean(values):
        """Replace zero entries by the truncated mean of the non-zero entries."""
        is_zero = values == 0
        if is_zero.any() and not is_zero.all():
            values = values.mask(is_zero, int(values[~is_zero].mean()))
        return values

    def _build_regressor(self, n_samples):
        """Return a fresh, unfitted regressor for ``self.regressor_method``.

        ``n_samples`` is the number of training rows of the current user; it
        caps the number of neighbours of the KNN regressor, which cannot
        predict with more neighbours than fitted samples. Returns ``None``
        when the regressor name is not supported.
        """
        factories = {
            'linear': lambda: LinearRegression(fit_intercept=True),
            'lasso': lambda: Lasso(alpha=0.1),
            'random_forest': lambda: RandomForestRegressor(n_estimators=10, max_depth=10, random_state=42),
            'neural_network': lambda: MLPRegressor(hidden_layer_sizes=(60, 60), max_iter=2500,
                                                   learning_rate_init=0.01, alpha=0.0001, random_state=42),
            'decision_tree': lambda: DecisionTreeRegressor(max_depth=10, random_state=42),
            'ridge': lambda: Ridge(alpha=1.0),
            'gradient_boosting': lambda: GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                                                   max_depth=3, random_state=42),
            'knn': lambda: KNeighborsRegressor(n_neighbors=min(5, n_samples)),
            'elastic_net': lambda: ElasticNet(alpha=0.1, l1_ratio=0.5),
        }
        factory = factories.get(self.regressor_method)
        return None if factory is None else factory()

    def create_content_features(self, features_methods):
        """Content Analyzer: build the item feature table.

        Returns a DataFrame indexed like ``load_items()`` with one column per
        requested feature, each min-max scaled to [0, 1]. ``None`` yields a
        DataFrame without columns; a single string is treated as a
        one-element list.

        Raises
        ------
        NotImplementedError
            If a requested feature name is not supported.
        """
        df_items = load_items()
        df_features = pd.DataFrame(index=df_items.index)
        if features_methods is None:
            return df_features
        if isinstance(features_methods, str):
            features_methods = [features_methods]

        # Ratings are loaded here (at most once) rather than read from a
        # notebook-level global, so the class still works once moved to models.py.
        # NOTE(review): rating-based features use every rating returned by
        # load_ratings(), not only the trainset passed to fit().
        df_ratings = None

        for feature_method in features_methods:
            if feature_method == "title_length":
                # Naive feature: number of characters in the title.
                # .str.len() yields NaN for a missing title, counted as 0 here.
                title_length = df_items[C.LABEL_COL].str.len().fillna(0).astype(int)
                title_length = self._replace_zeros_with_mean(title_length)
                df_features['title_length'] = self._min_max_scale(title_length)

            elif feature_method == "Year_of_release":
                # Four digits between parentheses in the title.
                year = df_items[C.LABEL_COL].str.extract(r'\((\d{4})\)')[0].astype(float)
                mean_year = year.replace(0, np.nan).mean()
                if pd.isna(mean_year):
                    mean_year = 0  # no title carries a year: avoid casting NaN to int
                year = year.fillna(mean_year).astype(int)
                df_features['year_of_release'] = self._min_max_scale(year)

            elif feature_method in ("average_ratings", "count_ratings"):
                if df_ratings is None:
                    df_ratings = load_ratings()
                ratings_by_item = df_ratings.groupby('movieId')[C.RATING_COL]

                if feature_method == "average_ratings":
                    # Mean rating per movie, scaled over the rated movies.
                    average_rating = ratings_by_item.mean()
                    average_rating = average_rating.fillna(df_ratings[C.RATING_COL].mean())
                    scaled = self._min_max_scale(average_rating)
                    column = 'average_rating'
                else:
                    # Number of ratings per movie, scaled over the rated movies.
                    rating_count = ratings_by_item.size().fillna(0).astype(int)
                    rating_count = self._replace_zeros_with_mean(rating_count)
                    scaled = self._min_max_scale(rating_count)
                    column = 'rating_count'

                # Column assignment aligns on the item index (like a left join):
                # items without any rating get NaN.
                df_features[column] = scaled

            else:  # (implement other feature creations here)
                raise NotImplementedError(f'Feature method {feature_method} not yet implemented')
        return df_features

    def fit(self, trainset):
        """Profile Learner: train one regressor per user of ``trainset``.

        ``self.user_profile`` maps each inner user id to its fitted regressor,
        or to ``None`` when the user has no rating, no feature was requested,
        or the regressor name is not supported. Returns ``self``.
        """
        AlgoBase.fit(self, trainset)
        self.content_features = self.create_content_features(self.features_method)
        feature_names = list(self.content_features.columns)
        self.user_profile = {u: None for u in trainset.all_users()}

        if not feature_names:
            # Nothing to learn from: keep every profile empty.
            return self

        for u in self.user_profile:
            user_ratings = trainset.ur[u]  # list of (inner item id, rating)
            if len(user_ratings) == 0:
                continue

            model = self._build_regressor(len(user_ratings))
            if model is None:
                # Unsupported regressor: leave the profile empty instead of
                # fitting an undefined (or a previous user's) model.
                continue

            df_user = pd.DataFrame(user_ratings, columns=['inner_item_id', 'user_ratings'])
            # Surprise inner item ids -> raw (MovieLens) item ids.
            df_user["item_id"] = df_user["inner_item_id"].map(trainset.to_raw_iid)
            # Attach the content features (indexed by raw item id).
            df_user = df_user.merge(self.content_features, how='left',
                                    left_on='item_id', right_index=True)

            # Missing features are replaced by 0, as in estimate().
            X = np.nan_to_num(df_user[feature_names].to_numpy(dtype=float))
            y = df_user['user_ratings'].values

            model.fit(X, y)
            self.user_profile[u] = model

        return self

    def estimate(self, u, i):
        """Scoring component used for item filtering.

        Returns the rating predicted by the regressor of inner user ``u`` for
        inner item ``i``, or the trainset global mean when the user has no
        regressor or the item has no row in the feature table.

        Raises
        ------
        PredictionImpossible
            If the user or the item is unknown to the trainset.
        """
        # First, handle cases for unknown users and items
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        model = self.user_profile[u]
        if model is None:
            return self.trainset.global_mean

        raw_item_id = self.trainset.to_raw_iid(i)
        if raw_item_id not in self.content_features.index:
            return self.trainset.global_mean

        item_features = self.content_features.loc[raw_item_id].to_numpy(dtype=float).reshape(1, -1)
        # Same NaN handling as in fit(); the regressors reject NaN inputs.
        item_features = np.nan_to_num(item_features)
        return model.predict(item_features)[0]


The following script tests the ContentBased class.

In [35]:
def test_contentbased_class(feature_method, regressor_method):
    """Smoke-test the ContentBased class.

    Fits the algorithm on the full trainset built from the ratings, then
    predicts and prints the first (user, item) pair of the anti-testset.
    """
    full_trainset = load_ratings(surprise_format=True).build_full_trainset()

    algo = ContentBased(feature_method, regressor_method)
    algo.fit(full_trainset)

    # Each anti-testset entry starts with (raw user id, raw item id).
    raw_uid, raw_iid = full_trainset.build_anti_testset()[0][:2]
    print(algo.predict(raw_uid, raw_iid))

# (call here the test functions with different regressor methods)

# Disabled experiments, kept for reference:
#   Test 1 - random prediction between 0.5 and 5:
#     test_contentbased_class(feature_method=None, regressor_method='random_score')
#   Test 2 - random prediction drawn from the ratings given by the user:
#     test_contentbased_class(feature_method=None, regressor_method='random_sample')

# Lasso regressor on each feature set.
test_contentbased_class(feature_method=["title_length", "Year_of_release"], regressor_method='lasso')
test_contentbased_class(feature_method='Year_of_release', regressor_method='lasso')
test_contentbased_class(feature_method='title_length', regressor_method='lasso')
test_contentbased_class(feature_method='average_ratings', regressor_method='lasso')
test_contentbased_class(feature_method='count_ratings', regressor_method='lasso')

# Disabled runs on 'title_length' with the other regressors:
#   'lasso', 'random_forest', 'neural_network', 'decision_tree',
#   'ridge', 'gradient_boosting', 'knn', 'elastic_net'
#   e.g. test_contentbased_class('title_length', 'random_forest')



user: 11         item: 1214       r_ui = None   est = 1.65   {'was_impossible': False}
user: 11         item: 1214       r_ui = None   est = 2.17   {'was_impossible': False}
user: 11         item: 1214       r_ui = None   est = 1.76   {'was_impossible': False}
user: 11         item: 1214       r_ui = None   est = 2.84   {'was_impossible': False}
user: 11         item: 1214       r_ui = None   est = 3.28   {'was_impossible': False}
