In [1]:
# Imports (data access, plotting and modelling dependencies)
# Data
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os 
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import math
# Graphics
import seaborn as sns ; sns.set()

from sklearn.experimental import enable_halving_search_cv # noqa

from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import FunctionTransformer
# from sklearn.metrics import root_mean_squared_error

In [2]:
# Read the database URL from a local env file so no credentials live in the notebook.
load_dotenv('BDD_URL.env')
BDD_URL = os.environ['BDD_URL']  # raises KeyError when the variable is not defined
engine = create_engine(BDD_URL)

# Alternative query (not executed in this cell): up to 10000 rows of "filmview"
# whose comma-separated "genres" value contains 'Comedy'.
SQL_filtre= """
SET search_path to principal;
SELECT *
from "filmview"
where 'Comedy' = ANY(string_to_array("genres", ','))
limit 10000;
"""
# Query executed below: rows of "filmview" with titleType 'movie' and non-null
# runtimeMinutes, averageRating, genres, startYear and isAdult, capped at 70000 rows.
# NOTE(review): there is no ORDER BY, so which 70000 rows come back is presumably
# not guaranteed to be stable between runs -- confirm if reproducibility matters.
SQL= """
SET search_path to principal;
SELECT *
from "filmview"
where "runtimeMinutes" Is NOT null and "titleType" = 'movie' and "averageRating" is NOT NULL and "genres" is NOT NULL and "startYear" is NOT NULL and "isAdult" is NOT NULL
limit 70000;
"""
try:
    df = pd.read_sql(SQL, engine)
finally:
    # Release the engine's pooled connections even when the query fails.
    engine.dispose()

In [3]:
def BooleanToText (df):
    """Map each element to the string 'True' when it equals 1, else 'False'.

    Parameters
    ----------
    df : pandas.Series
        Values compared element by element against 1 (so ``True`` also matches).

    Returns
    -------
    pandas.Series
        Same index as the input, holding the strings 'True' / 'False'.
    """
    def _as_text(value):
        if value == 1:
            return 'True'
        return 'False'

    return df.apply(_as_text)
def DateToCategory (df):
    """Bucket years into 5-year categorical labels.

    Missing values are replaced by the mean of the non-missing values before
    bucketing (imputation strategy still to be validated). The fill is done on
    a copy: the caller's Series is left untouched, so calling this on a
    DataFrame column does not silently rewrite that column.

    Parameters
    ----------
    df : pandas.Series
        Numeric years. Despite the parameter name, a one-dimensional Series is
        what ``pd.cut`` needs.

    Returns
    -------
    pandas.Series
        Categorical values labelled ``'Datebetween{start}and{start+4}'`` for the
        half-open intervals [start, start + 5) between 1800 and 2055. Years
        outside that range come back as NaN.
    """
    years = df.fillna(df.mean())

    # Edges every 5 years; one label per interval, named after its first and last year.
    edges = list(range(1800, 2056, 5))
    labels = [f"Datebetween{start}and{start+4}" for start in edges[:-1]]

    return pd.cut(years, bins=edges, labels=labels, right=False)
def RuntimeToCategory (df):
    """Bucket runtimes (in minutes) into 15-minute categorical labels.

    Missing values are replaced by the mean of the non-missing values before
    bucketing (imputation strategy still to be validated). The fill is done on
    a copy: the caller's Series is left untouched.

    Parameters
    ----------
    df : pandas.Series
        Numeric runtimes. Despite the parameter name, a one-dimensional Series
        is what ``pd.cut`` needs.

    Returns
    -------
    pandas.Series
        Categorical values labelled ``'runtime_Between{start}and{start+15}'``
        for the half-open intervals [start, start + 15) between 0 and 600.
        Runtimes outside that range come back as NaN.
    """
    runtimes = df.fillna(df.mean())

    # 15-minute wide buckets covering 0 to 600 minutes (10 hours).
    edges = list(range(0, 615, 15))
    labels = [f"runtime_Between{start}and{start+15}" for start in edges[:-1]]

    return pd.cut(runtimes, bins=edges, labels=labels, right=False)
def RatingToCategory (df):
    """Bucket ratings on a 0-10 scale into five star labels.

    Missing values are replaced by the mean of the non-missing values before
    bucketing (imputation strategy still to be validated). The fill is done on
    a copy: the caller's Series is left untouched.

    Parameters
    ----------
    df : pandas.Series
        Numeric ratings. Despite the parameter name, a one-dimensional Series
        is what ``pd.cut`` needs.

    Returns
    -------
    pandas.Series
        Categorical values '*' .. '*****' for [0, 2), [2, 4), [4, 6), [6, 8)
        and [8, 10]. Ratings outside 0-10 come back as NaN.
    """
    ratings = df.fillna(df.mean())

    labels = ['*','**','***','****','*****']
    # Bins are left-closed / right-open, so a plain upper edge of 10 would leave
    # a perfect 10.0 rating without a bucket. Nudging the last edge to the next
    # representable float keeps 10.0 in the top bucket.
    edges = [0, 2, 4, 6, 8, np.nextafter(10.0, np.inf)]

    return pd.cut(ratings, bins=edges, labels=labels, right=False)
def listTostr (df):
    """Join each element's items into one space-separated string.

    Parameters
    ----------
    df : pandas.Series
        Each element is an iterable (e.g. a list); its items are converted with
        ``str`` and joined with single spaces.

    Returns
    -------
    pandas.Series
        One string per input element. Missing elements (``None`` or float NaN)
        give an empty string instead of raising ``TypeError``.
    """
    def _join(items):
        # None / NaN are not iterable; treat them as "no items".
        if items is None or (isinstance(items, float) and items != items):
            return ''
        return ' '.join(map(str, items))

    return df.apply(_join)

In [4]:
# Build the single text column 'feature': every part below is appended followed
# by one space, in this order.
feature_parts = [
    # A missing title would turn the whole concatenated string into NaN, so
    # it is replaced by an empty string first.
    df['primaryTitle'].fillna(''),
    df['titleType'],
    DateToCategory(df['startYear']).astype(str),
    RuntimeToCategory (df['runtimeMinutes']).astype(str),
    df['genres'].str.replace(',', ' '),
    'ADULT_'+BooleanToText (df['isAdult']).astype(str),
    listTostr (df['Cate&names']).astype(str),
]

df['feature'] = ''
for part in feature_parts:
    df['feature'] += part + ' '

# Show the first row's text as a sanity check.
df['feature'].iloc[0]

'The Resurrection of Jake the Snake movie Datebetween2015and2019 runtime_Between90and105 Biography Documentary Sport ADULT_False director_Steve_Yu editor_Dylan_Frymyer editor_Neely_Coe self_Louie_Benson editor_Nathan_Mowery self_Adam_Copeland cinematographer_Nicholas_Leone self_Joe_Case self_Steve_Austin '

In [5]:
# Target is the average rating; every other column stays on the feature side.
y = df["averageRating"]
X = df.drop(columns=["averageRating"])

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)



In [6]:
# Column names consumed by the preprocessing step.
# Single columns are given as plain strings, the numeric ones as a list.
description_feature = 'feature'    # concatenated text column
title = 'primaryTitle'
genre = 'genres'
text_features = 'titleType'
boolean_features = 'isAdult'
numeric_features = ['startYear', 'runtimeMinutes']

In [7]:
# Transformers available for the numeric, boolean, categorical and free-text columns.

# Numeric columns: median imputation followed by robust scaling.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
)

# Boolean column: cast to bool and reshape into a single-column 2-D array.
boolean_transformer = FunctionTransformer(
    lambda x: x.astype(bool).values.reshape(-1, 1)
)

# Categorical text: fill gaps with the constant 'missing', then one-hot encode
# (categories unseen at fit time are ignored at transform time).
text_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Free text, raw counts.
Vect_transformer = Pipeline(
    steps=[
        ('vect', CountVectorizer(decode_error='ignore', analyzer='char_wb')),
    ]
)

# Free text, TF-IDF weights limited to 1000 features.
# NOTE(review): analyzer='char_wb' with the default n-gram range presumably yields
# single-character features rather than word tokens -- confirm this is intended
# for the token-style 'feature' strings (analyzer="word" was tried previously).
tfidf_transformer = Pipeline(
    steps=[
        ('tf_idf', TfidfVectorizer(decode_error='ignore', analyzer='char_wb', max_features=1000)),
    ]
)

# Column-wise preprocessing: only the concatenated description column is active.
# Disabled alternatives, kept for reference:
#   ('num', numeric_transformer, numeric_features)
#   ('bool', boolean_transformer, boolean_features)
#   ('text', Vect_transformer, text_features)
#   ('title', tfidf_transformer, title)
#   ('genre', tfidf_transformer, genre)
preprocessor = ColumnTransformer(
    transformers=[
        ('description', tfidf_transformer, description_feature),
    ]
)
preprocessor

In [8]:
# Candidate regressors, keyed by the display name used in the training loop.
# The two tree ensembles are seeded so repeated runs give the same models
# (same seed as the train/test split).
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'SVR': SVR(),
    'Random Forest Regression': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=42),
}

# One pipeline per model: shared preprocessing step followed by the regressor.
pipelines = {
    name: Pipeline([('preprocessor', preprocessor), ('regressor', regressor)])
    for name, regressor in regressors.items()
}

# Hyper-parameter grids explored by GridSearchCV, keyed like `pipelines`.
# The 'regressor__' prefix targets the pipeline step named 'regressor'.
parameters = {
    'Linear Regression': {'regressor__fit_intercept': [True,False]},
    'Ridge Regression': {'regressor__alpha': [0.1, 2.0,5, 10.0]},
    'Lasso Regression': {'regressor__alpha': [0.1, 2.0,5, 10.0]},
    'ElasticNet': {'regressor__alpha': [0.1, 2.0, 10.0], 'regressor__l1_ratio': [0.1, 0.5, 0.9]},
    'SVR': {'regressor__kernel': ['linear', 'rbf'], 'regressor__C': [0.1, 1.0, 10.0]},
    'Random Forest Regression': {'regressor__n_estimators': [150,200], 'regressor__max_depth': [50,100]},
    'Gradient Boosting Regression': {'regressor__n_estimators': [150,200], 'regressor__max_depth': [50, 100]},
}

# Metrics computed during cross-validation: RMSE, R2 and MAE.
# The error metrics use scikit-learn's negated scorers, so values closer to 0 are better.
scoring = {'RMSE': 'neg_root_mean_squared_error',
           'R2': 'r2',
           'MAE': 'neg_mean_absolute_error'}


In [9]:
def Grid(X_train, y_train, pipeline, parameters, cv=5):
    """Run a cross-validated grid search and summarise the outcome.

    The search uses the module-level ``scoring`` dict, refits the best
    candidate according to its 'RMSE' entry, runs on all available cores and
    raises if any fit fails.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.
    pipeline
        Estimator to tune.
    parameters
        Parameter grid for that estimator.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        'best_score' (best 'RMSE' score, rounded to 4 decimals),
        'best_params', 'training_time' (average of the per-candidate mean fit
        times, rounded to 4 decimals) and 'fitted_model' (the refitted best
        estimator).
    """
    search = GridSearchCV(
        pipeline,
        parameters,
        scoring=scoring,
        refit='RMSE',
        cv=cv,
        n_jobs=-1,
        verbose=0,
        error_score='raise',
    )
    search.fit(X_train, y_train)

    mean_fit_times = search.cv_results_['mean_fit_time']
    summary = {
        'best_score': search.best_score_.round(4),
        'best_params': search.best_params_,
        'training_time': mean_fit_times.mean().round(4),
        'fitted_model': search.best_estimator_,
    }
    return summary

In [10]:
def afficheResults (grid):
    """Print training time, best parameters and best score of a search summary.

    Parameters
    ----------
    grid : dict
        Mapping with the keys 'fitted_model' (an object exposing
        ``named_steps['regressor']``), 'training_time', 'best_params' and
        'best_score'. The score is printed with its sign flipped.
    """
    regressor = grid['fitted_model'].named_steps['regressor']
    model_name = type(regressor).__name__

    report = (
        f"{model_name} training time: {grid['training_time']}",
        f"Best {model_name} parameters: {grid['best_params']}",
        f"Best {model_name} score: {-grid['best_score']}",
    )
    for line in report:
        print(line)
 
    
    

In [11]:

# Tune every candidate pipeline with a cross-validated grid search, then score
# the refitted best estimator on the held-out test split.
# `models` maps each model name to [best estimator, test RMSE].
models = {}

for model_name, pipeline in pipelines.items():
    print(f"\n..............{model_name}..............................")

    # Cross-validated search on the training split, followed by its summary.
    grid_search = Grid(X_train, y_train, pipeline, parameters[model_name], cv=5)
    afficheResults (grid_search)
    best_model = grid_search['fitted_model']

    # Test-set RMSE, rounded to 4 decimals.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    Rmse = round(math.sqrt(mse), 4)
    print(f"{model_name} RMSE: {Rmse}")

    models[model_name] = [best_model,Rmse]
 


..............Linear Regression..............................
LinearRegression training time: 13.8559
Best LinearRegression parameters: {'regressor__fit_intercept': True}
Best LinearRegression score: 1.2228
Linear Regression RMSE: 1.2275

..............Ridge Regression..............................
Ridge training time: 15.5629
Best Ridge parameters: {'regressor__alpha': 0.1}
Best Ridge score: 1.2221
Ridge Regression RMSE: 1.2266

..............Lasso Regression..............................
Lasso training time: 11.1664
Best Lasso parameters: {'regressor__alpha': 0.1}
Best Lasso score: 1.2972
Lasso Regression RMSE: 1.303

..............ElasticNet..............................
ElasticNet training time: 12.9288
Best ElasticNet parameters: {'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.1}
Best ElasticNet score: 1.2971
ElasticNet RMSE: 1.303

..............SVR..............................


In [None]:
# Disabled earlier version of the training loop, kept as a bare string literal:
# the cell only evaluates (and echoes) the string, none of the code inside runs.
# NOTE(review): the code in the string calls root_mean_squared_error, whose import
# is commented out at the top of the notebook, and labels that value "MSE".
'''
# Boucle sur les modèles pour ajuster avec GridSearchCV
results = {}

# Boucle sur les modèles pour ajuster avec GridSearchCV et évaluation
for model_name, pipeline in pipelines.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(pipeline, parameters[model_name],  scoring=scoring, refit='RMSE', cv=5, n_jobs =-1, verbose = 1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mse = root_mean_squared_error(y_test, y_pred)
    results[model_name] = mse
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best {model_name} score: {-grid_search.best_score_}")
    print(f"{model_name} MSE: {mse}")
    print()
'''

'\n# Boucle sur les modèles pour ajuster avec GridSearchCV\nresults = {}\n\n# Boucle sur les modèles pour ajuster avec GridSearchCV et évaluation\nfor model_name, pipeline in pipelines.items():\n    print(f"Training {model_name}...")\n    grid_search = GridSearchCV(pipeline, parameters[model_name],  scoring=scoring, refit=\'RMSE\', cv=5, n_jobs =-1, verbose = 1)\n    grid_search.fit(X_train, y_train)\n    best_model = grid_search.best_estimator_\n    y_pred = best_model.predict(X_test)\n    mse = root_mean_squared_error(y_test, y_pred)\n    results[model_name] = mse\n    print(f"Best parameters: {grid_search.best_params_}")\n    print(f"Best {model_name} score: {-grid_search.best_score_}")\n    print(f"{model_name} MSE: {mse}")\n    print()\n'

In [None]:
# Disabled report of per-model test errors, kept as a bare string literal: the
# cell only evaluates (and echoes) the string, none of the code inside runs.
# It reads a `results` dict that is only assigned inside the other disabled cell.
# NOTE(review): the "\n" inside this non-raw string is a real line break, so the
# print(...) call would be split across two lines if the text were pasted back as code.
'''
# Evaluation des modèles sur les données de test
print("\nComparaison des performances des modèles sur les données de test:")
for model_name, mse in results.items():
    print(f"{model_name}: MSE = {mse}")
    
'''

'\n# Evaluation des modèles sur les données de test\nprint("\nComparaison des performances des modèles sur les données de test:")\nfor model_name, mse in results.items():\n    print(f"{model_name}: MSE = {mse}")\n    \n'