In [None]:
# Imports
import time
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import _california_housing
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.ensemble import *

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Settings
plt.style.use('dark_background')

# Dataset columns
#   - MedInc        median income in block group
#   - HouseAge      median house age in block group
#   - AveRooms      average number of rooms per household
#   - AveBedrms     average number of bedrooms per household
#   - Population    block group population
#   - AveOccup      average number of household members
#   - Latitude      block group latitude
#   - Longitude     block group longitude
#   - MedHouseVal   median house value

# Build the dataframe from the dataset. The public scikit-learn loader is used
# instead of reaching into the private `_california_housing` module, whose
# location is an implementation detail of the library.
df = fetch_california_housing(as_frame=True).frame


In [None]:
# Variables

# Number of fit/score rounds per model (the spelling `attemps` is the name
# TestModel reads, so it is kept as is)
attemps = 5

# Target definition: the column to predict and its values
target_name = "MedHouseVal"
target = df[target_name]

# Columns excluded from the feature matrix for this test: the target itself
# plus the features being left out
# (presumably this leaves MedInc, Latitude and Longitude -- verify against df)
columns_to_drop = [target_name, "Population", "AveOccup", "AveBedrms", "HouseAge", "AveRooms"]
data = df.drop(columns_to_drop, axis=1)

# Ensemble models to test
models = [
    AdaBoostRegressor(),
    BaggingRegressor(n_jobs=-1),
    ExtraTreesRegressor(n_jobs=-1),
    GradientBoostingRegressor(),
    RandomForestRegressor(n_jobs=-1),
    HistGradientBoostingRegressor(),
]

# Linear models to test
linear_models = [LinearRegression(), Ridge(), RidgeCV(), SGDRegressor(), Lasso()]

## Functions

In [None]:
#Traitement des donnees

def TestModel(models, n_attempts=None):
    """Fit and score every model several times, then print a summary table.

    The module-level ``data`` and ``target`` are split once (25 % held out,
    fixed ``random_state``) and that same split is reused for every model and
    every attempt. Any spread between attempts therefore comes from the
    estimator's own fitting, not from the data.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``score(X, y)``. They are fitted
        in place.
    n_attempts : int, optional
        Number of fit/score rounds per model. Defaults to the module-level
        ``attemps`` setting.

    Raises
    ------
    ValueError
        If the number of attempts is lower than 1 (no statistic could be
        computed).

    Returns
    -------
    None
        The results are printed: per model, the mean and best value of
        ``model.score`` (multiplied by 100) and the mean duration in seconds
        of one fit + score round.
    """
    if n_attempts is None:
        n_attempts = attemps
    if n_attempts < 1:
        raise ValueError(f"n_attempts must be at least 1, got {n_attempts}")

    result_columns = ["ModelName", "AvgScore", "MaxScore", "AvgExecTime"]

    # The split is seeded, hence identical on every call: compute it once so it
    # is neither repeated nor counted in the measured execution time.
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.25, random_state=69
    )

    # One record per model; the dataframe is built once at the end instead of
    # being grown with pd.concat inside the loop.
    records = []

    for model in models:

        # Display
        model_name = model.__class__.__name__
        print(f"{model_name}")

        score_results = []
        time_results = []

        for _ in tqdm(range(n_attempts)):

            # perf_counter is a monotonic clock meant for measuring durations
            start = time.perf_counter()

            model.fit(data_train, target_train)
            score = model.score(data_test, target_test)

            score_results.append(score)
            time_results.append(time.perf_counter() - start)

        # Statistics for this model
        avg_score = round(np.average(score_results) * 100, 3)
        max_score = round(np.max(score_results) * 100, 3)
        avg_time = round(np.average(time_results), 3)

        records.append([model_name, avg_score, max_score, avg_time])

    result_df = pd.DataFrame(records, columns=result_columns)

    # Display
    print("-" * 62, "\nResults :")
    print(result_df.to_string(index=False))
    print("-" * 62)

In [None]:
#Learning Curve (Subplots MatPlotLib)

def subplots_learnings_curves(
    models,
    data,
    target
):
    """Draw one learning-curve subplot per model, side by side.

    For each model, ``learning_curve`` is run on ``data``/``target`` at 10
    training-set fractions evenly spaced from 10 % to 100 %, and the mean
    training and test scores over the cross-validation folds are plotted
    against the number of training examples.

    Parameters
    ----------
    models : sequence
        Estimators accepted by ``learning_curve``; must support ``len``.
    data : array-like
        Feature matrix passed as ``X``.
    target : array-like
        Target values passed as ``y``.

    Returns
    -------
    module
        ``matplotlib.pyplot``, so the caller can call ``show()`` on it.

    Raises
    ------
    ValueError
        If ``models`` is empty (there is nothing to plot).
    """
    if len(models) == 0:
        raise ValueError("models must contain at least one estimator")

    n_jobs = -1
    points_amount = 10
    train_fractions = np.linspace(0.1, 1, points_amount)

    # squeeze=False keeps `graphs` a 2-D array even with a single model;
    # otherwise plt.subplots returns a bare Axes and indexing it fails.
    _, graphs = plt.subplots(
        1, len(models), figsize=(5 * len(models), 5), squeeze=False
    )

    for graph, model in tqdm(list(zip(graphs[0], models))):
        graph.set_title(model.__class__.__name__)
        graph.set_xlabel("Training examples")
        graph.set_ylabel("Score")

        train_sizes, train_scores, test_scores = learning_curve(
            estimator=model,
            X=data,
            y=target,
            n_jobs=n_jobs,
            train_sizes=train_fractions,
        )

        # Average over the cross-validation folds (axis 1)
        train_scores_mean = np.mean(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)

        # Label both curves so they can be told apart in the legend
        graph.plot(train_sizes, train_scores_mean, label="Training score")
        graph.plot(train_sizes, test_scores_mean, label="Test score")
        graph.legend()

    return plt

In [None]:
#Learning Curve (Plot Seaborn)

def plot_learning_curve(
    models,
    data,
    target
):
    """Overlay the mean test-score learning curve of every model on one axes.

    For each model, ``learning_curve`` is run on ``data``/``target`` at 10
    training-set fractions evenly spaced from 10 % to 100 %. The test scores
    are averaged over the cross-validation folds and drawn with seaborn on the
    current matplotlib axes, one labeled line per model.

    Parameters
    ----------
    models : iterable
        Estimators accepted by ``learning_curve``.
    data : array-like
        Feature matrix passed as ``X``.
    target : array-like
        Target values passed as ``y``.

    Returns
    -------
    module
        ``matplotlib.pyplot``, so the caller can call ``show()`` on it.
    """
    n_jobs = -1
    points_amount = 10
    train_fractions = np.linspace(0.1, 1, points_amount)

    # Plain (name, sizes, mean scores) tuples: no need for a DataFrame grown
    # row by row with arrays stored inside its cells.
    curves = []
    for model in tqdm(models):
        train_sizes, _, test_scores = learning_curve(
            estimator=model,
            X=data,
            y=target,
            n_jobs=n_jobs,
            train_sizes=train_fractions,
        )
        # Average over the cross-validation folds (axis 1)
        test_scores_mean = np.mean(test_scores, axis=1)
        curves.append((model.__class__.__name__, train_sizes, test_scores_mean))

    ax = plt.gca()
    for model_name, train_sizes, test_scores_mean in curves:
        # Each line carries its own label, so the legend cannot get out of
        # step with the order in which artists were added.
        sns.lineplot(x=train_sizes, y=test_scores_mean, label=model_name, ax=ax)

    ax.set_title("Learning curves (mean test score)")
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    if curves:
        ax.legend()

    return plt

## Calls

In [None]:
# Run the benchmark: ensemble models first, then linear models

for candidate_models in (models, linear_models):
    TestModel(candidate_models)

In [None]:
# Draw the learning curves, one Matplotlib subplot per model

subplots_learnings_curves(models=models, data=data, target=target)
plt.show()

In [None]:
# Draw the learning curves of all models on a single Seaborn plot

plot_learning_curve(models=models, data=data, target=target)
plt.show()