In [2]:
# +------------+
#  SUUUUUUUUUUU
# +------------+

#Imports
import time
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import _california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.ensemble import *

#Settings
plt.style.use('dark_background')

#Colonnes du dataset
"""
- MedInc        median income in block group
- HouseAge      median house age in block group
- AveRooms      average number of rooms per household
- AveBedrms     average number of bedrooms per household
- Population    block group population
- AveOccup      average number of household members
- Latitude      block group latitude
- Longitude     block group longitude
- MedHouseVal   median house value
"""

#Creation du dataframe avec le dataset
df = _california_housing.fetch_california_housing(as_frame=True).frame


In [None]:
#Heatmap

data = df.drop(columns=["Latitude", "Longitude"])
plt.figure(figsize=(12,7))
sns.heatmap(data.corr(),cbar=True,annot=True,cmap='Blues')

In [13]:
#Traitement des donnees

#Definition de la target
target_name = "MedHouseVal"
target = df[target_name]

#Colonnes exclues pour le test
columns_to_drop = [
    target_name, 
    "Population", 
    "AveOccup", 
    "AveBedrms", 
    "HouseAge", 
    "AveRooms"
]
data = df.drop(columns=columns_to_drop)

#Modeles a tester
models = [
    AdaBoostRegressor(), 
    BaggingRegressor(n_jobs=-1), 
    ExtraTreesRegressor(n_jobs=-1), 
    GradientBoostingRegressor(), 
    RandomForestRegressor(n_jobs=-1), 
    HistGradientBoostingRegressor()
]

linear_models = [
    LinearRegression(),
    Ridge(),
    RidgeCV(),
    SGDRegressor(),
    Lasso(),
]

#La fonction
def TestModel(models, attemps):

    #Initialisation de la dataframe des resultats
    result_df = pd.DataFrame(columns=["ModelName","AvgScore","MaxScore","AvgExecTime"])

    for model in models:

        #Affichage
        model_name = model.__class__.__name__
        print(f"{model_name}")

        #Initialisation des listes
        score_results = []
        time_results = []
        
        for i in tqdm(range(attemps)):

            #Demarrage du timer
            start = time.time()

            #Split et Fit
            data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.25, random_state=np.random.randint(0,100))
            model.fit(data_train, target_train)

            #Calcul du score
            score = model.score(data_test, target_test)

            #Sauvegarde des resultats dans les listes
            score_results.append(score)
            time_results.append(time.time() - start)

        #Calcul des stats
        avg_score = round(np.average(score_results) * 100, 3)
        max_score = round(np.max(score_results) * 100, 3)
        avg_time = round(np.average(time_results), 3)

        #Sauvegarde des resultats dans la dataframe
        new_row = pd.DataFrame([[model_name, avg_score, max_score, avg_time]], columns=["ModelName","AvgScore","MaxScore","AvgExecTime"])
        result_df = pd.concat([result_df, new_row])

    #Affichage
    print(f"-"*62, "\nResults :")
    print(result_df.to_string(index=False))
    print(f"-"*62)

#Call de la fonction
TestModel(models,30)
TestModel(linear_models,30)


AdaBoostRegressor


100%|██████████| 30/30 [00:05<00:00,  5.04it/s]


BaggingRegressor


100%|██████████| 30/30 [00:07<00:00,  3.97it/s]


ExtraTreesRegressor


100%|██████████| 30/30 [00:09<00:00,  3.14it/s]


GradientBoostingRegressor


100%|██████████| 30/30 [00:21<00:00,  1.39it/s]


RandomForestRegressor


100%|██████████| 30/30 [00:12<00:00,  2.38it/s]


HistGradientBoostingRegressor


100%|██████████| 30/30 [00:10<00:00,  2.95it/s]


-------------------------------------------------------------- 
Results :
                    ModelName  AvgScore  MaxScore  AvgExecTime
            AdaBoostRegressor    40.507    48.096        0.198
             BaggingRegressor    81.164    82.623        0.251
          ExtraTreesRegressor    81.358    82.856        0.317
    GradientBoostingRegressor    76.138    77.688        0.720
        RandomForestRegressor    82.608    84.384        0.420
HistGradientBoostingRegressor    81.915    83.245        0.338
--------------------------------------------------------------
LinearRegression


100%|██████████| 30/30 [00:00<00:00, 201.33it/s]


Ridge


100%|██████████| 30/30 [00:00<00:00, 309.29it/s]


RidgeCV


100%|██████████| 30/30 [00:00<00:00, 206.90it/s]


SGDRegressor


100%|██████████| 30/30 [00:08<00:00,  3.47it/s]


Lasso


100%|██████████| 30/30 [00:00<00:00, 260.86it/s]

-------------------------------------------------------------- 
Results :
       ModelName      AvgScore      MaxScore  AvgExecTime
LinearRegression  5.816400e+01  6.052500e+01        0.005
           Ridge  5.851700e+01  6.038100e+01        0.003
         RidgeCV  5.824000e+01  6.038100e+01        0.005
    SGDRegressor -2.272657e+26 -8.896923e+25        0.288
           Lasso  2.646900e+01  2.718300e+01        0.004
--------------------------------------------------------------



