## Imports & Variables

In [None]:
#Imports

import os
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import clear_output
from tqdm import tqdm

#Linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso

#Ensemble models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

#Other models
from sklearn.neighbors import KNeighborsRegressor

#Sklearn
from sklearn.datasets import _california_housing
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Settings
# Apply matplotlib's 'dark_background' style sheet to the figures drawn in this notebook.
plt.style.use('dark_background')

In [None]:
#Variables

# Global seed: defined first so the train/test split below and every estimator
# in the notebook share one value instead of repeating the literal.
global_random_state = 69

# Build the DataFrame from the dataset, using the public scikit-learn loader
# rather than the private `_california_housing` module.
dataset = fetch_california_housing(as_frame=True)
df = dataset.frame

# Target definition
target_name = "MedHouseVal"
target = df[target_name]

# Columns removed from the feature table (only the target itself here)
columns_to_drop = [
    target_name
]
data = df.drop(columns=columns_to_drop)

# Split: 20 % of the rows are held out for testing
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=global_random_state,
)

# Models to evaluate.
# Every estimator that accepts a seed receives `global_random_state`.

linear_models = [
    LinearRegression(),
    Ridge(random_state=global_random_state),
    RidgeCV(),
    Lasso(random_state=global_random_state),
]

ens_models = [
    AdaBoostRegressor(random_state=global_random_state),
    BaggingRegressor(random_state=global_random_state, n_jobs=-1),
    ExtraTreesRegressor(random_state=global_random_state, n_jobs=-1),
    GradientBoostingRegressor(random_state=global_random_state),
    RandomForestRegressor(random_state=global_random_state, n_jobs=-1),
    HistGradientBoostingRegressor(random_state=global_random_state),
]

# Ensembles wrapped in a second boosting / bagging layer.
# The outer wrapper is seeded as well: without it only the inner estimator was
# seeded and, per the scikit-learn docs, the wrapper's own random_state is what
# drives the seeds handed to each fitted sub-estimator.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` -- confirm against the version this notebook is pinned to.
boost_and_bag_models = [
    AdaBoostRegressor(
        base_estimator=BaggingRegressor(random_state=global_random_state, n_jobs=-1),
        random_state=global_random_state,
    ),
    AdaBoostRegressor(
        base_estimator=ExtraTreesRegressor(random_state=global_random_state, n_jobs=-1),
        random_state=global_random_state,
    ),
    AdaBoostRegressor(
        base_estimator=GradientBoostingRegressor(random_state=global_random_state),
        random_state=global_random_state,
    ),
    AdaBoostRegressor(
        base_estimator=RandomForestRegressor(random_state=global_random_state, n_jobs=-1),
        random_state=global_random_state,
    ),
    AdaBoostRegressor(
        base_estimator=HistGradientBoostingRegressor(random_state=global_random_state),
        random_state=global_random_state,
    ),

    BaggingRegressor(
        base_estimator=BaggingRegressor(random_state=global_random_state, n_jobs=-1),
        random_state=global_random_state,
    ),
    BaggingRegressor(
        base_estimator=ExtraTreesRegressor(random_state=global_random_state, n_jobs=-1),
        random_state=global_random_state,
    ),
    BaggingRegressor(
        base_estimator=GradientBoostingRegressor(random_state=global_random_state),
        random_state=global_random_state,
    ),
    BaggingRegressor(
        base_estimator=RandomForestRegressor(random_state=global_random_state, n_jobs=-1),
        random_state=global_random_state,
    ),
    BaggingRegressor(
        base_estimator=HistGradientBoostingRegressor(random_state=global_random_state),
        random_state=global_random_state,
    ),
]

## Fonctions

In [None]:
#Function that fits every model of a list and prints their scores
def evaluate_models(models):
    """Fit each model on the training split and print a table of test scores.

    For every estimator the function calls ``fit(data_train, target_train)``
    followed by ``score(data_test, target_test)`` and records the class name,
    the score multiplied by 100 and the wall-clock seconds spent on fit + score.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``score(X, y)``. They are fitted
        in place.

    Returns
    -------
    None
        The result table is printed, not returned.
    """
    columns = ["ModelName", "Score", "ExecTime"]

    # Rows are collected in a plain list and the DataFrame is built once;
    # concatenating onto an initially empty frame inside the loop is quadratic
    # and relies on pandas ignoring the empty frame when inferring dtypes.
    rows = []
    for model in models:
        # Start the timer
        start = time.time()

        model.fit(data_train, target_train)
        score = model.score(data_test, target_test)

        rows.append([model.__class__.__name__, score * 100, time.time() - start])

    result_df = pd.DataFrame(rows, columns=columns)

    # Display
    separator = "-" * 36
    print(separator, "\nResults :")
    print(result_df.to_string(index=False))
    print(separator)

In [None]:
#Functions that plot a model's MSE and RMSE against the training set size

#Draws every curve of one model on the same axes
def draw_curve(mse_values, rmse_values, y_lim_min, y_lim_max, model_name, graph):
    """Plot the train/test MSE and RMSE curves of one model on ``graph``.

    Parameters
    ----------
    mse_values, rmse_values : 3-item sequences
        ``(train_sizes, train_scores_mean, test_scores_mean)``. The x values of
        all four curves come from ``rmse_values``.
    y_lim_min, y_lim_max :
        Bounds passed to ``graph.set_ylim``.
    model_name :
        Used as the plot title.
    graph :
        Object exposing the ``set_title`` / ``set_xlabel`` / ``set_ylabel`` /
        ``set_ylim`` / ``plot`` / ``legend`` methods called below
        (presumably a matplotlib Axes).
    """
    _, mse_train, mse_test = mse_values
    sizes, rmse_train, rmse_test = rmse_values

    graph.set_title(model_name)
    graph.set_xlabel("Training set size")
    graph.set_ylabel("Error")
    graph.set_ylim(y_lim_min, y_lim_max)

    # Plot order must match the order of the legend labels below.
    for series in (mse_train, mse_test, rmse_train, rmse_test):
        graph.plot(sizes, series)

    graph.legend(
        labels=["train_mse", "test_mse", "train_rmse", "test_rmse"],
        loc="upper right",
    )

#Computes the points of each curve
def compute_learning_curve(model, scoring, point_amount):
    """Run ``learning_curve`` for ``model`` on the full ``data`` / ``target``.

    Parameters
    ----------
    model :
        Estimator forwarded as ``estimator``.
    scoring :
        Scorer name forwarded unchanged.
    point_amount :
        Number of training-set fractions, evenly spaced from 0.1 to 1.

    Returns
    -------
    tuple
        ``(train_sizes, train_error, test_error)`` where both error arrays are
        the per-size mean of the scores with the sign flipped.
    """
    fractions = np.linspace(0.1, 1, point_amount)

    # NOTE(review): per the scikit-learn docs `random_state` is only used when
    # `shuffle=True`; confirm whether shuffling was intended here.
    sizes, fold_train_scores, fold_test_scores = learning_curve(
        estimator=model,
        X=data,
        y=target,
        scoring=scoring,
        train_sizes=fractions,
        n_jobs=-1,
        random_state=global_random_state,
    )

    # Average over axis 1, then negate so the value reads as an error.
    mean_train_error = -np.mean(fold_train_scores, axis=1)
    mean_test_error = -np.mean(fold_test_scores, axis=1)
    return sizes, mean_train_error, mean_test_error

#Entry point: one learning-curve panel per model
def show_learning_curves(models, y_lim_min, y_lim_max, point_amount):
    """Draw the MSE / RMSE learning curves of each model side by side.

    One subplot is created per model. For each model the learning curve is
    computed twice (once per scorer) and handed to ``draw_curve``.

    Parameters
    ----------
    models : sequence
        Estimators to evaluate; ``len(models)`` fixes the number of panels.
        An empty sequence draws nothing.
    y_lim_min, y_lim_max :
        Y-axis bounds applied to every panel.
    point_amount :
        Number of training-set sizes per curve.
    """
    model_count = len(models)
    if model_count == 0:
        # plt.subplots cannot build a grid with zero columns.
        return

    # squeeze=False always yields a 2-D grid, so a single model gets an
    # indexable row instead of a bare Axes object. No separate plt.figure()
    # call: it only opened an extra, empty figure.
    fig, graphs = plt.subplots(
        1, model_count, figsize=(5 * model_count, 5), squeeze=False
    )

    # Loop over each model together with its panel
    for model, graph in zip(tqdm(models), graphs[0]):
        draw_curve(
            compute_learning_curve(model, "neg_mean_squared_error", point_amount),
            compute_learning_curve(model, "neg_root_mean_squared_error", point_amount),
            y_lim_min,
            y_lim_max,
            model.__class__.__name__,
            graph,
        )

    plt.show()


In [None]:
#Function that fits then scores
def fit_and_score(alg):
    """Fit ``alg`` on the training split and print its test score times 100.

    ``alg`` must expose ``fit(X, y)`` and ``score(X, y)``; it is fitted in
    place. Nothing is returned: the score is only printed.
    """
    alg.fit(data_train, target_train)
    result_score = 100 * alg.score(data_test, target_test)
    print(f"R2 : {result_score!s}")

In [None]:
#Function that computes the error
def evaluate_error(model):
    """Print the mean and standard deviation of the absolute prediction error.

    The error is measured on the full ``data`` / ``target`` pair (not only the
    test split) and multiplied by 100000 before being summarised.
    NOTE(review): the 100000 factor presumably converts the target unit to a
    currency amount -- confirm against the dataset description.
    """
    abs_error = np.absolute(model.predict(data) - target) * 100000
    print(f"{np.mean(abs_error)!s} +- {np.std(abs_error)!s}")

In [None]:
#Function that saves the trained model

def save_ai(model, file_name):
    """Serialise ``model`` with joblib to ``Saved AI/<file_name>.joblib``.

    The target directory is created when missing, so the first save on a fresh
    checkout no longer fails with ``FileNotFoundError``.

    Parameters
    ----------
    model :
        Object handed to ``joblib.dump``.
    file_name :
        File stem, without the ``.joblib`` extension.
    """
    path = f"Saved AI/{file_name}.joblib"
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, "wb") as fo:
        joblib.dump(model, fo)


In [None]:
#Function that loads the trained model

def load_ai(file_name):
    """Return the object stored in ``Saved AI/<file_name>.joblib``.

    NOTE(review): joblib files are presumably pickle-based, so only load files
    this notebook produced itself -- confirm before loading anything external.
    """
    path = f"Saved AI/{file_name}.joblib"
    with open(path, "rb") as fo:
        model = joblib.load(fo)
    return model

## I - Début de l'analyse des données

### 1) Premiers essais

In [None]:
#RandomForestClassifier

# Every constructor argument is spelled out, one per line, with a short
# description. The estimator is only instantiated: the fit call at the bottom
# is left disabled.
modele_rf = RandomForestClassifier(**{
    # number of trees in the forest
    "n_estimators": 5,
    # criterion used to build the trees and split their branches
    "criterion": 'gini',
    # maximum depth of the trees (number of levels in a decision tree)
    "max_depth": None,
    # minimum number of samples a node must hold to be split again
    "min_samples_split": 2,
    # minimum number of samples needed to create a leaf
    "min_samples_leaf": 1,
    # minimum fraction of the total samples needed to create a leaf
    "min_weight_fraction_leaf": 0.0,
    # maximum number of leaves
    "max_leaf_nodes": None,
    # minimum decrease of the impurity criterion required to split
    "min_impurity_decrease": 0.0,
    # whether to bootstrap; when False the same sample is used for every tree
    "bootstrap": True,
    # undocumented by the original author; presumably toggles out-of-bag
    # scoring -- see the scikit-learn docs
    "oob_score": False,
    # number of jobs to run in parallel
    "n_jobs": None,
    # random seed
    "random_state": None,
    # undocumented by the original author; presumably the logging verbosity
    "verbose": 0,
    # restart from the result of the previous fit instead of starting afresh
    "warm_start": False,
    # weights associated with each class, when that makes sense
    "class_weight": None,
    # undocumented by the original author; presumably the cost-complexity
    # pruning strength -- see the scikit-learn docs
    "ccp_alpha": 0.0,
    # set it to shrink the number of observations in each bootstrap sample
    "max_samples": None,
})

# Fit deliberately left disabled.
# NOTE(review): `target` is a continuous value, which a classifier would
# presumably reject -- confirm before re-enabling.
#modele_rf.fit(data, target)

In [None]:
#KNeighborsRegressor

# Columns left out of the training features (the target plus five others)
knn_excluded_columns = [target_name, "AveOccup", "Population", "HouseAge", "AveBedrms", "AveRooms"]
KNeighborsRegressor_data = df.drop(columns=knn_excluded_columns)

# Training (default hyper-parameters)
KNeighborsRegressor_model = KNeighborsRegressor().fit(KNeighborsRegressor_data, target)

# Predictions are made on the very rows used for fitting.
# NOTE(review): this is an in-sample check, not a generalisation estimate.
target_predicted = KNeighborsRegressor_model.predict(KNeighborsRegressor_data)

first_targets = target[:5]
first_predictions = target_predicted[:5]
print(first_targets)
print(first_predictions)

# NOTE(review): counts exact float equality between target and prediction,
# which is a classification-style metric -- consider an error measure instead.
exact_matches = (first_targets == first_predictions).sum()
print(f"Number of correct prediction: {exact_matches} / 5")

### 2) Modèles linéaires

In [None]:
# Fit and score every linear model on the train/test split

evaluate_models(linear_models)

In [None]:
# MSE and RMSE learning curves of every linear model (50 training-set sizes per curve)

show_learning_curves(linear_models, y_lim_min=-0.5, y_lim_max=2, point_amount=50)

### 3) Ensembles

In [None]:
# Fit and score every ensemble model on the train/test split

evaluate_models(ens_models)

In [None]:
# MSE and RMSE learning curves of every ensemble model (10 training-set sizes per curve)

show_learning_curves(ens_models, y_lim_min=0, y_lim_max=2, point_amount=10)

## III - Recherche des meilleurs hyperparamètres

### 1) Optimisation du HistGradientBoostingRegressor

In [None]:
# Score of the baseline model: default hyper-parameters, only the seed is set

base_model = HistGradientBoostingRegressor(random_state=global_random_state)
fit_and_score(base_model)

In [None]:
# Search over "max_iter", "max_depth", "learning_rate", "min_samples_leaf", "max_leaf_nodes"

model = HistGradientBoostingRegressor(
    random_state=global_random_state
)

parameters = {
    "max_iter": range(10, 401, 10),
    "max_depth": range(5, 31, 2),
    "learning_rate": [0.1, 1],
    "min_samples_leaf": range(10, 121, 10),
    "max_leaf_nodes": range(20, 41, 2),
}

# The search itself is seeded too: without `random_state` the 2000 sampled
# candidates differ on every run, so the selected hyper-parameters (and every
# number derived from them below) could not be reproduced.
best_model = RandomizedSearchCV(
    estimator=model,
    param_distributions=parameters,
    n_iter=2000,
    n_jobs=-1,
    random_state=global_random_state,
)

# Fit the search on the training split and print its test score
fit_and_score(best_model)

# Print the absolute error of the refitted best estimator
evaluate_error(best_model.best_estimator_)

# Kept for the AdaBoostRegressor cells further down
best_hist = best_model.best_estimator_

# Show the selected hyper-parameters
print(best_model.best_params_)

### 2) Optimisation du AdaBoostRegressor(HistGradientBoostingRegressor)

In [None]:
# Baseline score of the model combination: AdaBoost on top of the tuned
# HistGradientBoostingRegressor, with AdaBoost's own defaults.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` -- confirm against the version this notebook is pinned to.

base_boosted_model = AdaBoostRegressor(base_estimator=best_hist, random_state=global_random_state)

# Fit on the training split and print the test score
fit_and_score(base_boosted_model)

# Print the absolute error
evaluate_error(base_boosted_model)


In [None]:
# Search over "n_estimators" & "learning_rate"
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` -- confirm against the version this notebook is pinned to.
boosted_model = AdaBoostRegressor(
    random_state=global_random_state,
    base_estimator=best_hist,
)

parameters = {
    "n_estimators": range(20, 200, 10),
    "learning_rate": [0.1, 1],
}

# The search is seeded as well, so the 30 sampled candidates -- and therefore
# the model that gets saved below -- are the same on every run.
best_boosted_model = RandomizedSearchCV(
    estimator=boosted_model,
    param_distributions=parameters,
    n_iter=30,
    n_jobs=-1,
    random_state=global_random_state,
)

# Fit
best_boosted_model.fit(data_train, target_train)

# Print the test score; `final_score` is reused as the file name when saving
final_score = best_boosted_model.score(data_test, target_test) * 100
print("R2 : " + str(final_score))

# Print the absolute error of the refitted best estimator
evaluate_error(best_boosted_model.best_estimator_)

# Show the selected hyper-parameters
print(best_boosted_model.best_params_)

## IV - Sauvegarde de l'IA

In [None]:
# Save the fitted search object; the file name is the string form of `final_score`

save_ai(best_boosted_model, file_name=str(final_score))

## V - Test de l'IA

In [None]:
# Test

# Reload the model saved under the score-derived file name
ai = load_ai(str(final_score))

# Predict on the full feature table and pair each prediction with its target.
# 'Ecart' is the absolute gap between the two, multiplied by 100000.
predicted = ai.predict(data)
d = dict(
    Predicted=predicted,
    Original=target,
    Ecart=np.absolute(predicted - target) * 100000,
)

# Store and display the result
results_dataframe = pd.DataFrame(data=d)
print(results_dataframe)

# Summary statistics of the gap: minimum, mean, median
for stat in (np.min, np.average, np.median):
    print(stat(results_dataframe["Ecart"]))