In [None]:
# Package imports
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from category_encoders.count import CountEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures

In [None]:
# Load the cleaned incident database (semicolon-separated CSV)
df = pd.read_csv('Pyrefighter_cleaned_database.csv', sep = ';')

# Keep only the rows of the first truck to arrive on scene.
# .copy() detaches the filtered frame from the one returned by read_csv, so the
# column assignment further down does not write into a slice
# (avoids pandas' SettingWithCopyWarning).
df = df[df["PumpOrder"] == 1].copy()

# Columns considered irrelevant; they are dropped by the ColumnTransformer
# built in the modelling cells
to_drop = ['IncidentNumber',
           'CalYear',
           'AddressQualifier',
           'FirstPumpArriving_DeployedFromStation',
           'PumpOrder']

# Qualitative (categorical) variables
categorical = ['PropertyCategory', 'IncGeo_BoroughName', 'IncidentStationGround',
               'DeployedFromStation_Name', 'DeployedFromLocation','HourMobilised', 'WeekdayMobilised', 'MonthMobilised']

# Cast the categorical variables to strings
df[categorical] = df[categorical].astype(str)

# The 3 variables whose interaction is expected to affect the target.
# Not used for the tree-based models.
to_poly = ['HourMobilised', 'WeekdayMobilised', 'CityCenter']

In [None]:
# LINEAR REGRESSION - HYPER-PARAMETER SEARCH

# Work on a 10,000-row sample to keep the grid search tractable
df_sample = df.sample(n = 10000, random_state = 42)

# For each categorical variable, merge the categories with fewer than 100 rows
# in the sample into a single 'Other' category (one vectorised pass per column
# instead of one .replace call per rare value).
for col in categorical:
    counts = df_sample[col].value_counts()
    rare_values = counts[counts < 100].index
    df_sample[col] = df_sample[col].mask(df_sample[col].isin(rare_values), 'Other')

# Split the target from the explanatory variables
target_sample = df_sample['ResponseTimeMinute']
data_sample = df_sample.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the sample
X_train, X_test, y_train, y_test = train_test_split(data_sample, target_sample, test_size = 0.2, random_state = 1)

# Interaction pipeline for the 3 selected variables.
# NOTE(review): PolynomialFeatures outputs numeric columns; confirm that the
# trailing TargetEncoder actually encodes anything at this point.
poly_features = Pipeline(steps = [("ohe", OneHotEncoder()),
                                  ("poly", PolynomialFeatures(include_bias = False,
                                                              interaction_only = True)),
                                  ("target", TargetEncoder())])

# Drop the unused columns, build the interaction features and encode the
# categorical variables (target encoding + count encoding)
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("poly_features", poly_features, to_poly),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

# SelectKBest defaults to f_classif, a score designed for class labels.
# This is a regression task, so the univariate score must be f_regression.
selector = SelectKBest(score_func = f_regression)
linreg = LinearRegression()

linreg_pipe = Pipeline([('preprocesser', preprocesser),
                        ('selection', selector),
                        ('model', linreg)])

# Parameters explored by GridSearchCV
param_grid = {
    'model__fit_intercept' : [True, False],
    'preprocesser__poly_features__poly__degree' : [2, 3],
    'preprocesser__count_encode__min_group_size' : [10, 50, 100]
}

# Search for the best parameters.
# The `iid` argument was removed from GridSearchCV (scikit-learn 0.24) and
# raises a TypeError on current versions, so it is no longer passed.
grid = GridSearchCV(estimator = linreg_pipe, param_grid = param_grid, cv = 5, n_jobs = -1)

grid.fit(X_train, y_train)

print('Les meilleurs paramètres trouvés sont :', grid.best_params_)

In [None]:
# LINEAR REGRESSION - RUN THE MODEL WITH THE SELECTED PARAMETERS

# Measure the execution time of the whole cell (split, fit, predict, scoring)
t0 = time()

# Split the target from the explanatory variables
target = df['ResponseTimeMinute']
data = df.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the full dataset
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 1)

# Interaction pipeline for the 3 selected variables
poly_features = Pipeline(steps = [("ohe", OneHotEncoder()),
                                  ("poly", PolynomialFeatures(degree = 2,
                                                              include_bias = False,
                                                              interaction_only = True)),
                                  ("target", TargetEncoder())])

# Drop the unused columns, build the interaction features and encode the
# categorical variables (target encoding + count encoding)
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("poly_features", poly_features, to_poly),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

# Selected parameters
params = {'fit_intercept': False}
linreg = LinearRegression(**params)

linreg_pipe = Pipeline([('preprocesser', preprocesser),
                        ('model', linreg)])

linreg_pipe.fit(X_train, y_train)

# Predict each split exactly once and reuse the result for every metric;
# re-running predict for each print only inflated the measured time.
y_pred_train = linreg_pipe.predict(X_train)
y_pred = linreg_pipe.predict(X_test)

# Mean of the target over the full dataset, used to express the MAE as a ratio
ResponseTimeMinute_mean = df['ResponseTimeMinute'].mean()
mae_test = mean_absolute_error(y_test, y_pred)

print('Score MSE train :', mean_squared_error(y_train, y_pred_train))
print('Score MAE train (en minute) :', mean_absolute_error(y_train, y_pred_train))
print('Score MSE test', mean_squared_error(y_test, y_pred))
print('Score MAE test (en minute)', mae_test)
print("\nRelative test error:", mae_test/ResponseTimeMinute_mean)

t1 = time() - t0
print("Réalisé en {} secondes".format(round(t1,3)))

In [None]:
# RIDGE REGRESSOR - HYPER-PARAMETER SEARCH

# Work on a 10,000-row sample to keep the grid search tractable
df_sample = df.sample(n = 10000, random_state = 42)

# For each categorical variable, merge the categories with fewer than 100 rows
# in the sample into a single 'Other' category (one vectorised pass per column
# instead of one .replace call per rare value).
for col in categorical:
    counts = df_sample[col].value_counts()
    rare_values = counts[counts < 100].index
    df_sample[col] = df_sample[col].mask(df_sample[col].isin(rare_values), 'Other')

# Split the target from the explanatory variables
target_sample = df_sample['ResponseTimeMinute']
data_sample = df_sample.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the sample
X_train, X_test, y_train, y_test = train_test_split(data_sample, target_sample, test_size = 0.2, random_state = 1)

# Interaction pipeline for the 3 selected variables
poly_features = Pipeline(steps = [("ohe", OneHotEncoder()),
                                  ("poly", PolynomialFeatures(degree = 2,
                                                              include_bias = False,
                                                              interaction_only = True)),
                                  ("target", TargetEncoder())])

# Drop the unused columns, build the interaction features and encode the
# categorical variables (target encoding + count encoding)
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("poly_features", poly_features, to_poly),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

ridge_reg = Ridge(random_state = 42)

ridge_reg_pipe = Pipeline([('preprocesser', preprocesser),
                           ('model', ridge_reg)])

# Parameters explored by GridSearchCV
param_grid = {
    'preprocesser__poly_features__poly__degree' : [2, 3],
    'preprocesser__count_encode__min_group_size' : [10, 50, 100],
    'model__alpha': [float(x) for x in np.linspace(start = 0.000001, stop = 10, num = 10)]
}

# The `iid` argument was removed from GridSearchCV (scikit-learn 0.24) and
# raises a TypeError on current versions, so it is no longer passed.
grid = GridSearchCV(estimator = ridge_reg_pipe, param_grid = param_grid, cv = 5, n_jobs = -1)

grid.fit(X_train, y_train)
print('Les meilleurs paramètres trouvés sont :', grid.best_params_)

In [None]:
# RIDGE REGRESSOR - RUN THE MODEL WITH THE SELECTED PARAMETERS

# Measure the execution time of the whole cell (split, fit, predict, scoring)
t0 = time()

# Split the target from the explanatory variables
target = df['ResponseTimeMinute']
data = df.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the full dataset
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 1)

# Interaction pipeline for the 3 selected variables
poly_features = Pipeline(steps = [("ohe", OneHotEncoder()),
                                  ("poly", PolynomialFeatures(degree = 2,
                                                              include_bias = False,
                                                              interaction_only = True)),
                                  ("target", TargetEncoder())])

# Drop the unused columns, build the interaction features and encode the
# categorical variables (target encoding + count encoding)
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("poly_features", poly_features, to_poly),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

# Selected parameters
params = {'alpha': 10.0, 'random_state':  42}
ridge_reg = Ridge(**params)

ridge_reg_pipe = Pipeline([('preprocesser', preprocesser),
                           ('model', ridge_reg)])

ridge_reg_pipe.fit(X_train, y_train)

# Predict each split exactly once and reuse the result for every metric;
# re-running predict for each print only inflated the measured time.
y_pred_train = ridge_reg_pipe.predict(X_train)
y_pred_test = ridge_reg_pipe.predict(X_test)

# Mean of the target over the full dataset, used to express the MAE as a ratio
ResponseTimeMinute_mean = df['ResponseTimeMinute'].mean()
mae_test = mean_absolute_error(y_test, y_pred_test)

print('Score MSE train :', mean_squared_error(y_train, y_pred_train))
print('Score MAE train (en minute) :', mean_absolute_error(y_train, y_pred_train))
print('Score MSE test', mean_squared_error(y_test, y_pred_test))
print('Score MAE test (en minute)', mae_test)
print("\nRelative test error:", mae_test/ResponseTimeMinute_mean)

t1 = time() - t0
print("Réalisé en {} secondes".format(round(t1,3)))

In [None]:
# LASSO REGRESSOR - HYPER-PARAMETER SEARCH

# Work on a 10,000-row sample to keep the grid search tractable
df_sample = df.sample(n = 10000, random_state = 42)

# For each categorical variable, merge the categories with fewer than 100 rows
# in the sample into a single 'Other' category (one vectorised pass per column
# instead of one .replace call per rare value).
for col in categorical:
    counts = df_sample[col].value_counts()
    rare_values = counts[counts < 100].index
    df_sample[col] = df_sample[col].mask(df_sample[col].isin(rare_values), 'Other')

# Split the target from the explanatory variables
target_sample = df_sample['ResponseTimeMinute']
data_sample = df_sample.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the sample
X_train, X_test, y_train, y_test = train_test_split(data_sample, target_sample, test_size = 0.2, random_state = 1)

# Interaction pipeline for the 3 selected variables
poly_features = Pipeline(steps = [("ohe", OneHotEncoder()),
                                  ("poly", PolynomialFeatures(degree = 2,
                                                              include_bias = False,
                                                              interaction_only = True)),
                                  ("target", TargetEncoder())])

# Drop the unused columns, build the interaction features and encode the
# categorical variables (target encoding + count encoding)
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("poly_features", poly_features, to_poly),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

lasso_reg = Lasso(random_state = 42)

lasso_reg_pipe = Pipeline([('preprocesser', preprocesser),
                           ('model', lasso_reg)])

# Parameters explored by GridSearchCV
param_grid = {
    'preprocesser__poly_features__poly__degree' : [2, 3],
    'preprocesser__count_encode__min_group_size' : [10, 50, 100],
    'model__alpha': [float(x) for x in np.linspace(start = 0.000001, stop = 10, num = 10)]
}

# The `iid` argument was removed from GridSearchCV (scikit-learn 0.24) and
# raises a TypeError on current versions, so it is no longer passed.
grid = GridSearchCV(estimator = lasso_reg_pipe, param_grid = param_grid, cv = 5, n_jobs = -1)

grid.fit(X_train, y_train)
print('Les meilleurs paramètres trouvés sont :', grid.best_params_)

In [None]:
# LASSO REGRESSOR - RUN THE MODEL WITH THE SELECTED PARAMETERS

# Measure the execution time of the whole cell (split, fit, predict, scoring)
t0 = time()

# Split the target from the explanatory variables
target = df['ResponseTimeMinute']
data = df.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the full dataset
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 1)

# Interaction pipeline for the 3 selected variables
poly_features = Pipeline(steps = [("ohe", OneHotEncoder()),
                                  ("poly", PolynomialFeatures(degree = 2,
                                                              include_bias = False,
                                                              interaction_only = True)),
                                  ("target", TargetEncoder())])

# Drop the unused columns, build the interaction features and encode the
# categorical variables (target encoding + count encoding)
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("poly_features", poly_features, to_poly),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

# Selected parameters
params = {'alpha': 1e-06, 'random_state':  42}
lasso_reg = Lasso(**params)

lasso_reg_pipe = Pipeline([('preprocesser', preprocesser),
                           ('model', lasso_reg)])

lasso_reg_pipe.fit(X_train, y_train)

# Predict each split exactly once and reuse the result for every metric;
# re-running predict for each print only inflated the measured time.
y_pred_train = lasso_reg_pipe.predict(X_train)
y_pred_test = lasso_reg_pipe.predict(X_test)

# Mean of the target over the full dataset, used to express the MAE as a ratio
ResponseTimeMinute_mean = df['ResponseTimeMinute'].mean()
mae_test = mean_absolute_error(y_test, y_pred_test)

print('Score MSE train :', mean_squared_error(y_train, y_pred_train))
print('Score MAE train (en minute) :', mean_absolute_error(y_train, y_pred_train))
print('Score MSE test', mean_squared_error(y_test, y_pred_test))
print('Score MAE test (en minute)', mae_test)
print("\nRelative test error:", mae_test/ResponseTimeMinute_mean)

t1 = time() - t0
print("Réalisé en {} secondes".format(round(t1,3)))

In [None]:
# GRADIENT BOOSTING REGRESSOR - HYPER-PARAMETER SEARCH

# Work on a 10,000-row sample to keep the grid search tractable
df_sample = df.sample(n = 10000, random_state = 42)

# For each categorical variable, merge the categories with fewer than 100 rows
# in the sample into a single 'Other' category (one vectorised pass per column
# instead of one .replace call per rare value).
for col in categorical:
    counts = df_sample[col].value_counts()
    rare_values = counts[counts < 100].index
    df_sample[col] = df_sample[col].mask(df_sample[col].isin(rare_values), 'Other')

# Split the target from the explanatory variables
target_sample = df_sample['ResponseTimeMinute']
data_sample = df_sample.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the sample
X_train, X_test, y_train, y_test = train_test_split(data_sample, target_sample, test_size = 0.2, random_state = 1)

# Drop the unused columns and encode the categorical variables
# (target encoding + count encoding); no interaction features for tree models
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

gbr = GradientBoostingRegressor(random_state = 42)

gbr_pipe = Pipeline([('preprocesser', preprocesser),
                     ('model', gbr)])

# Candidate depths: same 10-point grid over [0, 100] as before, minus 0,
# which is not a valid max_depth and made every such fit fail.
max_depth_grid = [depth for depth in (int(x) for x in np.linspace(0, 100, num = 10)) if depth > 0]

# Parameters explored by GridSearchCV
param_grid = {
    'preprocesser__count_encode__min_group_size' : [10, 50, 100],
    'model__n_estimators': [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)],
    # 'auto' meant "use all features" for this estimator and has been removed
    # from scikit-learn; None is the equivalent supported value.
    'model__max_features': [None],
    'model__max_depth': max_depth_grid,
    # An integer min_samples_split must be at least 2 (1 is rejected at fit time)
    'model__min_samples_split': [2, 10, 100]
}

# The `iid` argument was removed from GridSearchCV (scikit-learn 0.24) and
# raises a TypeError on current versions, so it is no longer passed.
grid = GridSearchCV(estimator = gbr_pipe, param_grid = param_grid, cv = 5, n_jobs = -1)
grid.fit(X_train, y_train)

print('Les meilleurs paramètres trouvés sont :', grid.best_params_)

In [None]:
# GRADIENT BOOSTING REGRESSOR - RUN THE MODEL WITH THE SELECTED PARAMETERS

# Measure the execution time of the whole cell (split, fit, predict, scoring)
t0 = time()

# Split the target from the explanatory variables
target = df['ResponseTimeMinute']
data = df.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the full dataset
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 1)

# Drop the unused columns and encode the categorical variables
# (target encoding + count encoding)
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

# Selected parameters.
# max_features = None replaces the removed 'auto' option; for this estimator
# both mean "consider all features at each split".
params = {'max_depth': 11, 'max_features': None, 'min_samples_split': 100, 'n_estimators': 10, 'random_state': 42}

gbr = GradientBoostingRegressor(**params)

gbr_pipe = Pipeline([('preprocesser', preprocesser),
                     ('model', gbr)])

gbr_pipe.fit(X_train, y_train)

# Predict each split exactly once and reuse the result for every metric;
# re-running predict for each print only inflated the measured time.
y_pred_train = gbr_pipe.predict(X_train)
y_pred_test = gbr_pipe.predict(X_test)

# Mean of the target over the full dataset, used to express the MAE as a ratio
ResponseTimeMinute_mean = df['ResponseTimeMinute'].mean()
mae_test = mean_absolute_error(y_test, y_pred_test)

print('Score MSE train :', mean_squared_error(y_train, y_pred_train))
print('Score MAE train (en minute) :', mean_absolute_error(y_train, y_pred_train))
print('Score MSE test', mean_squared_error(y_test, y_pred_test))
print('Score MAE test (en minute)', mae_test)
print("\nRelative test error:", mae_test/ResponseTimeMinute_mean)
t1 = time() - t0
print("Réalisé en {} secondes".format(round(t1,3)))

In [None]:
# Visualisation of the GRADIENT BOOSTING REGRESSOR model

# Split the target from the explanatory variables
target = df['ResponseTimeMinute']
data = df.drop(['ResponseTimeMinute'], axis = 1)

# Train / test split of the full dataset
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 1)

# Drop the unused columns and encode the categorical variables
# (target encoding + count encoding)
preprocesser = ColumnTransformer(transformers = [("drop_columns", 'drop', to_drop),
                                                 ("target_encode", TargetEncoder(), categorical),
                                                 ("count_encode", CountEncoder(min_group_size = 10), categorical)],
                                 remainder = 'drop')

# Selected parameters.
# max_features = None replaces the removed 'auto' option; for this estimator
# both mean "consider all features at each split".
params = {'max_depth': 11, 'max_features': None, 'min_samples_split': 100, 'n_estimators': 10, 'random_state': 42}

gbr = GradientBoostingRegressor(**params)

gbr_pipe = Pipeline([('preprocesser', preprocesser),
                     ('model', gbr)])

gbr_pipe.fit(X_train, y_train)

y_pred_test = gbr_pipe.predict(X_test)

# Attach the predictions to the test rows (1-D array, aligned positionally
# with X_test)
df_ML = X_test.assign(ResponseTimeMinute = y_pred_test)

# Incidents shown on the chart
incident_ids = ["169070-18122017",
                "010154-24012020",
                "087083-16072020",
                "066277-26052017",
                "062086-17052017"]

# Select the real and the predicted values with the SAME mask over the test
# set. Taking the real values from df and the predictions from the shuffled
# X_test paired the bars by position across two differently ordered frames,
# so a real value could be drawn next to another incident's prediction.
is_selected = X_test["IncidentNumber"].isin(incident_ids)
df_viz_pred = df_ML[is_selected]
df_viz_reel = X_test[is_selected].assign(ResponseTimeMinute = y_test[is_selected])

# Build the chart; bar positions follow the number of incidents actually found
# in the test set rather than a hard-coded 5
barWidth = 0.4
x1 = range(len(df_viz_pred))
x2 = [r + barWidth for r in x1]

plt.bar(x1, df_viz_reel['ResponseTimeMinute'], color = 'green', label = 'Réel', width = barWidth)
plt.bar(x2, df_viz_pred['ResponseTimeMinute'], label = 'Prédit', width = barWidth)
plt.title('Visualisation des temps de réponse réels et prédits')
plt.ylabel('Temps de réponse (min)')
plt.xlabel('Incidents')
plt.legend();

# Save the figure
fig = plt.gcf()
fig.savefig("ModèleGBR.png", dpi = 300, bbox_inches='tight')