In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
from scipy.stats import randint

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split


In [2]:
# Importer le DataFrame propre depuis le fichier CSV
df = pd.read_csv('data.csv')

df.isnull().sum()

Zip                   0
Bank               1506
NAICS                 0
Term                  0
NewExist           1162
CreateJob             0
RetainedJob           0
FranchiseCode         0
UrbanRural       322826
RevLineCr        277255
LowDoc           277255
MIS_Status            0
SBA_Appv              0
dtype: int64

In [3]:
# Remplacer les valeurs non numériques par 0 et le type des variables
df['NewExist'] = df['NewExist'].fillna(0)
df['UrbanRural'] = df['UrbanRural'].fillna(0)


# Convertir la colonne en type entier
df['NewExist'] = df['NewExist'].astype(int)
df['NewExist'].astype(int)

df['UrbanRural'] = df['UrbanRural'].astype(int)
df['UrbanRural'].astype(int)

df.dtypes

Zip                int64
Bank              object
NAICS              int64
Term               int64
NewExist           int64
CreateJob          int64
RetainedJob        int64
FranchiseCode      int64
UrbanRural         int64
RevLineCr         object
LowDoc            object
MIS_Status         int64
SBA_Appv         float64
dtype: object

In [4]:

#traiter la colonen NAICS pour qu'elle ne contienne que les 2 premiers chiffres des valeurs NAICS
df['NAICS_digit'] = (df['NAICS'] / 10000 ).astype(int)
df = df.drop(['NAICS'], axis=1)
df.head(2)

Unnamed: 0,Zip,Bank,Term,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,SBA_Appv,NAICS_digit
0,47711,FIFTH THIRD BANK,84,2,0,0,0,0,N,N,0,48000.0,45
1,46526,1ST SOURCE BANK,60,2,0,0,0,0,N,N,0,32000.0,72


### Modélisation

#### 1. RandomForest sans hyperparamètres + SimpleImputer

In [5]:
# Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [6]:
df.columns

Index(['Zip', 'Bank', 'Term', 'NewExist', 'CreateJob', 'RetainedJob',
       'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc', 'MIS_Status',
       'SBA_Appv', 'NAICS_digit'],
      dtype='object')

In [19]:
#separer dataset en features et target
X = df.drop('MIS_Status', axis=1)
y = df['MIS_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.05, random_state=42, stratify= y)

cat_col = ['NAICS_digit', 'Zip', 'NewExist','CreateJob', 'RetainedJob', 'RevLineCr', 'LowDoc', 'FranchiseCode', 'UrbanRural']
num_col = ['Term', 'SBA_Appv' ]

In [13]:
# Test RandomForest sans hyperparamètres 
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop= 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle RandomForest
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# performance du modèle
score_tr = pipeline.score(X_train, y_train)
score_te = pipeline.score(X_test, y_test)

# Accéder à l'importance des caractéristiques
feature_importance = pipeline.named_steps['classifier'].feature_importances_

# Associer les importances aux noms des caractéristiques
feature_importance_dict = dict(zip(df.columns, feature_importance))

# Afficher les noms des caractéristiques et leurs importances
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")

print('_____________________')
print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)
print('_____________________')

#matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)
print('_____________________')

print("Métrique pour le modèle RandomForest Simple")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))

#### 2.KNNImputer

In [13]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors = 3, weights = 'uniform')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# création du pipeline : prétraitement + modèle RandomForest
model_rfk = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(class_weight='balanced'))])

model_rfk.fit(X_train, y_train)

y_pred = model_rfk.predict(X_test)

# performance du modèle
score_tr = model_rfk.score(X_train, y_train)
score_te = model_rfk.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

# matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle RandomForest+KNNImputer")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))



Score du modèle (train) : 0.9999777075892753
Score du modèle (test) : 0.9215542031699324
Métrique pour le modèle RandomForest+KNNImputer
Score d'accuracy 0.9215542031699324
Score du recall :  0.6474993653211475
Score de la precision :  0.8730104398425467
Score F1 :  0.7435318125501058

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     36981
           1       0.87      0.65      0.74      7878

    accuracy                           0.92     44859
   macro avg       0.90      0.81      0.85     44859
weighted avg       0.92      0.92      0.92     44859



In [18]:

# Accéder aux importances des caractéristiques à partir du modèle RandomForest entraîné
feature_importance = model_rfk.named_steps['classifier'].feature_importances_

# Obtenir les noms des caractéristiques après la transformation
cat_encoder = model_rfk.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
feature_names = list(cat_encoder.get_feature_names_out(cat_col)) + num_col

# Associer les importances aux noms des caractéristiques
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Afficher les importances des caractéristiques
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")


Bank_1ST & FARMERS BK HOLDING CO.: 1.0723016237228593e-08
Bank_1ST BANK: 5.472634057082375e-06
Bank_1ST BANK & TRUST: 2.323762107776273e-06
Bank_1ST BANK, A DIVISION OF: 1.7156083081689098e-05
Bank_1ST BANK, NATIONAL ASSOCIATION: 6.502713464652889e-06
Bank_1ST BK & TR CO: 1.6586167580887677e-05
Bank_1ST CAMERON STATE BANK: 3.6608799167654395e-06
Bank_1ST CAP. BK OF KENTUCKY: 1.1306002745308289e-05
Bank_1ST CAPITAL BANK: 1.1174745330433926e-07
Bank_1ST CENTENNIAL BANK: 3.2850684990445664e-05
Bank_1ST CHOICE BANK: 7.994129962809722e-09
Bank_1ST CIT. NATL BK OF UPPER SAN: 1.2165917102944185e-05
Bank_1ST COLONIAL COMMUNITY BANK: 9.956806727850345e-08
Bank_1ST COMMERCIAL CAPITAL: 2.5444248584737676e-05
Bank_1ST COMMONWEALTH BK OF PRESTO: 3.2680374602021336e-06
Bank_1ST COMMUNITY BANK: 7.241551251786935e-08
Bank_1ST COMMUNITY FCU: 6.312817769840416e-08
Bank_1ST CONSTITUTION BANK: 4.808649040767954e-05
Bank_1ST FARMERS NATL BK OF WAURIK: 3.672153309321216e-06
Bank_1ST FINANCIAL BANK USA: 1.16

##### * Hyperparamétres du modéle combiné au KNNImputer

* Random Forest
     - max_depth
     - min_sample_split
     - max_leaf_nodes
     - min_samples_leaf
     - n_estimators
     - max_sample (bootstrap sample)
     - max_features


In [20]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors = 3, weights = 'distance')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# création du pipeline : prétraitement + modèle RandomForest
model_rfkp = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(class_weight='balanced', n_estimators=80, max_depth=40, min_samples_leaf=5, max_features='sqrt'))])

model_rfkp.fit(X_train, y_train)

# performance du modèle
score_tr = model_rfkp.score(X_train, y_train)
score_te = model_rfkp.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

# matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle RandomForest+KNNImputer")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))




Score du modèle (train) : 0.7749768862899328
Score du modèle (test) : 0.7717737800664304
Métrique pour le modèle RandomForest+KNNImputer
Score d'accuracy 0.9215542031699324
Score du recall :  0.6474993653211475
Score de la precision :  0.8730104398425467
Score F1 :  0.7435318125501058

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     36981
           1       0.87      0.65      0.74      7878

    accuracy                           0.92     44859
   macro avg       0.90      0.81      0.85     44859
weighted avg       0.92      0.92      0.92     44859



#### 3.Iterative Imputer

In [33]:
#on conserve les meme hyperparamètres du RandomForest précédent mais on change la méthode d'imputation avec iterative imputer

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop= 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer (max_iter=10, random_state=0)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# création du pipeline : prétraitement + modèle RandomForest
model_rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(n_estimators= 80, max_depth=40, min_samples_leaf=5, max_features='sqrt'))])

model_rf.fit(X_train, y_train)

y_pred = model_rf.predict(X_test)


# performance du modèle
score_tr = model_rf.score(X_train, y_train)
score_te = model_rf.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

# matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le RandomForestClassifieur avec hyperparamètres:")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))



Score du modèle (train) : 0.8981764808027145
Score du modèle (test) : 0.8956508170043915
Métrique pour le modèle AdaboostClassifierr
Score d'accuracy 0.8956508170043915
Score du recall :  0.46585427773546584
Score de la precision :  0.8858315230509293
Score F1 :  0.6105981199567423

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     36981
           1       0.89      0.47      0.61      7878

    accuracy                           0.90     44859
   macro avg       0.89      0.73      0.78     44859
weighted avg       0.89      0.90      0.88     44859



### Modélisation 

#### 1. Randomized search + RandomForest

In [9]:
#separer dataset en features et target
X = df.drop('MIS_Status', axis=1)
y = df['MIS_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.05, random_state=42, stratify= y)

cat_col = ['Bank', 'NAICS_digit', 'Zip', 'NewExist','CreateJob', 'RetainedJob', 'RevLineCr', 'LowDoc', 'FranchiseCode', 'UrbanRural', ]
num_col = ['Term', 'SBA_Appv' ]

In [56]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors = 3, weights = 'uniform')),
    ('scaler', StandardScaler())
])

In [38]:
# random search cv
hyper_grid = {'classifier__max_depth':list(np.arange(60, 100, step=10)) + [30],
              'classifier__n_estimators':[100],
              'classifier__max_features':randint(1,7),
              'classifier__min_samples_leaf':randint(1,4),
              'classifier__min_samples_split':np.arange(2, 10, step=2)
          }


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle RandomForest
model_rdmz_rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])

In [None]:
random_cv = RandomizedSearchCV(estimator= pipeline,
                               param_distributions=hyper_grid,
                               cv = 3,
                               n_iter= 9,
                               scoring = 'accuracy',
                               n_jobs= None,
                               return_train_score = True,
                               random_state = 42)

random_cv.fit(X_train, y_train)
y_pred = model_rdmz_rf.predict(X_test)

# Les meilleurs paramètres
print("Meilleurs paramètres:", random_cv.best_params_)

# Performance du meilleur modèle trouvé
score_tr = random_cv.best_estimator_.score(X_train, y_train)
score_te = random_cv.best_estimator_.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)
print(classification_report(y_test, y_pred))

RandomizedSearchCV donne les résultats suivants:
Meilleurs paramètres: {'classifier__max_depth': 70, 'classifier__max_features': 4, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 6, 'classifier__n_estimators': 100}
Score du modèle (train) : 0.824524119140504
Score du modèle (test) : 0.8231104472953844

On peut restreindre la recherche d'hyperparamètres à des valeurs proches de ces résultats dans la gridsearchcv.


#####2. GridSearch sur les hyperparamètres du RandomForest

In [65]:
# GridSearchCv
params = {'classifier__max_depth': [60, 70, 80],
              'classifier__n_estimators':[90, 100, 110],
              'classifier__max_features': [3, 4, 5],
              'classifier__min_samples_leaf':[1,2],
              'classifier__min_samples_split': [5,6,7]
          }


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle RandomForest
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])

In [66]:
grid = GridSearchCV(pipeline, param_grid = params, scoring = 'accuracy', cv = 4)

grid.fit(X_train, y_train)

print("Meilleurs paramètres de GridSearch:", grid.best_params_)

# Performance du meilleur modèle trouvé
score_tr = grid.best_estimator_.score(X_train, y_train)
score_te = grid.best_estimator_.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)


La gridsearch a été interrompue malheureusement car elle a nécéssité énormément de ressources informatiques pour etre calculée.

#### Boosting

##### 1.Adaboost

In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report

#separer dataset en features et target
X = df.drop('MIS_Status', axis=1)
y = df['MIS_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.05, random_state=42, stratify= y)

cat_col = ['Bank', 'NAICS_digit', 'Zip', 'NewExist','CreateJob', 'RetainedJob', 'RevLineCr', 'LowDoc', 'FranchiseCode', 'UrbanRural', ]
num_col = ['Term', 'SBA_Appv' ]

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop= 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [39]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle RandomForest
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', AdaBoostClassifier(class_weight='balanced'))])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# performance du modèle
score_tr = pipeline.score(X_train, y_train)
score_te = pipeline.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

#matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle AdaboostClassifierr")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))

Score du modèle (train) : 0.8935711189547341
Score du modèle (test) : 0.894044606930682
Métrique pour le modèle XGBClassifier
Score d'accuracy 0.894044606930682
Score du recall :  0.5917454316320101
Score de la precision :  0.7562409405701401
Score F1 :  0.663956447963801

              precision    recall  f1-score   support

           0       0.92      0.96      0.94     73847
           1       0.76      0.59      0.66     15870

    accuracy                           0.89     89717
   macro avg       0.84      0.78      0.80     89717
weighted avg       0.89      0.89      0.89     89717



#### XGBoost

In [11]:
from xgboost import XGBClassifier

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors =2, weights = 'distance')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle XGBoost
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', XGBClassifier(random_state=42))])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# performance du modèle
score_tr = pipeline.score(X_train, y_train)
score_te = pipeline.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

#matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle XGBClassifier")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))




Score du modèle (train) : 0.930283418670246
Score du modèle (test) : 0.9302035266055864
Métrique pour le modèle XGBClassifier
Score d'accuracy 0.9302035266055864
Score du recall :  0.765041888804265
Score de la precision :  0.8248255097851376
Score F1 :  0.793809680605861

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     36981
           1       0.82      0.77      0.79      7878

    accuracy                           0.93     44859
   macro avg       0.89      0.87      0.88     44859
weighted avg       0.93      0.93      0.93     44859





Rappel de l'encodage pour la variable MIS_Status: {'P I F': 0, 'CHGOFF': 1}



In [9]:
from catboost import CatBoostClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import FeatureUnion

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors = 3, weights = 'distance')),
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle CatBoost
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', CatBoostClassifier(random_state=42))])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# performance du modèle
score_tr = pipeline.score(X_train, y_train)
score_te = pipeline.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

#matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle XGBClassifier")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))


Learning rate set to 0.18378
0:	learn: 0.5119929	total: 51ms	remaining: 51s
1:	learn: 0.4219011	total: 99.3ms	remaining: 49.5s
2:	learn: 0.3610536	total: 149ms	remaining: 49.4s
3:	learn: 0.3323786	total: 198ms	remaining: 49.2s
4:	learn: 0.2975453	total: 247ms	remaining: 49.2s
5:	learn: 0.2841054	total: 295ms	remaining: 48.8s
6:	learn: 0.2749412	total: 341ms	remaining: 48.4s
7:	learn: 0.2660886	total: 391ms	remaining: 48.5s
8:	learn: 0.2577706	total: 440ms	remaining: 48.4s
9:	learn: 0.2497189	total: 486ms	remaining: 48.2s
10:	learn: 0.2444246	total: 537ms	remaining: 48.3s
11:	learn: 0.2400550	total: 580ms	remaining: 47.8s
12:	learn: 0.2370570	total: 625ms	remaining: 47.5s
13:	learn: 0.2328601	total: 669ms	remaining: 47.2s
14:	learn: 0.2294426	total: 717ms	remaining: 47.1s
15:	learn: 0.2270420	total: 758ms	remaining: 46.6s
16:	learn: 0.2248794	total: 802ms	remaining: 46.4s
17:	learn: 0.2232294	total: 843ms	remaining: 46s
18:	learn: 0.2216881	total: 882ms	remaining: 45.5s
19:	learn: 0.219



Score du modèle (train) : 0.9344614857539763
Score du modèle (test) : 0.934060054838494
Métrique pour le modèle XGBClassifier
Score d'accuracy 0.934060054838494
Score du recall :  0.7885250063467886
Score de la precision :  0.8278251599147122
Score F1 :  0.8076973085424523

              precision    recall  f1-score   support

           0       0.96      0.97      0.96     36981
           1       0.83      0.79      0.81      7878

    accuracy                           0.93     44859
   macro avg       0.89      0.88      0.88     44859
weighted avg       0.93      0.93      0.93     44859





#### Stacking/Voting

#### Sauvegarder le modèle

Pour sauvegarder le pipeline dans un fichier pickle après l'avoir entraîné et évalué, il faut suivre les étapes suivantes :

    Importer le module pickle.

    Utiliser la fonction dump de pickle pour enregistrer votre pipeline dans un fichier pickle.

In [None]:
import pickle

#enregistrer le modéle ici 'pipeline' dans un fichier pickle

with open('pipeline_rf.pickle', 'wb') as f:
    pickle.dump(pipeline, f)

In [None]:
file_path = 'mon fichier'

