In [63]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
from scipy.stats import randint


from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split


In [64]:
# Importer le DataFrame propre depuis le fichier CSV
df = pd.read_csv('data.csv')

df.isnull().sum()

Zip                   0
Bank               1506
NAICS                 0
Term                  0
NewExist           1162
CreateJob             0
RetainedJob           0
FranchiseCode         0
UrbanRural       322826
RevLineCr        277255
LowDoc           277255
MIS_Status            0
SBA_Appv              0
dtype: int64

In [65]:
# Remplacer les valeurs non numériques par 0 et le type des variables
df['NewExist'] = df['NewExist'].fillna(0)
df['UrbanRural'] = df['UrbanRural'].fillna(0)


# Convertir la colonne en type entier
df['NewExist'] = df['NewExist'].astype(int)
df['NewExist'].astype(int)

df['UrbanRural'] = df['UrbanRural'].astype(int)
df['UrbanRural'].astype(int)

# supprimer la colonne 'bank' trop complexifiante ? 
df = df.drop(['Bank' ], axis=1)
df

Unnamed: 0,Zip,NAICS,Term,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,SBA_Appv
0,47711,451120,84,2,0,0,0,0,N,N,0,48000.0
1,46526,722410,60,2,0,0,0,0,N,N,0,32000.0
2,47401,621210,180,1,0,0,0,0,N,N,0,215250.0
3,74012,0,60,1,0,0,0,0,N,N,0,28000.0
4,32801,0,240,1,7,7,0,0,N,N,0,229000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
897162,43221,451120,60,1,0,0,0,0,,,0,56000.0
897163,43221,451130,60,1,0,0,0,0,Y,Y,0,42500.0
897164,93455,332321,108,1,0,0,0,0,N,N,0,225000.0
897165,96830,0,60,1,0,0,0,0,N,N,1,60000.0


In [78]:

#traiter la colonen NAICS pour qu'elle ne contienne que les 2 premiers chiffres des valeurs NAICS
df['NAICS_digit'] = (df['NAICS'] / 10000 ).astype(int)
df = df.drop(['NAICS'], axis=1)
df.head(2)

Unnamed: 0,Term,NewExist,FranchiseCode,UrbanRural,RevLineCr,LowDoc,MIS_Status,SBA_Appv,NAICS_digit
0,84,2,0,0,N,N,0,48000.0,45
1,60,2,0,0,N,N,0,32000.0,72


#### Modélisation

##### 1. Regression logistique

In [None]:
# Séparation des caractéristiques et de la cible
X = df.drop(columns=['MIS_Status'])
y = df['MIS_Status']

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition des colonnes catégorielles et numériques
cat_cols = ['RevLineCr', 'LowDoc']
num_cols = ['Zip', 'NAICS_digit', 'Term', 'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'SBA_Appv']

# Création des transformateurs pour les colonnes catégorielles et numériques
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Prétraitement des données avec ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols),
        ('num', num_transformer, num_cols)
    ])

# Création du pipeline avec régression logistique
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


#### 1. RandomForest sans hyperparamètres + SimpleImputer

In [79]:
# Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [80]:
df.columns

Index(['Term', 'NewExist', 'FranchiseCode', 'UrbanRural', 'RevLineCr',
       'LowDoc', 'MIS_Status', 'SBA_Appv', 'NAICS_digit'],
      dtype='object')

In [81]:
df = df.drop(['CreateJob', 'RetainedJob', 'Zip'], axis=1)

KeyError: "['CreateJob', 'RetainedJob', 'Zip'] not found in axis"

In [82]:
#separer dataset en features et target
X = df.drop(['MIS_Status'], axis=1)
y = df['MIS_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.05, random_state=42, stratify= y)

cat_col = ['NAICS_digit', 'NewExist','RevLineCr', 'LowDoc', 'FranchiseCode', 'UrbanRural']
num_col = ['Term', 'SBA_Appv' ]

X

Unnamed: 0,Term,NewExist,FranchiseCode,UrbanRural,RevLineCr,LowDoc,SBA_Appv,NAICS_digit
0,84,2,0,0,N,N,48000.0,45
1,60,2,0,0,N,N,32000.0,72
2,180,1,0,0,N,N,215250.0,62
3,60,1,0,0,N,N,28000.0,0
4,240,1,0,0,N,N,229000.0,0
...,...,...,...,...,...,...,...,...
897162,60,1,0,0,,,56000.0,45
897163,60,1,0,0,Y,Y,42500.0,45
897164,108,1,0,0,N,N,225000.0,33
897165,60,1,0,0,N,N,60000.0,0


In [83]:
# Test RandomForest sans hyperparamètres 
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop= 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle RandomForest
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(max_depth=50, n_estimators=150, min_samples_split=10))])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# performance du modèle
score_tr = pipeline.score(X_train, y_train)
score_te = pipeline.score(X_test, y_test)

# Accéder à l'importance des caractéristiques
feature_importance = pipeline.named_steps['classifier'].feature_importances_

# Associer les importances aux noms des caractéristiques
feature_importance_dict = dict(zip(df.columns, feature_importance))

# Afficher les noms des caractéristiques et leurs importances
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")

print('_____________________')
print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)
print('_____________________')

#matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)
print('_____________________')

print("Métrique pour le modèle RandomForest Simple")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))

Term: 0.009092005422919016
NewExist: 0.0013369959023449979
FranchiseCode: 0.0005226630111282156
UrbanRural: 0.00019449096181287022
RevLineCr: 0.0017450560682858386
LowDoc: 0.00097106039127088
MIS_Status: 0.0010655448438308876
SBA_Appv: 0.0017110076739835609
NAICS_digit: 0.0017400070724943074
_____________________
Score du modèle (train) : 0.9565661709147397
Score du modèle (test) : 0.9277068146860162
_____________________
_____________________
Métrique pour le modèle RandomForest Simple
Score d'accuracy 0.9277068146860162
Score du recall :  0.7538715410002539
Score de la precision :  0.8199641032721248
Score F1 :  0.7855300575358772

              precision    recall  f1-score   support

           0       0.95      0.96      0.96     36981
           1       0.82      0.75      0.79      7878

    accuracy                           0.93     44859
   macro avg       0.88      0.86      0.87     44859
weighted avg       0.93      0.93      0.93     44859



#### 2.KNNImputer

In [55]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# création du pipeline : prétraitement + modèle RandomForest
model_rfk = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(class_weight='balanced',
                                                                 max_depth=60,
                                                                 min_samples_split=4,
                                                                 n_estimators=110
                                                                 ))])

model_rfk.fit(X_train, y_train)

y_pred = model_rfk.predict(X_test)

# performance du modèle
score_tr = model_rfk.score(X_train, y_train)
score_te = model_rfk.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

# matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle RandomForest+KNNImputer")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))



Score du modèle (train) : 0.8221370678205532
Score du modèle (test) : 0.8151318575982524
Métrique pour le modèle RandomForest+KNNImputer
Score d'accuracy 0.8151318575982524
Score du recall :  0.7580604214267581
Score de la precision :  0.4832106157456105
Score F1 :  0.5902060582102091

              precision    recall  f1-score   support

           0       0.94      0.83      0.88     36981
           1       0.48      0.76      0.59      7878

    accuracy                           0.82     44859
   macro avg       0.71      0.79      0.74     44859
weighted avg       0.86      0.82      0.83     44859



In [None]:

# Accéder aux importances des caractéristiques à partir du modèle RandomForest entraîné
feature_importance = model_rfk.named_steps['classifier'].feature_importances_

# Obtenir les noms des caractéristiques après la transformation
cat_encoder = model_rfk.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
feature_names = list(cat_encoder.get_feature_names_out(cat_col)) + num_col

# Associer les importances aux noms des caractéristiques
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Afficher les importances des caractéristiques
for feature, importance in feature_importance_dict.items():
    print(f"{feature}: {importance}")


##### * Hyperparamétres du modéle combiné au KNNImputer

* Random Forest
     - max_depth
     - min_sample_split
     - max_leaf_nodes
     - min_samples_leaf
     - n_estimators
     - max_sample (bootstrap sample)
     - max_features


In [84]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors = 3, weights = 'uniform')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# création du pipeline : prétraitement + modèle RandomForest
model_rfkp = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(class_weight='balanced', n_estimators=170, max_depth=50, min_samples_leaf=5, max_features='sqrt'))])

model_rfkp.fit(X_train, y_train)

# performance du modèle
score_tr = model_rfkp.score(X_train, y_train)
score_te = model_rfkp.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

# matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle RandomForest+KNNImputer")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))


Score du modèle (train) : 0.9184379355819727
Score du modèle (test) : 0.9015136316012394
Métrique pour le modèle RandomForest+KNNImputer
Score d'accuracy 0.9277068146860162
Score du recall :  0.7538715410002539
Score de la precision :  0.8199641032721248
Score F1 :  0.7855300575358772

              precision    recall  f1-score   support

           0       0.95      0.96      0.96     36981
           1       0.82      0.75      0.79      7878

    accuracy                           0.93     44859
   macro avg       0.88      0.86      0.87     44859
weighted avg       0.93      0.93      0.93     44859



#### 3.Iterative Imputer

In [33]:
#on conserve les meme hyperparamètres du RandomForest précédent mais on change la méthode d'imputation avec iterative imputer

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop= 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer (max_iter=20, random_state=0)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# création du pipeline : prétraitement + modèle RandomForest
model_rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(n_estimators= 150, max_depth=45, min_samples_leaf=6, max_features='sqrt'))])

model_rf.fit(X_train, y_train)

y_pred = model_rf.predict(X_test)


# performance du modèle
score_tr = model_rf.score(X_train, y_train)
score_te = model_rf.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

# matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le RandomForestClassifieur avec hyperparamètres:")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))



Score du modèle (train) : 0.8981764808027145
Score du modèle (test) : 0.8956508170043915
Métrique pour le modèle AdaboostClassifierr
Score d'accuracy 0.8956508170043915
Score du recall :  0.46585427773546584
Score de la precision :  0.8858315230509293
Score F1 :  0.6105981199567423

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     36981
           1       0.89      0.47      0.61      7878

    accuracy                           0.90     44859
   macro avg       0.89      0.73      0.78     44859
weighted avg       0.89      0.90      0.88     44859



### Modélisation 

#### 1. Randomized search + RandomForest

In [9]:
#separer dataset en features et target
X = df.drop('MIS_Status', axis=1)
y = df['MIS_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.05, random_state=42, stratify= y)

cat_col = ['NAICS_digit', 'Zip', 'NewExist','CreateJob', 'RetainedJob', 'RevLineCr', 'LowDoc', 'FranchiseCode', 'UrbanRural', ]
num_col = ['Term', 'SBA_Appv' ]

In [56]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors = 3, weights = 'distance')),
    ('scaler', StandardScaler())
])

In [38]:
# random search cv
hyper_grid = {'classifier__max_depth':list(np.arange(60, 100, step=10)) + [30],
              'classifier__n_estimators':[100],
              'classifier__max_features':randint(1,7),
              'classifier__min_samples_leaf':randint(1,4),
              'classifier__min_samples_split':np.arange(2, 10, step=2)
          }


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle RandomForest
model_rdmz_rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])

In [None]:
random_cv = RandomizedSearchCV(estimator= pipeline,
                               param_distributions=hyper_grid,
                               cv = 3,
                               n_iter= 9,
                               scoring = 'accuracy',
                               n_jobs= None,
                               return_train_score = True,
                               random_state = 42)

random_cv.fit(X_train, y_train)
y_pred = model_rdmz_rf.predict(X_test)

# Les meilleurs paramètres
print("Meilleurs paramètres:", random_cv.best_params_)

# Performance du meilleur modèle trouvé
score_tr = random_cv.best_estimator_.score(X_train, y_train)
score_te = random_cv.best_estimator_.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)
print(classification_report(y_test, y_pred))

RandomizedSearchCV donne les résultats suivants:
Meilleurs paramètres: {'classifier__max_depth': 70, 'classifier__max_features': 4, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 6, 'classifier__n_estimators': 100}
Score du modèle (train) : 0.824524119140504
Score du modèle (test) : 0.8231104472953844

On peut restreindre la recherche d'hyperparamètres à des valeurs proches de ces résultats dans la gridsearchcv.


##### 2. GridSearch sur les hyperparamètres du RandomForest

In [65]:
# GridSearchCv
params = {'classifier__max_depth': [60, 70, 80],
              'classifier__n_estimators':[90, 100, 110],
              'classifier__max_features': [3, 4, 5],
              'classifier__min_samples_leaf':[1,2],
              'classifier__min_samples_split': [5,6,7]
          }


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle RandomForest
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])

In [66]:
grid = GridSearchCV(pipeline, param_grid = params, scoring = 'accuracy', cv = 4)

grid.fit(X_train, y_train)

print("Meilleurs paramètres de GridSearch:", grid.best_params_)

# Performance du meilleur modèle trouvé
score_tr = grid.best_estimator_.score(X_train, y_train)
score_te = grid.best_estimator_.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)


La gridsearch a été interrompue malheureusement car elle a nécéssité énormément de ressources informatiques pour etre calculée.

#### Boosting

##### 1.Adaboost

In [87]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_sample_weight

'''#separer dataset en features et target
X = df.drop('MIS_Status', axis=1)
y = df['MIS_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.05, random_state=42, stratify= y)

cat_col = ['NAICS_digit', 'Zip', 'NewExist','CreateJob', 'RetainedJob', 'RevLineCr', 'LowDoc', 'FranchiseCode', 'UrbanRural', ]
num_col = ['Term', 'SBA_Appv' ]'''

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop= 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors = 3, weights='distance')),
    ('scaler', StandardScaler()),
])

In [88]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle RandomForest
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', AdaBoostClassifier())])

sample_weights = compute_sample_weight(class_weight ='balanced', y= y_train)

pipeline.fit(X_train, y_train, classifier__sample_weight = sample_weights)
y_pred = pipeline.predict(X_test)

# performance du modèle
score_tr = pipeline.score(X_train, y_train)
score_te = pipeline.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

#matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle AdaboostClassifierr")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))

Score du modèle (train) : 0.8527363347522258
Score du modèle (test) : 0.851668561492677
Métrique pour le modèle AdaboostClassifierr
Score d'accuracy 0.851668561492677
Score du recall :  0.8717948717948718
Score de la precision :  0.5489130434782609
Score F1 :  0.6736635605689063

              precision    recall  f1-score   support

           0       0.97      0.85      0.90     36981
           1       0.55      0.87      0.67      7878

    accuracy                           0.85     44859
   macro avg       0.76      0.86      0.79     44859
weighted avg       0.90      0.85      0.86     44859



#### XGBoost

In [89]:
from xgboost import XGBClassifier

weight_ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle XGBoost
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', XGBClassifier(scale_pos_weight=weight_ratio, random_state=42))])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# performance du modèle
score_tr = pipeline.score(X_train, y_train)
score_te = pipeline.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

#matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle XGBClassifier")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))


Score du modèle (train) : 0.9032169121960606
Score du modèle (test) : 0.8999086025100872
Métrique pour le modèle XGBClassifier
Score d'accuracy 0.8999086025100872
Score du recall :  0.900990099009901
Score de la precision :  0.6567357512953368
Score F1 :  0.7597131542331158

              precision    recall  f1-score   support

           0       0.98      0.90      0.94     36981
           1       0.66      0.90      0.76      7878

    accuracy                           0.90     44859
   macro avg       0.82      0.90      0.85     44859
weighted avg       0.92      0.90      0.91     44859



Rappel de l'encodage pour la variable MIS_Status: {'P I F': 0, 'CHGOFF': 1}



In [90]:
from catboost import CatBoostClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import FeatureUnion

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'if_binary'))
])

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_col),
        ('num', num_transformer, num_col)
    ])

# Création du pipeline : prétraitement + modèle CatBoost
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', CatBoostClassifier(random_state=42))])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# performance du modèle
score_tr = pipeline.score(X_train, y_train)
score_te = pipeline.score(X_test, y_test)

print("Score du modèle (train) :", score_tr)
print("Score du modèle (test) :", score_te)

#matrice confusion
conf_matrix = confusion_matrix(y_test, y_pred)

print("Métrique pour le modèle XGBClassifier")
print("Score d'accuracy", accuracy_score(y_test, y_pred))
print("Score du recall : ", recall_score(y_test, y_pred))
print("Score de la precision : ", precision_score(y_test, y_pred))
print("Score F1 : ", f1_score(y_test, y_pred))
print("")
print(classification_report(y_test, y_pred))


Learning rate set to 0.18378
0:	learn: 0.5201657	total: 217ms	remaining: 3m 36s
1:	learn: 0.4233822	total: 334ms	remaining: 2m 46s
2:	learn: 0.3612597	total: 447ms	remaining: 2m 28s
3:	learn: 0.3303949	total: 577ms	remaining: 2m 23s
4:	learn: 0.3081452	total: 716ms	remaining: 2m 22s
5:	learn: 0.2928329	total: 835ms	remaining: 2m 18s
6:	learn: 0.2746128	total: 994ms	remaining: 2m 20s
7:	learn: 0.2673302	total: 1.13s	remaining: 2m 20s
8:	learn: 0.2596492	total: 1.28s	remaining: 2m 21s
9:	learn: 0.2549779	total: 1.45s	remaining: 2m 23s
10:	learn: 0.2513641	total: 1.58s	remaining: 2m 22s
11:	learn: 0.2442544	total: 1.72s	remaining: 2m 21s
12:	learn: 0.2411490	total: 1.83s	remaining: 2m 19s
13:	learn: 0.2389480	total: 1.94s	remaining: 2m 16s
14:	learn: 0.2364985	total: 2.09s	remaining: 2m 17s
15:	learn: 0.2344247	total: 2.24s	remaining: 2m 17s
16:	learn: 0.2323947	total: 2.39s	remaining: 2m 18s
17:	learn: 0.2306159	total: 2.53s	remaining: 2m 17s
18:	learn: 0.2287855	total: 2.63s	remaining: 

#### Stacking/Voting

#### Sauvegarder le modèle

### Hyperparamétres du modéle

* Random Forest
     - max_depth
     - min_sample_split
     - max_leaf_nodes
     - min_samples_leaf
     - n_estimators
     - max_sample (bootstrap sample)
     - max_features


Pour sauvegarder le pipeline dans un fichier pickle après l'avoir entraîné et évalué, il faut suivre les étapes suivantes :

    Importer le module pickle.

    Utiliser la fonction dump de pickle pour enregistrer votre pipeline dans un fichier pickle.

In [None]:
import pickle

#enregistrer le modéle ici 'pipeline' dans un fichier pickle

with open('pipeline_rf.pickle', 'wb') as f:
    pickle.dump(pipeline, f)

In [None]:
file_path = 'mon fichier'

