# P7 Machine Learning Phase 1

# 0 Chargement des bibliothèques et des données

##  0.1 Chargement des bibliothèques

In [1]:
# Import of pandas, numpy, pyplot and seaborn libraries

# Datasets analysis libraries 
import pandas as pd
import numpy as np

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

# math library
import math
import statistics

# warning management library
import warnings

# DOS like library
import os

# Date management
from datetime import *
from dateutil.relativedelta import *
from dateutil.parser import *

# ML
import sklearn

## 0.2 Settings

In [2]:
warnings.filterwarnings('ignore')
sns.set()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## 0.3 Chargement des données

In [3]:
path_transformed='/home/emma_sylvain/Formation_OpenClassrooms_DataScientist/Projet_7/transformed//'

In [4]:
csv_train1=path_transformed+'df_train1.csv'
df=pd.read_csv(csv_train1)
df

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY
0,1,0,202500.0,406597.5,24700.5,351000.0,2,2
1,0,0,270000.0,1293502.5,35698.5,1129500.0,1,1
2,0,0,67500.0,135000.0,6750.0,135000.0,2,2
3,0,0,135000.0,312682.5,29686.5,297000.0,2,2
4,0,0,121500.0,513000.0,21865.5,513000.0,2,2
...,...,...,...,...,...,...,...,...
307473,0,0,157500.0,254700.0,27558.0,225000.0,1,1
307474,0,0,72000.0,269550.0,12001.5,225000.0,2,2
307475,0,0,153000.0,677664.0,29979.0,585000.0,3,3
307476,1,0,171000.0,370107.0,20205.0,319500.0,2,2


In [5]:
df.columns

Index(['TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY'], dtype='object')

# 1 Machine Learning Phase 1

# 1.1 Utilisation de Stratified KFold pour préserver le nombre de classes par échantillon

In [6]:
from sklearn.model_selection import StratifiedKFold

In [7]:
skf=StratifiedKFold(n_splits=2)

In [8]:
select_features=['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY']

In [9]:
X=df[select_features]

In [10]:
len(X)

307478

In [11]:
X=X.reset_index(drop=True)

In [12]:
X

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY
0,0,202500.0,406597.5,24700.5,351000.0,2,2
1,0,270000.0,1293502.5,35698.5,1129500.0,1,1
2,0,67500.0,135000.0,6750.0,135000.0,2,2
3,0,135000.0,312682.5,29686.5,297000.0,2,2
4,0,121500.0,513000.0,21865.5,513000.0,2,2
...,...,...,...,...,...,...,...
307473,0,157500.0,254700.0,27558.0,225000.0,1,1
307474,0,72000.0,269550.0,12001.5,225000.0,2,2
307475,0,153000.0,677664.0,29979.0,585000.0,3,3
307476,0,171000.0,370107.0,20205.0,319500.0,2,2


In [13]:
y=df['TARGET']

In [14]:
len(y)

307478

In [15]:
y=y.reset_index(drop=True)

In [16]:
y

0         1
1         0
2         0
3         0
4         0
         ..
307473    0
307474    0
307475    0
307476    1
307477    0
Name: TARGET, Length: 307478, dtype: int64

In [17]:
X=np.array(X)

In [18]:
y=np.array(y)

In [19]:
for train_index, test_index in skf.split(X, y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

### Premier essai avec un RF Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier 

In [21]:
rfc=RandomForestClassifier(max_depth=10,random_state=0)

In [22]:
result=rfc.fit(X_train,y_train)

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
y_pred=rfc.predict(X_test)

In [25]:
accuracy=accuracy_score(y_test, y_pred)
accuracy

0.9192722731382408

In [26]:
score=rfc.score(X_test, y_test)
score

0.9192722731382408

In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
conf_mat=confusion_matrix(y_test,y_pred)
conf_mat

array([[141328,      1],
       [ 12410,      0]])

In [29]:
from sklearn.metrics import roc_auc_score, f1_score

In [30]:
auroc=roc_auc_score(y_test,y_pred)
auroc

0.4999964621556793

In [31]:
f1_sc=f1_score(y_test,y_pred)
f1_sc

0.0

### Le modèle trouve beaucoup de faux négatifs. Modèle noyé sous 0. Travailler sur imbalanced classes.

## 1.2 Essai avec une gridsearchCV (StratifiedKFolds utilisé) puis RF

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
grid=GridSearchCV(rfc,{},cv=5,scoring='roc_auc')

In [34]:
result=grid.fit(X,y)

In [35]:
yhat = grid.predict(X)
print("Accuracy: ",accuracy_score(y, yhat))

Accuracy:  0.9192917867294571


In [36]:
auroc=roc_auc_score(y,yhat)
auroc

0.5000805801772764

In [37]:
f1_sc=f1_score(y,yhat)
f1_sc

0.00032226877215597806

In [38]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,18.950782,3.063153,0.60575,0.035157,{},0.645796,0.632068,0.628966,0.633096,0.63666,0.635317,0.005787,1


## 1.3 Essai avec Gradient boosting Classifier

In [39]:
from sklearn.ensemble import GradientBoostingClassifier

In [40]:
gbc=GradientBoostingClassifier(max_features='auto')

In [41]:
grid=GridSearchCV(gbc,{},cv=5, scoring='roc_auc')

In [42]:
result=grid.fit(X,y)

In [43]:
yhat = grid.predict(X)
print("Accuracy: ",accuracy_score(y, yhat))

Accuracy:  0.9192982912598625


In [44]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,28.763265,1.769133,0.092571,0.000723,{},0.643773,0.629634,0.630434,0.633232,0.633352,0.634085,0.005065,1


In [45]:
auroc=roc_auc_score(y,yhat)
auroc

0.5001208702659146

In [46]:
f1_sc=f1_score(y,yhat)
f1_sc

0.00048336421493595425

### Dans les deux cas la précision n'est pas meilleure que si nous avions attribué toutes les targets à 0, vu que le ratio des classes est de 0.08

### Tester à la main l'équilibre des classes (10-30% des données), équivalent d'un under simplang à la main, avec concat.

### Utiliser d'autres métriques et feature engineering du kaggle (voir slack). Tutorials OC (voir slack Damien) : smot, faire pipeline avec smot, randomUndersampler (créé classes équilibrés) comme sampler (transformer les classes), scaler : quantiletransformer (contrebalance déséquilibre). A faire en dernier, hyperparam du modèle : class_weight (logisticregression par ex, et trouver le bonen équilibrage). Tester toutes ces solutions.

### Voir F1 et F2 score

### Tester LogisticRegression, demi regressor classifier, xgboost, lgbm

### choisir 10-15 features, puis kaggle prend toutes les features (onehotencoder), puis virer les corrélés. voir pipeline MLernia

## 1.4 Essai avec un quantileScaler pour contrebalancer les classes déséquilibrées

In [47]:
from sklearn.preprocessing import QuantileTransformer

In [48]:
qt=QuantileTransformer(random_state=0)

In [49]:
X_train_qt=qt.fit_transform(X_train, y_train)

In [50]:
X_train_qt.shape

(153739, 7)

In [51]:
y_train.shape

(153739,)

In [52]:
result=rfc.fit(X_train_qt,y_train)

In [53]:
X_test_qt=qt.transform(X_test)

In [54]:
score=rfc.score(X_test_qt, y_test)
score

0.9192722731382408

In [55]:
yhat=rfc.predict(X_test_qt)
conf_mat=confusion_matrix(y_test,yhat)
conf_mat

array([[141328,      1],
       [ 12410,      0]])

In [56]:
auroc=roc_auc_score(y_test,yhat)
auroc

0.4999964621556793

In [57]:
f1_sc=f1_score(y_test,yhat)
f1_sc

0.0

## 1.5 Essai using SMOTE

In [58]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [59]:
param_dict={'sampling' : [SMOTE(),RandomUnderSampler(), "passthrough"]}

In [60]:
from imblearn.pipeline import Pipeline
pipe = Pipeline([
        ('sampling', SMOTE()),
        ('classification', rfc)
    ])

grid = GridSearchCV(pipe, param_dict, scoring='roc_auc', n_jobs=-1, verbose=1, cv=5)
grid.fit(X, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


Traceback (most recent call last):
  File "/home/emma_sylvain/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/emma_sylvain/anaconda3/lib/python3.9/site-packages/imblearn/pipeline.py", line 281, in fit
    self._final_estimator.fit(Xt, yt, **fit_params)
  File "/home/emma_sylvain/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/home/emma_sylvain/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1046, in __call__
    while self.dispatch_one_batch(iterator):
  File "/home/emma_sylvain/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/emma_sylvain/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch,

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('sampling', SMOTE()),
                                       ('classification',
                                        RandomForestClassifier(max_depth=10,
                                                               random_state=0))]),
             n_jobs=-1,
             param_grid={'sampling': [SMOTE(), RandomUnderSampler(),
                                      'passthrough']},
             scoring='roc_auc', verbose=1)

In [61]:
yhat = grid.predict(X)
print("Accuracy: ",accuracy_score(y, yhat))

Accuracy:  0.9192917867294571


In [62]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sampling,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,89.785076,12.85095,0.656311,0.328413,SMOTE(),{'sampling': SMOTE()},,0.586096,0.586599,0.579922,0.594631,,,3
1,4.271603,0.16937,0.899639,0.06966,RandomUnderSampler(),{'sampling': RandomUnderSampler()},0.640945,0.629732,0.631119,0.6314,0.633666,0.633373,0.003991,2
2,33.200475,8.518376,0.801316,0.140915,passthrough,{'sampling': 'passthrough'},0.645796,0.632068,0.628966,0.633096,0.63666,0.635317,0.005787,1


In [63]:
f1_sc=f1_score(y,yhat)
f1_sc

0.00032226877215597806

### La randomforest gère sans scaler, à voir avec logistic classifier (viser 0.7-0.8)

### Essayer avec d'autres scalers robust, standard, quantile, etc Normalizer, MinMax

### Logistic classifier

In [64]:
from sklearn.linear_model import LogisticRegression
lrc=LogisticRegression()

In [65]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [66]:
X=df[select_features]
y=df['TARGET']

In [67]:
param_dict={'sampling' : [SMOTE(),RandomUnderSampler(), "passthrough"], 'scaler' : [RobustScaler(),MinMaxScaler(),
                                                                                   StandardScaler(),
                                                                                   "passthrough"]}

In [68]:
pipe = Pipeline([
        ('sampling', SMOTE()), ('scaler',StandardScaler()), 
        ('classification', lrc)
    ])

grid = GridSearchCV(pipe, param_dict, scoring='roc_auc', n_jobs=-1, verbose=1, cv=5)
grid.fit(X, y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('sampling', SMOTE()),
                                       ('scaler', StandardScaler()),
                                       ('classification',
                                        LogisticRegression())]),
             n_jobs=-1,
             param_grid={'sampling': [SMOTE(), RandomUnderSampler(),
                                      'passthrough'],
                         'scaler': [RobustScaler(), MinMaxScaler(),
                                    StandardScaler(), 'passthrough']},
             scoring='roc_auc', verbose=1)

In [69]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sampling,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.029972,0.129065,0.04039,0.010789,SMOTE(),RobustScaler(),"{'sampling': SMOTE(), 'scaler': RobustScaler()}",0.507548,0.517933,0.51132,0.509644,0.518871,0.513063,0.00453,10
1,3.202274,0.652043,0.035628,0.002147,SMOTE(),MinMaxScaler(),"{'sampling': SMOTE(), 'scaler': MinMaxScaler()}",0.505812,0.516739,0.508679,0.50737,0.516724,0.511065,0.004715,12
2,1.569249,0.121096,0.035016,0.001143,SMOTE(),StandardScaler(),"{'sampling': SMOTE(), 'scaler': StandardScaler()}",0.507463,0.518013,0.510569,0.509944,0.518317,0.512861,0.004455,11
3,1.5644,0.069846,0.031657,0.001667,SMOTE(),passthrough,"{'sampling': SMOTE(), 'scaler': 'passthrough'}",0.586787,0.57746,0.582005,0.5765,0.583294,0.581209,0.003803,7
4,0.264599,0.025564,0.033955,0.00226,RandomUnderSampler(),RobustScaler(),"{'sampling': RandomUnderSampler(), 'scaler': R...",0.60837,0.593206,0.597259,0.597729,0.601446,0.599602,0.005103,4
5,0.290882,0.032695,0.034485,0.001843,RandomUnderSampler(),MinMaxScaler(),"{'sampling': RandomUnderSampler(), 'scaler': M...",0.605223,0.589332,0.593125,0.596135,0.598842,0.596531,0.005375,6
6,0.225703,0.007934,0.03368,0.002181,RandomUnderSampler(),StandardScaler(),"{'sampling': RandomUnderSampler(), 'scaler': S...",0.608012,0.593381,0.597882,0.598366,0.601845,0.599897,0.004868,3
7,0.235161,0.003779,0.032797,0.00128,RandomUnderSampler(),passthrough,"{'sampling': RandomUnderSampler(), 'scaler': '...",0.586626,0.576943,0.581318,0.576247,0.582801,0.580787,0.003842,8
8,0.943821,0.138295,0.03505,0.001783,passthrough,RobustScaler(),"{'sampling': 'passthrough', 'scaler': RobustSc...",0.608045,0.593583,0.597865,0.598682,0.60179,0.599993,0.004804,2
9,1.668019,0.201151,0.035714,0.001722,passthrough,MinMaxScaler(),"{'sampling': 'passthrough', 'scaler': MinMaxSc...",0.607481,0.591505,0.595756,0.596607,0.600354,0.598341,0.005367,5


In [70]:
yhat = grid.predict(X)
f1_sc=f1_score(y,yhat)
f1_sc

8.056719303899451e-05

## 1.6 Equilibrage des classes à la main.

In [71]:
df_1=df[df['TARGET']==1]
len(df_1)

24820

In [72]:
df_0_tot=df[df['TARGET']==0]
df_0=df_0_tot[0:24820]

In [73]:
len(df_0)

24820

In [74]:
df_eq=pd.concat([df_1, df_0])
df_eq

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY
0,1,0,202500.0,406597.5,24700.5,351000.0,2,2
26,1,0,112500.0,979992.0,27076.5,702000.0,3,2
40,1,0,202500.0,1193580.0,35028.0,855000.0,2,2
42,1,0,135000.0,288873.0,16258.5,238500.0,3,3
81,1,0,81000.0,252000.0,14593.5,252000.0,2,2
...,...,...,...,...,...,...,...,...
26966,0,2,270000.0,550080.0,14638.5,360000.0,3,3
26967,0,1,67500.0,263340.0,7371.0,198000.0,2,2
26968,0,0,112500.0,630000.0,31500.0,630000.0,1,1
26970,0,1,292500.0,948816.0,30735.0,792000.0,2,2


### Random Forest Classifier

In [75]:
grid=GridSearchCV(rfc,{},scoring='roc_auc', n_jobs=-1, verbose=1, cv=5)

In [76]:
y_eq=df_eq['TARGET']

In [77]:
X_eq=df_eq[select_features]

In [78]:
grid.fit(X_eq, y_eq)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=RandomForestClassifier(max_depth=10, random_state=0),
             n_jobs=-1, param_grid={}, scoring='roc_auc', verbose=1)

In [79]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3.937799,0.724377,0.134238,0.019341,{},0.646418,0.643838,0.626518,0.633929,0.641844,0.638509,0.007305,1


In [81]:
yhat = grid.predict(X_eq)
f1_sc=f1_score(y_eq,yhat)
f1_sc

0.6227805341496886

### Logistic classifier

In [82]:
pipe = Pipeline([
        ('sampling', SMOTE()), ('scaler',StandardScaler()), 
        ('classification', lrc)
    ])

grid = GridSearchCV(pipe, param_dict, scoring='roc_auc', n_jobs=-1, verbose=1, cv=5)
grid.fit(X_eq, y_eq)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('sampling', SMOTE()),
                                       ('scaler', StandardScaler()),
                                       ('classification',
                                        LogisticRegression())]),
             n_jobs=-1,
             param_grid={'sampling': [SMOTE(), RandomUnderSampler(),
                                      'passthrough'],
                         'scaler': [RobustScaler(), MinMaxScaler(),
                                    StandardScaler(), 'passthrough']},
             scoring='roc_auc', verbose=1)

### scores.sort_values(rank_test_score)

In [83]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sampling,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.222941,0.046706,0.008763,0.000418,SMOTE(),RobustScaler(),"{'sampling': SMOTE(), 'scaler': RobustScaler()}",0.608507,0.599615,0.592734,0.601516,0.605644,0.601603,0.005415,4
1,0.159298,0.023562,0.008476,0.000347,SMOTE(),MinMaxScaler(),"{'sampling': SMOTE(), 'scaler': MinMaxScaler()}",0.606037,0.596971,0.590142,0.597661,0.603006,0.598764,0.005471,7
2,0.127089,0.013507,0.008639,0.000521,SMOTE(),StandardScaler(),"{'sampling': SMOTE(), 'scaler': StandardScaler()}",0.6085,0.599609,0.592749,0.60155,0.605672,0.601616,0.005413,1
3,0.127268,0.013268,0.0077,0.000274,SMOTE(),passthrough,"{'sampling': SMOTE(), 'scaler': 'passthrough'}",0.586521,0.582109,0.579801,0.575846,0.580139,0.580883,0.003475,10
4,0.182624,0.0318,0.008153,0.000285,RandomUnderSampler(),RobustScaler(),"{'sampling': RandomUnderSampler(), 'scaler': R...",0.608507,0.599615,0.592734,0.601516,0.605644,0.601603,0.005415,4
5,0.181777,0.0148,0.008284,0.000154,RandomUnderSampler(),MinMaxScaler(),"{'sampling': RandomUnderSampler(), 'scaler': M...",0.606037,0.596971,0.590142,0.597661,0.603006,0.598764,0.005471,7
6,0.132472,0.012574,0.008986,0.000983,RandomUnderSampler(),StandardScaler(),"{'sampling': RandomUnderSampler(), 'scaler': S...",0.6085,0.599609,0.592749,0.60155,0.605672,0.601616,0.005413,1
7,0.136477,0.015314,0.007796,0.000437,RandomUnderSampler(),passthrough,"{'sampling': RandomUnderSampler(), 'scaler': '...",0.586521,0.582109,0.579801,0.575846,0.580139,0.580883,0.003475,10
8,0.148627,0.023886,0.008085,9.6e-05,passthrough,RobustScaler(),"{'sampling': 'passthrough', 'scaler': RobustSc...",0.608507,0.599615,0.592734,0.601516,0.605644,0.601603,0.005415,4
9,0.141547,0.019761,0.008963,0.001202,passthrough,MinMaxScaler(),"{'sampling': 'passthrough', 'scaler': MinMaxSc...",0.606037,0.596971,0.590142,0.597661,0.603006,0.598764,0.005471,7


In [84]:
yhat = grid.predict(X)
f1_sc=f1_score(y,yhat)
f1_sc

0.17898908484357784

## 1.7 Utilisation de LGBM Classifier

In [119]:
from lightgbm import LGBMClassifier

lgbmc=LGBMClassifier()

clf = lgbmc.fit(X_train, y_train)

In [120]:
#Prediction
y_pred=clf.predict(X_test)
#convert into binary values
for i in range(0,len(y_pred)):
    if y_pred[i]>=.5:       # setting threshold to .5
        y_pred[i]=1
    else:  
        y_pred[i]=0

In [121]:
auroc=roc_auc_score(y_test,y_pred)
auroc

0.4999929243113586

In [122]:
f1_sc=f1_score(y_test,y_pred)
f1_sc

0.0

## 1.8 Dummy classifier

In [89]:
from sklearn.dummy import DummyClassifier

In [90]:
dmc=DummyClassifier()

In [91]:
pipe = Pipeline([
        ('sampling', SMOTE()), ('scaler',StandardScaler()), 
        ('classification', dmc)
    ])

grid = GridSearchCV(pipe, {}, scoring='roc_auc', n_jobs=-1, verbose=1, cv=5)
grid.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('sampling', SMOTE()),
                                       ('scaler', StandardScaler()),
                                       ('classification', DummyClassifier())]),
             n_jobs=-1, param_grid={}, scoring='roc_auc', verbose=1)

In [92]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.709504,0.164889,0.023288,0.006557,{},0.5,0.5,0.5,0.5,0.5,0.5,0.0,1


In [93]:
yhat = grid.predict(X)
f1_sc=f1_score(y,yhat)
f1_sc

0.0

## 1.9 Utilisation du paramètre class_weight

### Random Forest

In [94]:
rfc_cw=RandomForestClassifier(max_depth=10,random_state=0, class_weight="balanced")

In [95]:
grid=GridSearchCV(rfc_cw,{},scoring='roc_auc', n_jobs=-1, verbose=1, cv=5)

In [96]:
result=grid.fit(X,y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [97]:
yhat = grid.predict(X)
f1_sc=f1_score(y,yhat)
f1_sc

0.21722025559961838

In [98]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,41.481909,12.295042,0.834733,0.115325,{},0.64219,0.632161,0.631085,0.632732,0.636323,0.634898,0.004048,1


### Logistic regression classifier

In [99]:
lrc_cw=LogisticRegression(class_weight="balanced")

In [100]:
param_dict={'sampling' : [SMOTE(),RandomUnderSampler(), "passthrough"], 'scaler' : [RobustScaler(),MinMaxScaler(),
                                                                                   StandardScaler(),
                                                                                   "passthrough"]}

In [102]:
pipe = Pipeline([
        ('sampling', SMOTE()), ('scaler',StandardScaler()), 
        ('classification', lrc_cw)
    ])

grid = GridSearchCV(pipe, param_dict, scoring='roc_auc', n_jobs=-1, verbose=1, cv=5)
grid.fit(X, y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('sampling', SMOTE()),
                                       ('scaler', StandardScaler()),
                                       ('classification',
                                        LogisticRegression(class_weight='balanced'))]),
             n_jobs=-1,
             param_grid={'sampling': [SMOTE(), RandomUnderSampler(),
                                      'passthrough'],
                         'scaler': [RobustScaler(), MinMaxScaler(),
                                    StandardScaler(), 'passthrough']},
             scoring='roc_auc', verbose=1)

In [103]:
yhat = grid.predict(X)
f1_sc=f1_score(y,yhat)
f1_sc

0.1791409848434144

In [104]:
scores=pd.DataFrame(grid.cv_results_)
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sampling,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.296024,0.460755,0.03484,0.002142,SMOTE(),RobustScaler(),"{'sampling': SMOTE(), 'scaler': RobustScaler()}",0.508064,0.518648,0.511014,0.509826,0.518253,0.513161,0.004421,10
1,5.341968,0.849628,0.058066,0.026821,SMOTE(),MinMaxScaler(),"{'sampling': SMOTE(), 'scaler': MinMaxScaler()}",0.505761,0.516687,0.509061,0.507339,0.516727,0.511115,0.004684,12
2,1.99157,0.30498,0.043589,0.010761,SMOTE(),StandardScaler(),"{'sampling': SMOTE(), 'scaler': StandardScaler()}",0.508123,0.518372,0.510814,0.509541,0.518094,0.512989,0.004367,11
3,2.184909,0.167063,0.033805,0.004015,SMOTE(),passthrough,"{'sampling': SMOTE(), 'scaler': 'passthrough'}",0.586678,0.577528,0.582037,0.576345,0.583349,0.581187,0.003805,7
4,0.305335,0.041134,0.043255,0.015077,RandomUnderSampler(),RobustScaler(),"{'sampling': RandomUnderSampler(), 'scaler': R...",0.607477,0.593471,0.597859,0.598586,0.601362,0.599751,0.004619,4
5,0.315379,0.03301,0.039625,0.003743,RandomUnderSampler(),MinMaxScaler(),"{'sampling': RandomUnderSampler(), 'scaler': M...",0.604946,0.588939,0.593256,0.59672,0.599378,0.596648,0.005427,6
6,0.240428,0.018496,0.04272,0.013006,RandomUnderSampler(),StandardScaler(),"{'sampling': RandomUnderSampler(), 'scaler': S...",0.608318,0.593519,0.597345,0.598347,0.601965,0.599899,0.004996,1
7,0.249489,0.009168,0.03165,0.001063,RandomUnderSampler(),passthrough,"{'sampling': RandomUnderSampler(), 'scaler': '...",0.586521,0.576981,0.581186,0.575708,0.583115,0.580702,0.003968,8
8,1.088326,0.244485,0.035854,0.003661,passthrough,RobustScaler(),"{'sampling': 'passthrough', 'scaler': RobustSc...",0.607841,0.593411,0.597607,0.598582,0.601583,0.599805,0.004795,3
9,1.859342,0.493732,0.039475,0.004719,passthrough,MinMaxScaler(),"{'sampling': 'passthrough', 'scaler': MinMaxSc...",0.607952,0.592894,0.597008,0.597942,0.601286,0.599416,0.005037,5


### LGBM Classifier

In [115]:
from lightgbm import LGBMClassifier

lgbmc=LGBMClassifier(class_weight='balanced')

clf = lgbmc.fit(X_train, y_train)

In [116]:
#Prediction
y_pred=clf.predict(X_test)
#convert into binary values
for i in range(0,len(y_pred)):
    if y_pred[i]>=.5:       # setting threshold to .5
        y_pred[i]=1
    else:  
        y_pred[i]=0

In [117]:
auroc=roc_auc_score(y_test,y_pred)
auroc

0.5996054593732916

In [118]:
f1_sc=f1_score(y_test,y_pred)
f1_sc

0.19933218054975851

### Rajouter randomundersampler

### Essayer plusieurs sampler, scaler, pour différent modèle, faire df score et garder le meilleur modèle

### Essayer shap

In [124]:
# import shap

In [125]:
# import meilleur modèle dans un fichier avec pickle

In [126]:
# api : renommer notebook en .py, puis utiliser flask