# Test de plusieur modèles et recherche du meilleur modèle pour interprétation des meilleurs "Features"

## Importation des libraires

In [1]:
%matplotlib inline
import pandas as pd 
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import warnings
pd.set_option("display.max_rows",100)  
pd.set_option("display.max_columns",None)
warnings.filterwarnings('ignore')

# check version number
import imblearn
print("imblearn version :", imblearn.__version__)

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

imblearn version : 0.8.0


In [2]:
from xgboost import XGBClassifier
import xgboost as xgb

## Création des fonctions

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline

def repartition_dataset(data,target,test_size=0.2,random_state=0,stratify_target=True,under_sampling=False,over_sampling=False):
    if stratify_target:
        train_set,test_set = train_test_split(data,test_size=test_size,random_state=random_state,stratify=data[target])
    else:
        train_set,test_set = train_test_split(data,test_size=test_size,random_state=random_state)
        
    # test set
    X_test = test_set.drop(target,axis=1)
    y_test = test_set[[target]]
    print("Test =>\nX :", X_test.shape, "\ny :",y_test.shape)
    if under_sampling :
        train_set = fair_data(train_set,target)
    X_train = train_set.drop(target,axis=1)
    y_train = train_set[[target]]
    
    if over_sampling :
        over = SMOTE(sampling_strategy=0.1)
        under = RandomUnderSampler(sampling_strategy=0.5)
        steps = [('o', over), ('u', under)]
        pipeline = Pipeline(steps=steps)
        X_train, y_train = pipeline.fit_resample(X_train, y_train)
    
    print("Train =>\nX :", X_train.shape, "\ny :",y_train.shape)
    
    return X_train,X_test,y_train,y_test

# Fonctionne avec des données binaire, méthode de downsample ou undersampling
def fair_data(data,target,random_state=0):
    positif_data = data[data[target]==1]
    negatif_data = data[data[target]==0]
    min_size = min([positif_data.shape[0],negatif_data.shape[0]])
    return pd.concat([positif_data.sample(min_size,random_state=random_state),
                      negatif_data.sample(min_size,random_state=random_state)], axis=0)

In [4]:
from sklearn.metrics import make_scorer, fbeta_score , precision_recall_curve, confusion_matrix, plot_roc_curve
from sklearn.model_selection import learning_curve

ftwo_scorer = make_scorer(fbeta_score, beta=2)

def plot_precision_recall_curve(y_pred_proba,y_test,figsize=(12,10)):
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)  
    plt.figure(figsize=figsize)
    plt.title("Precision-Recall vs Threshold Chart")
    plt.plot(thresholds, precision[: -1], "b--", label="Precision")
    plt.plot(thresholds, recall[: -1], "r--", label="Recall")
    plt.ylabel("Precision, Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="lower left")
    plt.ylim([0,1])
    plt.show()
    
def plot_learning_curve(model,X_train,y_train,scoring=ftwo_scorer,cv=4,size_train=None):
    if size_train is None:
        N, train_score, val_score = learning_curve(model, X_train, y_train,
                                                   cv=cv, scoring=scoring,
                                                   train_sizes=np.linspace(0.1, 1, 10))
    else:
        N, train_score, val_score = learning_curve(model, X_train[:size_train], y_train[:size_train],
                                                   cv=cv, scoring=scoring,
                                                   train_sizes=np.linspace(0.1, 1, 10))
        
    plt.figure(figsize=(12, 8))
    plt.plot(N, train_score.mean(axis=1), label='train score')
    plt.plot(N, val_score.mean(axis=1), label='validation score')
    plt.legend()
    plt.show()

def plot_all_roc_curve(all_model,X_test,y_test,figsize=(9,7),title_roc_curve="ROC Curve",naive_model=None):
    fig, ax = plt.subplots(figsize=figsize)
    model_displays = {}
    for name, pipeline in all_model.items():
        model_displays[name] = plot_roc_curve(pipeline, X_test, y_test, ax=ax, name=name)
        
    if not naive_model is None:
        model_displays["Naive"] = plot_roc_curve(naive_model, X_test, y_test, ax=ax, name="Naive")
        
    _ = ax.set_title(title_roc_curve)
        

def evaluation(model,X_test,y_test):
    y_pred = model.predict(X_test)
    print("Matrice confusion :\n",confusion_matrix(y_test,y_pred))
    score = ftwo_scorer(model,X_test,y_test)
    print("Score (fbeta 2):",score )
    return score
   

## Récupération du jeu de données

In [5]:
df_knowledge_domain = pd.read_feather("./final_dataset/knowledge_domain").drop("index",axis=1)
app_train = pd.read_feather("./final_dataset/clean_application_train").drop("index",axis=1)

In [6]:
df_knowledge_domain.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,APARTMENTS,BASEMENTAREA,YEARS_BEGINEXPLUATATION,YEARS_BUILD,COMMONAREA,ELEVATORS,ENTRANCES,FLOORSMAX,FLOORSMIN,LANDAREA,LIVINGAPARTMENTS,LIVINGAREA,NONLIVINGAPARTMENTS,NONLIVINGAREA,TOTALAREA,FLAG_WORK,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,STABILITY_WORK,NB_YEAR_CREDIT,REST_TO_LIVE_RATE,REST_TO_LIVE,COLLECTION_CAR
0,1,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637.0,-3648.0,-2120,0.0,1,1,0,1,1,0,1.0,2,2,10,0,0,0,0,0,0,0.083037,0.262949,0.139376,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.024967,0.037367,0.9722,0.625867,0.014367,0.0,0.069,0.0833,0.125,0.037367,0.0209,0.019367,0.0,0.0,0.108467,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0.220339,16.461104,0.878022,177799.5,0
1,0,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188.0,-1186.0,-291,0.0,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311267,0.622246,0.535276,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.095033,0.0532,0.9851,0.799567,0.057,0.0802,0.0345,0.2917,0.3333,0.013,0.078333,0.055367,0.0026,0.0066,0.185167,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0.116528,36.234085,0.867783,234301.5,0
2,0,1,1,1,1,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225.0,-4260.0,-2531,26.0,1,1,1,1,1,0,1.0,2,2,9,0,0,0,0,0,0,0.505892,0.555912,0.729567,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.116453,0.08795,0.977512,0.755932,0.043875,0.077081,0.148042,0.224723,0.230427,0.066145,0.102732,0.107254,0.008471,0.027801,0.333026,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.018035,20.0,0.9,60750.0,0
3,0,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039.0,-9833.0,-2437,0.0,1,1,0,1,0,0,2.0,2,2,17,0,0,0,0,0,0,0.505892,0.650442,0.535276,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.116453,0.08795,0.977512,0.755932,0.043875,0.077081,0.148042,0.224723,0.230427,0.066145,0.102732,0.107254,0.008471,0.027801,0.333026,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.244391,10.532818,0.7801,105313.5,0
4,0,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038.0,-4311.0,-3458,0.0,1,1,0,1,0,0,1.0,2,2,11,0,0,0,0,1,1,0.505892,0.322738,0.535276,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.116453,0.08795,0.977512,0.755932,0.043875,0.077081,0.148042,0.224723,0.230427,0.066145,0.102732,0.107254,0.008471,0.027801,0.333026,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.227361,23.461618,0.820037,99634.5,0


In [7]:
round(df_knowledge_domain["TARGET"].value_counts(normalize=True)*100,3)

0    91.925
1     8.075
Name: TARGET, dtype: float64

In [10]:
# X_train,X_test,y_train,y_test = repartition_dataset(new_little_dataset,"TARGET")
X_train,X_test,y_train,y_test = repartition_dataset(df_knowledge_domain,"TARGET")

Test =>
X : (61451, 217) 
y : (61451, 1)
Train =>
X : (245803, 217) 
y : (245803, 1)


In [11]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

def boruta_feature_selection(trainset, target, verbose=0,sample=None,random_state=0):
    
    # Création de la randomforest pour l'utilisation de boruta
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=verbose, random_state=random_state)
    # split train
    if sample!=None:
        trainset = trainset.sample(sample,random_state=random_state)
        
    X = trainset.drop(target,axis=1).values
    feature = np.array(trainset.drop("TARGET",axis=1).columns.tolist())
    y = trainset[["TARGET"]].values
    y = y.ravel()
    feat_selector.fit(X, y)
    
    return feat_selector , feature, X, y

In [12]:
feat_selector, feature, X, y = boruta_feature_selection(df_knowledge_domain,"TARGET")

In [35]:
feature_metier_not_encoded = ["CODE_GENDER",
                              "NAME_HOUSING_TYPE",
                              "NAME_EDUCATION_TYPE",
                              "NAME_FAMILY_STATUS",
                              "OCCUPATION_TYPE",
                              "ORGANIZATION_TYPE",
                              "NAME_INCOME_TYPE",
                              "EMERGENCYSTATE_MODE",
                              "DAYS_BIRTH",
                              "DAYS_EMPLOYED",
                              "OWN_CAR_AGE" ,
                              "EXT_SOURCE_1",
                              "EXT_SOURCE_2",
                              "EXT_SOURCE_3",
                              "DAYS_LAST_PHONE_CHANGE",
                              "AMT_INCOME_TOTAL",
                              "AMT_CREDIT",
                              "AMT_GOODS_PRICE",
                              "AMT_ANNUITY",
                              "DEF_30_CNT_SOCIAL_CIRCLE",
                              "DEF_60_CNT_SOCIAL_CIRCLE",
                              "OBS_30_CNT_SOCIAL_CIRCLE",
                              "OBS_60_CNT_SOCIAL_CIRCLE",
                              "REG_CITY_NOT_LIVE_CITY",
                              "REG_CITY_NOT_WORK_CITY",
                              "LIVE_CITY_NOT_WORK_CITY",
                              "STABILITY_WORK",
                              "NB_YEAR_CREDIT",
                              #"REST_TO_LIVE_RATE",
                              "REST_TO_LIVE", # Pour éviter les doublons...
                              "COLLECTION_CAR",
                              "FLAG_WORK"]

In [36]:
def get_all_feature_encoded(feature_encoded,feature_not_encoded):
    all_col_encoded = []
    for col in feature_not_encoded:
        for col_get in feature_encoded:
            if col_get.startswith(col):
                all_col_encoded.append(col_get)
    return all_col_encoded

In [39]:
feature_metier = get_all_feature_encoded([col for col in X_train.columns],feature_metier_not_encoded)

In [16]:
def transform_boruta(X,feat_selector,feature):
    return X[feature[feat_selector.support_]]

def transform_metier(X,feature_metier):
    return X[feature_metier]

In [40]:
print("Shape metier :",len(feature_metier))
np.array(feature_metier)

Shape metier : 129


array(['CODE_GENDER', 'NAME_HOUSING_TYPE_Co-op apartment',
       'NAME_HOUSING_TYPE_House / apartment',
       'NAME_HOUSING_TYPE_Municipal apartment',
       'NAME_HOUSING_TYPE_Office apartment',
       'NAME_HOUSING_TYPE_Rented apartment',
       'NAME_HOUSING_TYPE_With parents',
       'NAME_EDUCATION_TYPE_Academic degree',
       'NAME_EDUCATION_TYPE_Higher education',
       'NAME_EDUCATION_TYPE_Incomplete higher',
       'NAME_EDUCATION_TYPE_Lower secondary',
       'NAME_EDUCATION_TYPE_Secondary / secondary special',
       'NAME_FAMILY_STATUS_Civil marriage', 'NAME_FAMILY_STATUS_Married',
       'NAME_FAMILY_STATUS_Separated',
       'NAME_FAMILY_STATUS_Single / not married',
       'NAME_FAMILY_STATUS_Unknown', 'NAME_FAMILY_STATUS_Widow',
       'OCCUPATION_TYPE_Accountants', 'OCCUPATION_TYPE_Cleaning staff',
       'OCCUPATION_TYPE_Cooking staff', 'OCCUPATION_TYPE_Core staff',
       'OCCUPATION_TYPE_Drivers', 'OCCUPATION_TYPE_HR staff',
       'OCCUPATION_TYPE_High skill te

In [18]:
print("Shape boruta :",len(feature[feat_selector.support_]))
feature[feat_selector.support_]

Shape boruta : 67


array(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
       'FLAG_EMP_PHONE', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START',
       'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
       'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2',
       'EXT_SOURCE_3', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE',
       'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_6', 'APARTMENTS', 'BASEMENTAREA',
       'YEARS_BEGINEXPLUATATION', 'YEARS_BUILD', 'COMMONAREA',
       'ELEVATORS', 'ENTRANCES', 'FLOORSMAX', 'FLOORSMIN', 'LANDAREA',
       'LIVINGAPARTMENTS', 'LIVINGAREA', 'NONLIVINGAPARTMENTS',
       'NONLIVINGAREA', 'TOTALAREA', 'FLAG_WORK',
       'NAME_INCOME_TYPE_Pensioner', 'NAME_INCOME_TYPE_State 

# Baseline NAIVE (répartition uniforme aléatoire)

In [11]:
## NAIVE
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="uniform",random_state=42)

## Création d'un transformateur personnalisé

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Permet de diminuer le temps de traitement en appliquant le polynomial feature seulement sur les n_best
class Personal_transformer(BaseEstimator,TransformerMixin):

    def __init__(self,degree=2,include_biais=False,n_best=10,select_method=f_classif):
        self.degree=degree
        self.include_biais=include_biais
        self.n_best=n_best
        self.select_method = select_method
        self.polynomial_transformer=PolynomialFeatures(degree, include_bias=include_biais)
        self.selector = SelectKBest(select_method,k=n_best)
        self.feature_transform_ = []
        self.feature_selected_= []
        self.other_feature_ = []

    def fit(self,X,y=None):
        self.selector.fit(X,y)
        self.feature_selected_ = X.columns[self.selector.get_support()]
        self.other_feature_ = [col for col in X.columns if not col in self.feature_selected_]
        self.polynomial_transformer.fit(X[self.feature_selected_])
        self.feature_transform_ = self.polynomial_transformer.get_feature_names(self.feature_selected_)
        return self
        
    def transform(self,X,y=None):
        X_poly = self.polynomial_transformer.transform(X[self.feature_selected_])
        new_X = np.hstack([X_poly, X[self.other_feature_]])
        return new_X
    
    def fit_transform(self,X,y=None):
        self.selector.fit(X,y)
        self.feature_selected_ = X.columns[self.selector.get_support()]
        self.other_feature_ = [col for col in X.columns if not col in self.feature_selected_]
        X_poly = self.polynomial_transformer.fit_transform(X[self.feature_selected_])
        self.feature_transform_ = self.polynomial_transformer.get_feature_names(self.feature_selected_)
        new_X = np.hstack([X_poly, X[self.other_feature_]])
        return new_X
    
    def get_new_feature(self):
        new_feature = self.feature_transform_.copy()
        new_feature.extend(self.other_feature_.copy())
        return new_feature

# Création du tableau d'évaluation

In [13]:
from sklearn.metrics import recall_score, precision_score, accuracy_score, classification_report, f1_score
from sklearn.model_selection import cross_val_score

def grid_eval(all_model,X_test,y_test,naive=None,name_title=None,time_to_run_model=None,rounded=3):
    tab_row = ["recall","precision (label 1)","accuracy","fbeta_2-score"]
    if not time_to_run_model is None: 
        tab_row.append("Temps_d'entrainement")
    tab_col = []
    for key in all_model.keys():
        tab_col.append(key)
    
    if not naive is None:
        tab_col.append("Naive")
    
    all_result = pd.DataFrame(index=tab_row,columns=tab_col) 
    
    for key,model in all_model.items():
        # fill col
        all_score = [round(v,rounded) for v in compute_all_score(model,X_test,y_test)]
        if not time_to_run_model is None: 
            all_score.append(str(round(time_to_run_model[key]/60,rounded)) + "m")
        all_result[key] = np.array(all_score)
        
    if not naive is None:
        all_score = [round(v,rounded) for v in compute_all_score(naive,X_test,y_test)]
        all_score.append(None)
        all_result["Naive"] = np.array(all_score)
        
    if not name_title is None:
        print(name_title)
        
    return all_result

def compute_all_score(model,X_test,y_test):
    y_pred = model.predict(X_test)
    
    recall = recall_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    fbeta = ftwo_scorer(model,X_test,y_test)
    
    return recall, precision, accuracy, fbeta

def grid_eval_cv(all_model,X_train,y_train,naive=None,name_title=None,time_to_run_model=None,rounded=3,cv=5):
    tab_row = ["recall","precision","accuracy","fbeta_2-score"]
    if not time_to_run_model is None: 
        tab_row.append("Temps_d'entrainement")
    tab_col = []
    for key in all_model.keys():
        tab_col.append(key)
    
    if not naive is None:
        tab_col.append("Naive")
    
    all_result = pd.DataFrame(index=tab_row,columns=tab_col) 
    
    for key,model in all_model.items():
        # fill col
        all_score = [round(v,rounded) for v in compute_all_score_cv(model,X_test,y_test,cv)]
        if not time_to_run_model is None: 
            all_score.append(str(round(time_to_run_model[key]/60,rounded)) + "m")
        all_result[key] = np.array(all_score)
        
    if not naive is None:
        all_score = [round(v,rounded) for v in compute_all_score_cv(naive,X_test,y_test,cv)]
        all_score.append(None)
        all_result["Naive"] = np.array(all_score)
        
    if not name_title is None:
        print(name_title)
        
    return all_result

def compute_all_score_cv(model,X_train,y_train,cv):
    cross_val_score(model,X_train,y_train,cv=cv,scoring='recall')
    
    recall = cross_val_score(model,X_train,y_train,cv=cv,scoring='recall')
    precision = cross_val_score(model,X_train,y_train,cv=cv,scoring='average_precision')
    accuracy = cross_val_score(model,X_train,y_train,cv=cv,scoring='accuracy')
    fbeta = cross_val_score(model,X_train,y_train,cv=cv,scoring=ftwo_scorer)
    
    return recall, precision, accuracy, fbeta

# List des tests :

- **Sans over_sampling (car trop aléatoire dans notres contexte) et sans under_sampling ( car trop peu de données )  :**<br><br>
    - jeu de données initial<br>
            1) sans preprocessing (sauf standartisation)
            2) avec preprocessing perso
    - jeu de données boruta<br>
            3) sans preprocessing (sauf standartisation)
    - jeu de données métier<br>
            4) sans preprocessing (sauf standartisation)

On crée donc 2 pipeline avec et sans preprocessing pour les différents Tests

In [28]:
from sklearn.pipeline import make_pipeline

# preprocessor = make_pipeline(Personal_transformer())
preprocessor = make_pipeline(SelectKBest(),PolynomialFeatures())


from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Sans preprocessing
RandomForest_without_prep = make_pipeline(RandomForestClassifier(random_state=0,n_jobs=50))
AdaBoost_without_prep = make_pipeline(AdaBoostClassifier(random_state=0))
SVM_without_prep = make_pipeline(StandardScaler(), SVC(random_state=0,probability=True)) # Trop compliqué dans le contexte global (avec tous le jeu de données)
Logistic_regression_without_prep = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
KNN_without_prep = make_pipeline(StandardScaler(), KNeighborsClassifier())

Xgboost_without_prep = make_pipeline(XGBClassifier(n_jobs=50,random_state=0))

# Avec prerpocessing
RandomForest = make_pipeline(preprocessor, RandomForestClassifier(random_state=0,n_jobs=50))
AdaBoost= make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0,probability=True)) # Trop compliqué dans le contexte global (avec tous le jeu de données)
Logistic_regression = make_pipeline(preprocessor, StandardScaler(), LogisticRegression(random_state=0))
KNN = make_pipeline(preprocessor, StandardScaler(), KNeighborsClassifier())

Xgboost = make_pipeline(preprocessor,XGBClassifier(n_jobs=50,random_state=0))




dic_model_without_prep ={
    "RandomForest_no_prep":RandomForest_without_prep,
    "AdaBoost_no_prep":AdaBoost_without_prep,
    "SVM_no_prep":SVM_without_prep,
    "LogisticRegression_no_prep":Logistic_regression_without_prep,
    "KNN_no_prep":KNN_without_prep,
    "Xgboost_without_prep":Xgboost_without_prep
}

dic_model ={
    "RandomForest":RandomForest,
    "AdaBoost":AdaBoost,
    "SVM":SVM,
    "LogisticRegression":Logistic_regression,
    "KNN":KNN,
    "Xgboost":Xgboost
}

In [30]:
# All param
dic_params = {
    "RandomForest":{
        "randomforestclassifier__class_weight":['balanced'],
        "randomforestclassifier__max_depth": [5,10,20,50],
        "randomforestclassifier__n_estimators":[50,100,150,200,250,300],
        "randomforestclassifier__criterion":["gini","entropy"],
        
        "pipeline__polynomialfeatures__degree":[2],
        "pipeline__selectkbest__k":[5,10,20]
#         "pipeline__personal_transformer__degree":[2,3],
#         "pipeline__personal_transformer__n_best":[5,10,20]
    },
    "AdaBoost":{
        "adaboostclassifier__n_estimators":[50,100,150,200],
        "adaboostclassifier__learning_rate": [0.5,1.0,1.5,2.0],
        
        "pipeline__polynomialfeatures__degree":[2],
        "pipeline__selectkbest__k":[5,10,20]
#         "pipeline__personal_transformer__degree":[2,3],
#         "pipeline__personal_transformer__n_best":[5,10,20]
    },
    "SVM":{
        "svc__C":np.logspace(-4,0,10),
        "svc__gamma":np.logspace(-2,0,10),
        
        "pipeline__polynomialfeatures__degree":[2,3],
        "pipeline__selectkbest__k":[5,10,20]
#         "pipeline__personal_transformer__degree":[2,3],
#         "pipeline__personal_transformer__n_best":[5,10,20]
    },
    "LogisticRegression":{
        "logisticregression__C":np.logspace(-4, 0, 10),
        "logisticregression__class_weight":['balanced'],
        "logisticregression__penalty":["l1","l2"],
        
        "pipeline__polynomialfeatures__degree":[2],
        "pipeline__selectkbest__k":[5,10,20]
#         "pipeline__personal_transformer__degree":[2,3],
#         "pipeline__personal_transformer__n_best":[5,10,20]
    },
    "KNN":{
        "kneighborsclassifier__n_neighbors":np.arange(3, 23,2),
        
        "pipeline__polynomialfeatures__degree":[2],
        "pipeline__selectkbest__k":[5,10,20]
#         "pipeline__personal_transformer__degree":[2,3],
#         "pipeline__personal_transformer__n_best":[5,10,20]
    },
    "Xgboost":{
        "xgbclassifier__max_depth":[3,5,6,10],
        "xgbclassifier__gamma":[0,10],
        "xgbclassifier__n_estimators": [100,150,200],
        "xgbclassifier__scale_pos_weight":[1,10,20],
        "pipeline__polynomialfeatures__degree":[2],
        "pipeline__selectkbest__k":[5,10,20]
#         "pipeline__personal_transformer__degree":[2,3],
#         "pipeline__personal_transformer__n_best":[5,10,20]
    }
}

dic_params_without_prep = {
    "RandomForest_no_prep":{
        "randomforestclassifier__class_weight":['balanced'],
        "randomforestclassifier__max_depth": [5,10,20,50],
        "randomforestclassifier__n_estimators":[50,100,150,200,250,300],
        "randomforestclassifier__criterion":["gini","entropy"]
    },
    "AdaBoost_no_prep":{
        "adaboostclassifier__n_estimators":[50,100,150,200],
        "adaboostclassifier__learning_rate": [0.5,1.0,1.5,2.0],
    },
    "SVM_no_prep":{
        "svc__C":np.logspace(-4,0,10),
        "svc__gamma":np.logspace(-2,0,10)
    },
    "LogisticRegression_no_prep":{
        "logisticregression__C":np.logspace(-4, 0, 10),
        "logisticregression__class_weight":['balanced'],
        "logisticregression__penalty":["l1","l2"]
    },
    "KNN_no_prep":{
        "kneighborsclassifier__n_neighbors":np.arange(3, 20,2)
    },
    "Xgboost_without_prep":{
        "xgbclassifier__max_depth":[3,5,6,10],
        "xgbclassifier__gamma":[0,10],
        "xgbclassifier__n_estimators": [100,150,200],
        "xgbclassifier__scale_pos_weight":[1,10,20]
    }
}

# Actionneur
model_to_run = {
    "RandomForest":True,
    "AdaBoost":True,
    "SVM":False, # Pas adaptée sur des gros dataset...
    "LogisticRegression":True,
    "KNN":True,
    "Xgboost":True
    
}

model_to_run_without_prep = {
    "RandomForest_no_prep":True,
    "AdaBoost_no_prep":True,
    "SVM_no_prep":False, # Pas adaptée sur des gros dataset...
    "LogisticRegression_no_prep":True,
    "KNN_no_prep":True,
    "Xgboost_without_prep":True
}

In [16]:
from sklearn.model_selection import GridSearchCV
import time

def run_all_model(dic_model, dic_params, model_to_run,
                  X_train, X_test, y_train, y_test,
                  scoring=ftwo_scorer,cv=4,n_jobs_grid=20,
                 plot_learning_curve_model=False,plot_all_roc_curve_model=False,print_best_params=True, print_best_score=True):
    
    dic_best_model = {}
    time_to_run = {}
    
    for name, model in dic_model.items():
        if model_to_run[name]:
            if print_best_params or print_best_score:
                print(name)

            grid = GridSearchCV(model,dic_params[name],scoring=scoring,cv=cv,n_jobs=n_jobs_grid)
            t0 = time.time()
            grid.fit(X_train,y_train)
            time_learn = time.time() - t0
            time_to_run[name] = time_learn
            if print_best_params :
                print("Best params :\n",grid.best_params_)

            if print_best_score:
                print("Best score CV :",grid.best_score_)

            dic_best_model[name] = grid.best_estimator_
            if plot_learning_curve_model:
                plot_learning_curve(grid.best_estimator_,X_train,y_train)
    
    if plot_all_roc_curve_model:
        plot_all_roc_curve(dic_best_model,X_test,y_test)
        
    return dic_best_model, time_to_run

# Début des tests

## Jeu de données initial sans preprocessing (sauf standartisation)

In [33]:
%%time

dic_best_model_1, time_to_run_1 = run_all_model(dic_model_without_prep, dic_params_without_prep, model_to_run_without_prep,
                               X_train, X_test, y_train, y_test)

RandomForest_no_prep
Best params :
 {'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 10, 'randomforestclassifier__n_estimators': 250}
Best score CV : 0.3997477175552193
AdaBoost_no_prep
Best params :
 {'adaboostclassifier__learning_rate': 2.0, 'adaboostclassifier__n_estimators': 150}
Best score CV : 0.26400163419447625
LogisticRegression_no_prep
Best params :
 {'logisticregression__C': 0.005994842503189409, 'logisticregression__class_weight': 'balanced', 'logisticregression__penalty': 'l2'}
Best score CV : 0.4121548276333472
KNN_no_prep
Best params :
 {'kneighborsclassifier__n_neighbors': 3}
Best score CV : 0.04764561913170012
Xgboost_without_prep
Best params :
 {'xgbclassifier__gamma': 10, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 200, 'xgbclassifier__scale_pos_weight': 10}
Best score CV : 0.42547436029761193
Wall time: 1h 6min 14s


In [34]:
dummy_clf.fit(X_train,y_train)

grid_eval(dic_best_model_1,X_test,y_test,naive=dummy_clf,
          name_title="Jeu de données initial sans preprocessing",time_to_run_model=time_to_run_1)

Jeu de donnée initial sans preprocessing


Unnamed: 0,RandomForest_no_prep,AdaBoost_no_prep,LogisticRegression_no_prep,KNN_no_prep,Xgboost_without_prep,Naive
recall,0.617,0.0,0.671,0.039,0.635,0.498
precision (label 1),0.164,0.0,0.159,0.141,0.183,0.081
accuracy,0.715,0.919,0.687,0.903,0.742,0.5
fbeta_2-score,0.397,0.0,0.408,0.045,0.426,0.245
Temps_d'entrainement,10.577m,10.06m,0.947m,17.649m,27.015m,


## Jeu de données initial avec preprocessing

In [31]:
%%time

dic_best_model_2 ,time_to_run_2 = run_all_model(dic_model, dic_params, model_to_run,
                               X_train, X_test, y_train, y_test)

RandomForest
Best params :
 {'pipeline__polynomialfeatures__degree': 2, 'pipeline__selectkbest__k': 20, 'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 10, 'randomforestclassifier__n_estimators': 200}
Best score CV : 0.40059804387670517
AdaBoost
Best params :
 {'adaboostclassifier__learning_rate': 2.0, 'adaboostclassifier__n_estimators': 200, 'pipeline__polynomialfeatures__degree': 2, 'pipeline__selectkbest__k': 5}
Best score CV : 0.22007642597070912
LogisticRegression
Best params :
 {'logisticregression__C': 0.005994842503189409, 'logisticregression__class_weight': 'balanced', 'logisticregression__penalty': 'l2', 'pipeline__polynomialfeatures__degree': 2, 'pipeline__selectkbest__k': 20}
Best score CV : 0.40496678720363644
KNN
Best params :
 {'kneighborsclassifier__n_neighbors': 3, 'pipeline__polynomialfeatures__degree': 2, 'pipeline__selectkbest__k': 10}
Best score CV : 0.0865382487051069
Xgboost


In [32]:
dummy_clf.fit(X_train,y_train)

grid_eval(dic_best_model_2,X_test,y_test,naive=dummy_clf,
          name_title="Jeu de données initial avec preprocessing",time_to_run_model=time_to_run_2)

Jeu de donnée initial avec preprocessing


Unnamed: 0,RandomForest,AdaBoost,LogisticRegression,KNN,Xgboost,Naive
recall,0.623,0.652,0.673,0.077,0.619,0.498
precision (label 1),0.165,0.061,0.153,0.207,0.168,0.081
accuracy,0.715,0.164,0.673,0.902,0.721,0.5
fbeta_2-score,0.401,0.222,0.401,0.088,0.402,0.245
Temps_d'entrainement,29.245m,37.313m,2.183m,53.113m,69.925m,


## Jeu de données Boruta sans preprocessing

In [31]:
%%time

dic_best_model_3,time_to_run_3 = run_all_model(dic_model_without_prep, dic_params_without_prep, model_to_run_without_prep,
                               transform_boruta(X_train,feat_selector,feature), transform_boruta(X_test,feat_selector,feature),
                               y_train, y_test)

RandomForest_no_prep
Best params :
 {'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 10, 'randomforestclassifier__n_estimators': 250}
Best score CV : 0.40450886346302584
AdaBoost_no_prep
Best params :
 {'adaboostclassifier__learning_rate': 2.0, 'adaboostclassifier__n_estimators': 150}
Best score CV : 0.26400163419447625
LogisticRegression_no_prep
Best params :
 {'logisticregression__C': 0.12915496650148828, 'logisticregression__class_weight': 'balanced', 'logisticregression__penalty': 'l2'}
Best score CV : 0.410009628046132
KNN_no_prep
Best params :
 {'kneighborsclassifier__n_neighbors': 3}
Best score CV : 0.06101653577773626
Xgboost_without_prep
Best params :
 {'xgbclassifier__gamma': 0, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 150, 'xgbclassifier__scale_pos_weight': 10}
Best score CV : 0.4269468735238497
Wall time: 40min 23s


In [32]:
dummy_clf.fit(transform_boruta(X_train,feat_selector,feature),y_train)

grid_eval(dic_best_model_3,transform_boruta(X_test,feat_selector,feature),y_test,naive=dummy_clf,
          name_title="Jeu de données boruta sans preprocessing",time_to_run_model=time_to_run_3)

Jeu de donnée boruta sans preprocessing


Unnamed: 0,RandomForest_no_prep,AdaBoost_no_prep,LogisticRegression_no_prep,KNN_no_prep,Xgboost_without_prep,Naive
recall,0.627,0.0,0.666,0.049,0.636,0.498
precision (label 1),0.171,0.0,0.157,0.169,0.181,0.081
accuracy,0.724,0.919,0.685,0.904,0.738,0.5
fbeta_2-score,0.409,0.0,0.404,0.057,0.423,0.245
Temps_d'entrainement,6.607m,5.827m,0.403m,15.561m,11.998m,


## Jeu de données métier sans preprocessing

In [72]:
%%time

dic_best_model_5,time_to_run_5 = run_all_model(dic_model_without_prep, dic_params_without_prep, model_to_run_without_prep,
                               transform_metier(X_train,feature_metier), transform_metier(X_test,feature_metier),
                               y_train, y_test)

RandomForest_no_prep
Best params :
 {'randomforestclassifier__class_weight': 'balanced', 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 10, 'randomforestclassifier__n_estimators': 300}
Best score CV : 0.4049309614609038
AdaBoost_no_prep
Best params :
 {'adaboostclassifier__learning_rate': 2.0, 'adaboostclassifier__n_estimators': 150}
Best score CV : 0.26400163419447625
LogisticRegression_no_prep
Best params :
 {'logisticregression__C': 0.046415888336127774, 'logisticregression__class_weight': 'balanced', 'logisticregression__penalty': 'l2'}
Best score CV : 0.40855637901755226
KNN_no_prep
Best params :
 {'kneighborsclassifier__n_neighbors': 3}
Best score CV : 0.054278128451115876
Xgboost_without_prep
Best params :
 {'xgbclassifier__gamma': 0, 'xgbclassifier__max_depth': 3, 'xgbclassifier__n_estimators': 150, 'xgbclassifier__scale_pos_weight': 10}
Best score CV : 0.4273483798647609
Wall time: 47min 7s


In [73]:
dummy_clf.fit(transform_metier(X_train,feature_metier),y_train)

grid_eval(dic_best_model_5,transform_metier(X_test,feature_metier),y_test,naive=dummy_clf,
          name_title="Jeu de données métier sans preprocessing",time_to_run_model=time_to_run_5)

Jeu de donnée boruta avec preprocessing


Unnamed: 0,RandomForest_no_prep,AdaBoost_no_prep,LogisticRegression_no_prep,KNN_no_prep,Xgboost_without_prep,Naive
recall,0.633,0.0,0.668,0.048,0.637,0.498
precision (label 1),0.166,0.0,0.157,0.178,0.181,0.081
accuracy,0.713,0.919,0.685,0.905,0.738,0.5
fbeta_2-score,0.405,0.0,0.405,0.056,0.424,0.245
Temps_d'entrainement,8.014m,6.038m,0.579m,15.659m,16.841m,


# Synthèse 

On peut déjà éliminer la feature Selection avec le "SelectKBest()" et le "PolynomialFeature()" qui est trop gourmand en ressource et ne possèdent pas de score extraordinaire, avec le jeu de données initial sans preprocessing, le score est assez intéressant pour le XGboost cependant.

Sur l'ensemble des résultats, le modèle qui semble le plus pertinent est le RandomForest avec la feature_selection Boruta avec un Score $F_{\beta 2}$ Cross Validation de 0.4045 et un Score sur le jeu de données de test de 0.408, le XGBoost semble également être intéressant notamment avec le jeu de données métier avec un score de 0.424.

Pour la suite on prendra donc comme modèle gagnant le RandomForest car XGBoost a été implémenté seulement après.