Le but de ce notebook est de tester l'algorithme sur le dataset de test, qui a été réalisé pour la compétition Kaggle

In [14]:
import pandas as pd
import pickle

import pandas as pd
import numpy as np
import missingno
from IPython.display import display, Image
import plotly.express as px

from featuretools import selection

from pycaret.classification import *

# fix pour plotly express et Visual Studio Code
import plotly.io as pio
pio.renderers.default = "notebook_connected"

import matplotlib.pyplot as plt
import seaborn as sns

import re


import plotly.figure_factory as ff
import gc

# Garbage collection is to release memory when the object is no longer in use. This system destroys the unused object and reuses its memory slot for new objects. You can imagine this as a recycling system in computers.



import time
from contextlib import contextmanager

from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [15]:
# Load the application table of the Kaggle test set and display it.
df_test = pd.read_csv('data/application_test.csv')
df_test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,0,,,,,,
48741,456223,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


# Feature engineering

In [16]:
# HOME CREDIT DEFAULT RISK COMPETITION
# Most features are created by applying min, max, mean, sum and var functions to grouped tables. 
# Little feature selection is done and overfitting might be a problem since many features are related.
# The following key ideas were used:
# - Divide or subtract important features to get rates (like annuity and income)
# - In Bureau Data: create specific features for Active credits and Closed credits
# - In Previous Applications: create specific features for Approved and Refused applications
# - Modularity: one function for each table (except bureau_balance and application_test)
# - One-hot encoding for categorical features
# All tables are joined with the application DF using the SK_ID_CURR key (except bureau_balance).
# You can use LightGBM with KFold or Stratified KFold.

# Update 16/06/2018:
# - Added Payment Rate feature
# - Removed index from features
# - Use standard KFold CV (not stratified)


# TARGET value 0 means the loan was repaid, value 1 means the loan was not repaid.

@contextmanager
def timer(title):
    """Context manager that reports how long the wrapped block took.

    Once the body of the ``with`` statement has finished, prints
    "<title> - done in <N>s", where N is the elapsed wall-clock time in
    seconds formatted without decimals. Nothing is printed if the body
    raises, because the code after ``yield`` is then never reached.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df`` with ``pd.get_dummies``.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns of dtype ``object`` are encoded; other columns
        are passed through.
    nan_as_category : bool
        Forwarded to ``get_dummies`` as ``dummy_na``: when true, an extra
        indicator column is produced for missing values.

    Returns
    -------
    encoded : DataFrame
        The frame returned by ``get_dummies``.
    added : list
        Names of the columns present in ``encoded`` but not in ``df``.
    """
    names_before = list(df.columns)
    object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added = [name for name in encoded.columns if name not in names_before]
    return encoded, added

# Preprocess pour application_train.csv /  application_test.csv
def application_train_test(df, num_rows = None, nan_as_category = False):
    """Clean one application table and add ratio features to it.

    Steps, in order:
      * drop the rows whose CODE_GENDER is 'XNA';
      * turn CODE_GENDER, FLAG_OWN_CAR and FLAG_OWN_REALTY into integer codes;
      * one-hot encode the remaining object columns;
      * replace the DAYS_EMPLOYED value 365243 by NaN;
      * add DAYS_EMPLOYED_PERC, INCOME_CREDIT_PERC, INCOME_PER_PERSON and
        PAYMENT_RATE;
      * flip the sign of DAYS_BIRTH.

    Parameters
    ----------
    df : DataFrame
        Application table (it is not read from disk here). The caller's frame
        is not modified: the function works on a copy.
    num_rows : unused, kept for signature compatibility.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    DataFrame
        The transformed copy.
    """
    # Explicit copy: the filtered frame would otherwise be a slice of the
    # caller's frame and the column assignments below would raise
    # SettingWithCopyWarning.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    # Two-valued categorical columns (e.g. Y/N, M/F) become integer codes.
    # NOTE(review): pd.factorize numbers values by order of first appearance,
    # so the code given to each value depends on the row order of `df` --
    # confirm this matches the encoding used when the model was trained.
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature])

    # One-hot encode the remaining categorical (object) columns.
    df, _ = one_hot_encoder(df, nan_as_category)

    # 365243 in DAYS_EMPLOYED is treated as a missing value. Plain assignment
    # is used because a chained `df[col].replace(..., inplace=True)` does not
    # update the frame under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Ratio features.
    # Days employed relative to DAYS_BIRTH (computed before the sign flip below).
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    # NOTE(review): despite its name this column holds annuity / income. The
    # previous code first stored income / credit under this name and then
    # overwrote it with annuity / income, so only the latter ever reached the
    # output. The value is kept as is because downstream cells select features
    # by name -- confirm against the training notebook before renaming it
    # (e.g. to ANNUITY_INCOME_PERC).
    df['INCOME_CREDIT_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    # Income per family member.
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    # Annuity expressed as a fraction of the credit amount.
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    # DAYS_BIRTH sign is flipped so that the stored values become positive.
    df['DAYS_BIRTH'] = df['DAYS_BIRTH'] * (-1)

    gc.collect()
    return df

# Preprocess pour bureau.csv et bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    """Build credit-bureau features aggregated per SK_ID_CURR.

    Reads './data/bureau.csv' and './data/bureau_balance.csv' (at most
    ``num_rows`` rows of each when given), one-hot encodes their object
    columns, summarises the monthly balances per SK_ID_BUREAU and attaches
    that summary to the bureau rows. The bureau rows are then aggregated per
    SK_ID_CURR three times: over all rows (prefix BURO_), over rows where
    CREDIT_ACTIVE_Active == 1 (prefix ACTIVE_, numeric aggregations only) and
    over rows where CREDIT_ACTIVE_Closed == 1 (prefix CLOSED_, numeric
    aggregations only).

    ``nan_as_category`` is forwarded to ``one_hot_encoder``.
    Returns the aggregated DataFrame, one row per SK_ID_CURR.
    """
    def flat_names(prefix, columns):
        # (column, statistic) pairs -> "<prefix><column>_<STATISTIC>"
        return pd.Index([prefix + name + "_" + stat.upper() for name, stat in columns.tolist()])

    bureau = pd.read_csv('./data/bureau.csv', nrows = num_rows)
    bb = pd.read_csv('./data/bureau_balance.csv', nrows = num_rows)

    # One-hot encoding of both tables
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Monthly balances: summarise per SK_ID_BUREAU, then attach to bureau
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size'],
                       **{dummy: ['mean'] for dummy in bb_cat}}
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = flat_names('', bb_agg.columns)
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU').drop(columns=['SK_ID_BUREAU'])
    del bb, bb_agg
    gc.collect()

    # Column on the left, statistics computed on the right
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Every one-hot column is averaged
    cat_aggregations = {dummy: ['mean'] for dummy in bureau_cat}
    cat_aggregations.update({dummy + "_MEAN": ['mean'] for dummy in bb_cat})

    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = flat_names('BURO_', bureau_agg.columns)

    # Active, then closed credits: numeric aggregations only
    for flag_column, prefix in (('CREDIT_ACTIVE_Active', 'ACTIVE_'),
                                ('CREDIT_ACTIVE_Closed', 'CLOSED_')):
        subset_agg = bureau[bureau[flag_column] == 1].groupby('SK_ID_CURR').agg(num_aggregations)
        subset_agg.columns = flat_names(prefix, subset_agg.columns)
        bureau_agg = bureau_agg.join(subset_agg, how='left', on='SK_ID_CURR')
        del subset_agg
        gc.collect()

    del bureau
    gc.collect()
    return bureau_agg

# Preprocess previous_applications.csv
def previous_applications(num_rows = None, nan_as_category = True):
    """Build previous-application features aggregated per SK_ID_CURR.

    Reads './data/previous_application.csv' (at most ``num_rows`` rows when
    given), one-hot encodes its object columns, replaces the value 365243 by
    NaN in five DAYS_* columns, adds APP_CREDIT_PERC
    (AMT_APPLICATION / AMT_CREDIT) and aggregates per SK_ID_CURR three times:
    over all rows (prefix PREV_), over rows where
    NAME_CONTRACT_STATUS_Approved == 1 (prefix APPROVED_, numeric aggregations
    only) and over rows where NAME_CONTRACT_STATUS_Refused == 1 (prefix
    REFUSED_, numeric aggregations only).

    Parameters
    ----------
    num_rows : int or None
        Passed to ``read_csv`` as ``nrows``.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``. It used to be ignored (``True`` was
        hard-coded); the default keeps the previous behaviour.

    Returns
    -------
    DataFrame
        Aggregated features, one row per SK_ID_CURR.
    """
    prev = pd.read_csv('./data/previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)

    # 365243 is treated as a missing value in these columns. Plain assignment
    # is used because a chained `prev[col].replace(..., inplace=True)` does not
    # update the frame under pandas Copy-on-Write.
    day_columns = [
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    ]
    prev[day_columns] = prev[day_columns].replace(365243, np.nan)

    # Amount applied for relative to the amount granted
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

    # Column on the left, statistics computed on the right
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Every one-hot column is averaged
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

    # Approved, then refused applications: numeric aggregations only
    for status_column, prefix in (('NAME_CONTRACT_STATUS_Approved', 'APPROVED_'),
                                  ('NAME_CONTRACT_STATUS_Refused', 'REFUSED_')):
        subset_agg = prev[prev[status_column] == 1].groupby('SK_ID_CURR').agg(num_aggregations)
        subset_agg.columns = pd.Index([prefix + e[0] + "_" + e[1].upper() for e in subset_agg.columns.tolist()])
        prev_agg = prev_agg.join(subset_agg, how='left', on='SK_ID_CURR')
        del subset_agg

    del prev
    gc.collect()
    return prev_agg

# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    """Build POS-CASH balance features aggregated per SK_ID_CURR.

    Reads './data/POS_CASH_balance.csv' (at most ``num_rows`` rows when
    given), one-hot encodes its object columns and aggregates per SK_ID_CURR.
    Output columns are named 'POS_<column>_<STATISTIC>', plus 'POS_COUNT', the
    number of rows per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``read_csv`` as ``nrows``.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``. It used to be ignored (``True`` was
        hard-coded); the default keeps the previous behaviour.

    Returns
    -------
    DataFrame
        Aggregated features, one row per SK_ID_CURR.
    """
    pos = pd.read_csv('./data/POS_CASH_balance.csv', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category)

    # Column on the left, statistics computed on the right;
    # every one-hot column is averaged.
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean'],
        **{cat: ['mean'] for cat in cat_cols},
    }

    grouped = pos.groupby('SK_ID_CURR')
    pos_agg = grouped.agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Number of POS-CASH rows per SK_ID_CURR
    pos_agg['POS_COUNT'] = grouped.size()

    del pos, grouped
    gc.collect()
    return pos_agg
    
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    """Build installment-payment features aggregated per SK_ID_CURR.

    Reads './data/installments_payments.csv' (at most ``num_rows`` rows when
    given), one-hot encodes its object columns, derives four per-row features
    and aggregates everything per SK_ID_CURR:

      * PAYMENT_PERC = AMT_PAYMENT / AMT_INSTALMENT
      * PAYMENT_DIFF = AMT_INSTALMENT - AMT_PAYMENT
      * DPD = DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT, with every value that is
        not strictly positive (missing values included) set to 0
      * DBD = DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT, clamped the same way

    Output columns are named 'INSTAL_<column>_<STATISTIC>', plus
    'INSTAL_COUNT', the number of rows per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``read_csv`` as ``nrows``.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``. It used to be ignored (``True`` was
        hard-coded); the default keeps the previous behaviour.

    Returns
    -------
    DataFrame
        Aggregated features, one row per SK_ID_CURR.
    """
    ins = pd.read_csv('./data/installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)

    # Share and difference paid for each installment
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due / days before due. `where` keeps strictly positive values
    # and writes 0 everywhere else (NaN included, since NaN > 0 is False),
    # which is what the former row-wise lambda did, without a Python-level loop.
    days_past_due = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_before_due = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = days_past_due.where(days_past_due > 0, 0)
    ins['DBD'] = days_before_due.where(days_before_due > 0, 0)

    # Column on the left, statistics computed on the right;
    # every one-hot column is averaged.
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'],
        **{cat: ['mean'] for cat in cat_cols},
    }

    grouped = ins.groupby('SK_ID_CURR')
    ins_agg = grouped.agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Number of installment rows per SK_ID_CURR
    ins_agg['INSTAL_COUNT'] = grouped.size()

    del ins, grouped
    gc.collect()
    return ins_agg

# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    """Build credit-card balance features aggregated per SK_ID_CURR.

    Reads './data/credit_card_balance.csv' (at most ``num_rows`` rows when
    given), one-hot encodes its object columns, drops SK_ID_PREV and computes
    min, max, mean, sum and var of every remaining column per SK_ID_CURR.
    Output columns are named 'CC_<column>_<STATISTIC>', plus 'CC_COUNT', the
    number of rows per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None
        Passed to ``read_csv`` as ``nrows``.
    nan_as_category : bool
        Forwarded to ``one_hot_encoder``. It used to be ignored (``True`` was
        hard-coded); the default keeps the previous behaviour.

    Returns
    -------
    DataFrame
        Aggregated features, one row per SK_ID_CURR.
    """
    cc = pd.read_csv('./data/credit_card_balance.csv', nrows = num_rows)
    cc, cat_cols = one_hot_encoder(cc, nan_as_category)

    # SK_ID_PREV is an identifier, not a feature: exclude it from the aggregation
    cc = cc.drop(columns=['SK_ID_PREV'])

    grouped = cc.groupby('SK_ID_CURR')
    # Same five statistics for every column
    cc_agg = grouped.agg(['min', 'max', 'mean', 'sum', 'var'])
    # Flatten the (column, statistic) pairs into 'CC_<column>_<STATISTIC>'
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Number of credit-card rows per SK_ID_CURR
    cc_agg['CC_COUNT'] = grouped.size()

    del cc, grouped
    gc.collect()
    return cc_agg


def feature_engine(df, debug = False, ohe=True, class_weight=None):
    """Run the whole feature-engineering chain and write the result to CSV.

    The application frame is cleaned with ``application_train_test`` and then
    left-joined, on SK_ID_CURR, with the aggregated features of each auxiliary
    table (bureau, previous applications, POS-CASH, installments, credit
    card). Each stage prints the shape of the joined table and its duration.
    The final frame is written to 'test_kaggle/data_de_test_kaggle.csv'
    (without the index); the directory is created when it does not exist.

    Parameters
    ----------
    df : DataFrame
        Application table to enrich.
    debug : bool
        When true, each auxiliary builder is asked for at most 10000 rows.
        The application frame itself is not truncated
        (``application_train_test`` ignores ``num_rows``).
    ohe : unused, kept for signature compatibility.
    class_weight : unused, kept for signature compatibility. It used to select
        between two branches that wrote the same file with the same arguments.

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output directory

    num_rows = 10000 if debug else None
    df = application_train_test(df, num_rows)

    # (timer title, label printed before the shape, builder function)
    stages = [
        ("Process bureau and bureau_balance", "Bureau df shape:", bureau_and_balance),
        ("Process previous_applications", "Previous applications df shape:", previous_applications),
        ("Process POS-CASH balance", "Pos-cash balance df shape:", pos_cash),
        ("Process installments payments", "Installments payments df shape:", installments_payments),
        ("Process credit card balance", "Credit card balance df shape:", credit_card_balance),
    ]
    for title, shape_label, build_features in stages:
        with timer(title):
            features = build_features(num_rows)
            print(shape_label, features.shape)
            df = df.join(features, how='left', on='SK_ID_CURR')
            # Release the auxiliary table before building the next one
            del features
            gc.collect()

    # Create the output directory up front so that a missing folder does not
    # make the write fail after all the aggregation work has been done.
    os.makedirs('test_kaggle', exist_ok=True)
    df.to_csv('test_kaggle/data_de_test_kaggle.csv', index=False)
    



In [17]:
# Run the feature-engineering chain on the test applications. The function
# returns nothing: its result is written to 'test_kaggle/data_de_test_kaggle.csv'.
feature_engine(df_test)

Bureau df shape: (305811, 116)
Process bureau and bureau_balance - done in 15s
Previous applications df shape: (338857, 249)
Process previous_applications - done in 20s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - done in 10s
Installments payments df shape: (339587, 26)
Process installments payments - done in 23s
Credit card balance df shape: (103558, 141)
Process credit card balance - done in 13s


# Import data

In [18]:
# Reload the engineered features written by feature_engine. This rebinds
# `df_test`, which until now held the raw application table.
df_test = pd.read_csv('test_kaggle/data_de_test_kaggle.csv')
df_test

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,100001,0,0,0,0,135000.0,568800.0,20560.5,450000.0,0.018850,...,,,,,,,,,,
1,100005,1,0,0,0,99000.0,222768.0,17370.0,180000.0,0.035792,...,,,,,,,,,,
2,100013,1,1,0,0,202500.0,663264.0,69777.0,630000.0,0.019101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0
3,100028,0,0,0,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0
4,100038,1,1,1,1,180000.0,625500.0,32067.0,625500.0,0.010032,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,0,0,0,0,121500.0,412560.0,17473.5,270000.0,0.002042,...,,,,,,,,,,
48740,456222,0,0,1,2,157500.0,622413.0,31909.5,495000.0,0.035792,...,,,,,,,,,,
48741,456223,0,1,0,1,202500.0,315000.0,33205.5,315000.0,0.026392,...,,,,,,,,,,
48742,456224,1,0,1,0,225000.0,450000.0,25128.0,450000.0,0.018850,...,,,,,,,,,,


# Selection des features

In [19]:
# Load the feature-importance table (columns: feature, importance,
# importance_normalized, cumulative_importance) and display it.
df_importance = pd.read_csv('feature_importance/feature_importance_avg.csv')
df_importance

Unnamed: 0,feature,importance,importance_normalized,cumulative_importance
0,PAYMENT_RATE,1128.6,0.024368,0.024368
1,EXT_SOURCE_1,1005.5,0.021711,0.046079
2,EXT_SOURCE_3,960.6,0.020741,0.066820
3,EXT_SOURCE_2,913.9,0.019733,0.086553
4,DAYS_BIRTH,718.4,0.015512,0.102064
...,...,...,...,...
789,POS_NAME_CONTRACT_STATUS_Canceled_MEAN,0.0,0.000000,1.000000
790,CC_CNT_DRAWINGS_OTHER_CURRENT_SUM,0.0,0.000000,1.000000
791,POS_NAME_CONTRACT_STATUS_Demand_MEAN,0.0,0.000000,1.000000
792,CC_CNT_DRAWINGS_OTHER_CURRENT_MIN,0.0,0.000000,1.000000


In [20]:
# Number of top-ranked features kept as model inputs.
N_TOP_FEATURES = 40

# Names of the N_TOP_FEATURES features with the highest importance.
cols = df_importance.sort_values(by=["importance"], ascending=False)['feature'].unique()[:N_TOP_FEATURES]

# .copy() makes data_features an independent frame, so that adding columns to
# it later does not raise SettingWithCopyWarning.
data_features = df_test[cols].copy()

data_features

Unnamed: 0,PAYMENT_RATE,EXT_SOURCE_1,EXT_SOURCE_3,EXT_SOURCE_2,DAYS_BIRTH,AMT_ANNUITY,DAYS_EMPLOYED,APPROVED_CNT_PAYMENT_MEAN,DAYS_ID_PUBLISH,INCOME_CREDIT_PERC,...,ACTIVE_DAYS_CREDIT_UPDATE_MEAN,BURO_DAYS_CREDIT_MAX,BURO_DAYS_CREDIT_ENDDATE_MAX,INSTAL_AMT_PAYMENT_MIN,ACTIVE_DAYS_CREDIT_ENDDATE_MAX,ACTIVE_DAYS_CREDIT_MEAN,INSTAL_DBD_MAX,CLOSED_AMT_CREDIT_SUM_MEAN,BURO_AMT_CREDIT_SUM_DEBT_MEAN,ACTIVE_DAYS_CREDIT_ENDDATE_MEAN
0,0.036147,0.752614,0.159520,0.789654,19241,20560.5,-2329.0,8.000000,-812,0.152300,...,-10.666667,-49.0,1778.0,3951.000,1778.0,-309.333333,36.0,142335.000000,85240.928571,1030.333333
1,0.077973,0.564990,0.432962,0.291656,18064,17370.0,-4469.0,12.000000,-1623,0.175455,...,-21.000000,-62.0,1324.0,4813.200,1324.0,-99.500000,37.0,58500.000000,189469.500000,723.000000
2,0.105202,,0.610991,0.699787,20038,69777.0,-4458.0,17.333333,-3503,0.344578,...,,-1210.0,-567.0,6.165,,,38.0,518070.015000,0.000000,
3,0.031123,0.525734,0.612704,0.509677,13976,49018.5,-1866.0,11.333333,-4208,0.155614,...,-238.800000,-269.0,30885.0,1.170,30885.0,-1050.400000,19.0,93736.285714,18630.450000,7872.750000
4,0.051266,0.202145,,0.425687,13040,32067.0,-2191.0,12.000000,-4262,0.178150,...,,,,11097.450,,,18.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.042354,,0.643026,0.648575,19970,17473.5,-5169.0,24.000000,-3399,0.143815,...,-27.500000,-113.0,793.0,14222.430,793.0,-452.000000,8.0,93622.500000,111294.000000,644.000000
48740,0.051267,,,0.684596,11186,31909.5,-1149.0,17.500000,-3003,0.202600,...,,,,2.700,,,26.0,,,
48741,0.105414,0.733503,0.283712,0.632770,15922,33205.5,-3037.0,11.000000,-1504,0.163978,...,-45.000000,-218.0,-13.0,12640.950,,-229.000000,76.0,526440.375000,4268.700000,
48742,0.055840,0.373090,0.595456,0.445701,13968,25128.0,-2731.0,17.000000,-1364,0.111680,...,-22.500000,-515.0,980.0,5519.925,980.0,-1077.500000,15.0,152999.566364,52511.904000,652.200000


# Chargement de la pipeline

In [21]:
def cout_metier(y_test, pred_test_y):
    """Business score: weighted share of the confusion-matrix cells.

    Each cell of the binary confusion matrix (labels 0 and 1) is multiplied by
    a weight and the weighted sum is divided by the number of samples:
    true negatives count +1, false negatives count -10, false positives and
    true positives count 0. Higher is better.

    Parameters
    ----------
    y_test : array-like of true labels (0 or 1).
    pred_test_y : array-like of predicted labels (0 or 1).

    Returns
    -------
    float
        (tn - 10 * fn) / number of samples.
    """
    poids_tn = 1 # reward clients correctly predicted as able to repay their loan
    poids_fp = 0
    poids_fn = -10 # heavily penalise defaulting clients that the model fails to detect
    poids_tp = 0
    # labels=[0, 1] forces a 2x2 matrix even when only one class is present in
    # the inputs; without it the 4-way unpacking below raises ValueError.
    conf_mat = confusion_matrix(y_test, pred_test_y, labels=[0, 1])
    tn, fp, fn, tp = conf_mat.ravel()
    total = tn+fp+fn+tp

    return (tn*poids_tn + fp*poids_fp + fn*poids_fn + tp * poids_tp)/total

from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, auc, plot_confusion_matrix, f1_score, fbeta_score, confusion_matrix, classification_report

# Load the fitted pipeline. The file is opened in a `with` block so the handle
# is closed as soon as unpickling is done.
# NOTE: pickle.load can execute arbitrary code; only load pipeline files that
# come from a trusted source.
with open('pipeline/pipeline_perso_balanced.pkl', 'rb') as pipeline_file:
    algo = pickle.load(pipeline_file)

algo

# Prédictions

In [22]:
# Score only the selected model inputs. Indexing with `cols` keeps this cell
# re-runnable after SK_ID_CURR and TARGET have been added to data_features.
df_result = algo.predict_proba(data_features[cols])

# assign() returns a new frame instead of writing into a slice of df_test.
# SK_ID_CURR is aligned on the index; TARGET is the second column of the
# predict_proba output.
data_features = data_features.assign(
    SK_ID_CURR=df_test['SK_ID_CURR'],
    TARGET=df_result[:, 1],
)

In [23]:
# Keep only the two columns that are written to the results file.
# NOTE: this rebinds `data_features`; the model inputs are no longer in it.
data_features = data_features[['SK_ID_CURR', 'TARGET']]
data_features

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.351929
1,100005,0.625553
2,100013,0.139660
3,100028,0.270542
4,100038,0.715743
...,...,...
48739,456221,0.212070
48740,456222,0.565237
48741,456223,0.068093
48742,456224,0.303274


# Sauvegarde des résultats

In [24]:
# Write the predictions (SK_ID_CURR, TARGET) without the index.
# NOTE(review): the file name mentions "random_iter50" while the pipeline
# loaded above is 'pipeline_perso_balanced.pkl' -- confirm the name matches
# the model that produced these predictions.
data_features.to_csv('test_kaggle/results_random_iter50.csv', index=False)

# Résultats

The private leaderboard is calculated with approximately 80% of the test data.

## Pipeline balanced

<center><img src="test_kaggle/score_kaggle_pipeline_balanced.png" width="900" height="300" /></center>

## Pipeline perso :

0.76445 (Private) / 0.76354 (Public)

## Pipeline iter50 :

0.76442 (Private) / 0.76218 (Public)