# Imports and methods

In [1]:
# imports du projet
import sys, time, gc
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# Computation

In [2]:
# Read the pre-joined input table, using the first CSV column as the row index,
# then preview its first rows.
raw_data = pd.read_csv(filepath_or_buffer='input_jointure.csv', index_col=0)
raw_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Other,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Other,EMERGENCYSTATE_MODE_Yes
0,100002,1,0,1,1,0,202500.0,406597.5,24700.5,0.018801,...,0,0,0,0,0,1,0,1,0,0
1,100003,0,0,0,0,0,270000.0,1293502.5,35698.5,0.003541,...,0,0,0,0,0,0,0,1,0,0
2,100004,0,1,1,1,0,67500.0,135000.0,6750.0,0.010032,...,0,0,1,0,0,0,0,0,1,0
3,100006,0,0,0,1,0,135000.0,312682.5,29686.5,0.008019,...,0,0,1,0,0,0,0,0,1,0
4,100007,0,0,1,1,0,121500.0,513000.0,21865.5,0.028663,...,0,0,1,0,0,0,0,0,1,0


In [3]:
# Columns kept from raw_data. SK_ID_CURR is the key used by the later joins and
# TARGET is the column the correlation helper ranks the other columns against.
selected_features = [
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'EXT_SOURCE_1',
    'AMT_ANNUITY',
    'AMT_CREDIT',
    'Y_ID_PUBLISH',
    'Y_BIRTH',
    'Y_EMPLOYED',
    'Y_REGISTRATION',
    'AMT_INCOME_TOTAL',
    'CODE_GENDER',
    'NAME_EDUCATION_TYPE_Higher education',
    'SK_ID_CURR',
    'TARGET',
]
# Every row of raw_data, restricted to the columns listed above.
data = raw_data[selected_features]

## Load other data

In [4]:
# Read the raw POS_CASH balance table and preview its first rows.
pos_cash_data = pd.read_csv(filepath_or_buffer='input/POS_CASH_balance.csv')
pos_cash_data.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [10]:
# Summary statistics of the SK_DPD_DEF column.
pos_cash_data.loc[:, 'SK_DPD_DEF'].describe()

count    1.000136e+07
mean     6.544684e-01
std      3.276249e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.595000e+03
Name: SK_DPD_DEF, dtype: float64

In [6]:
# Read the raw credit card balance table.
credit_card_balance_data = pd.read_csv(filepath_or_buffer='input/credit_card_balance.csv')

In [7]:
# Preview the first rows of the credit card balance table.
credit_card_balance_data.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [9]:
# Summary statistics of the CNT_DRAWINGS_ATM_CURRENT column.
credit_card_balance_data.loc[:, 'CNT_DRAWINGS_ATM_CURRENT'].describe()

count    3.090496e+06
mean     3.094490e-01
std      1.100401e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      5.100000e+01
Name: CNT_DRAWINGS_ATM_CURRENT, dtype: float64

# Other methods

In [25]:
# Dummy-encode every object-dtype column of a frame with pandas.get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode the object-dtype columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    tuple
        ``(encoded, added)`` where ``encoded`` is the frame returned by
        ``pd.get_dummies`` and ``added`` lists the columns of ``encoded``
        that were not column names of ``df``.
    """
    columns_before = list(df.columns)

    # Only columns whose dtype compares equal to 'object' are encoded.
    object_columns = []
    for name in columns_before:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)

    added = []
    for name in encoded.columns:
        if name not in columns_before:
            added.append(name)
    return encoded, added

In [26]:
# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    """Aggregate POS_CASH_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the whole file.
    nan_as_category : bool, default True
        Passed to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR. Aggregated columns are named
        ``POS_<column>_<AGGREGATION>``; ``POS_COUNT`` holds the number of
        rows read for each SK_ID_CURR.
    """
    pos = pd.read_csv('input/POS_CASH_balance.csv', nrows = num_rows)
    # Forward the caller's flag instead of a hard-coded True, so that
    # nan_as_category=False is actually honoured.
    pos, cat_cols = one_hot_encoder(pos, nan_as_category= nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    # Each dummy column created by the encoder is averaged per SK_ID_CURR.
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    # Flatten the (column, aggregation) pairs into POS_<column>_<AGGREGATION>.
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Number of rows per SK_ID_CURR (a row count, not a count of distinct SK_ID_PREV)
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg

In [27]:
# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    """Aggregate credit_card_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the whole file.
    nan_as_category : bool, default True
        Passed to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR. Every column left after dropping SK_ID_PREV
        is aggregated with min, max, mean, sum and var and named
        ``CC_<column>_<AGGREGATION>``; ``CC_COUNT`` holds the number of rows
        read for each SK_ID_CURR.
    """
    cc = pd.read_csv('input/credit_card_balance.csv', nrows = num_rows)
    # Forward the caller's flag instead of a hard-coded True, so that
    # nan_as_category=False is actually honoured.
    cc, cat_cols = one_hot_encoder(cc, nan_as_category= nan_as_category)
    # General aggregations (SK_ID_PREV is removed so it is not aggregated)
    cc = cc.drop(columns=['SK_ID_PREV'])
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    # Flatten the (column, aggregation) pairs into CC_<column>_<AGGREGATION>.
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Number of rows per SK_ID_CURR (a row count, not a count of distinct SK_ID_PREV)
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg

# Load and join the aggregated data

In [35]:
# Build the per-SK_ID_CURR POS_CASH aggregates and attach them to `data`.
pos = pos_cash()
print("Pos-cash balance df shape:", pos.shape)
# Drop any columns left by a previous execution of this cell first: joining the
# same column names twice raises because no suffix is given, so without this
# the cell could only be run once per kernel session.
stale_pos_columns = data.columns.intersection(pos.columns)
data = data.drop(columns=stale_pos_columns).join(pos, how='left', on='SK_ID_CURR')
del pos, stale_pos_columns
gc.collect()

Pos-cash balance df shape: (337252, 18)


0

In [36]:
# Build the per-SK_ID_CURR credit card aggregates and attach them to `data`.
cc = credit_card_balance()
print("Credit card balance df shape:", cc.shape)
# Drop any columns left by a previous execution of this cell first: joining the
# same column names twice raises because no suffix is given, so without this
# the cell could only be run once per kernel session.
stale_cc_columns = data.columns.intersection(cc.columns)
data = data.drop(columns=stale_cc_columns).join(cc, how='left', on='SK_ID_CURR')
del cc, stale_cc_columns
gc.collect()

Credit card balance df shape: (103558, 141)


0

In [39]:
# Rank the columns of a frame by the strength of their correlation with the target
def get_corr_target_var(df, target_label='TARGET', threshold=0.04, top=10):
    """Return the columns most correlated (positively or negatively) with the target.

    Correlations come from ``df.corr()``. Three tables are shown with
    ``display`` (provided by the notebook environment): columns whose
    correlation is >= ``threshold``, columns whose correlation is
    <= -``threshold``, and both sets merged and ordered by magnitude.

    The target's own row is removed by label, so a column that is perfectly
    correlated with the target is still reported instead of being filtered
    out together with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``target_label`` among its columns.
    target_label : str, default 'TARGET'
        Name of the target column.
    threshold : float, default 0.04
        Cut-off applied to the correlation on both the positive and the
        negative side.
    top : int, default 10
        Maximum number of rows displayed per table and of names returned.

    Returns
    -------
    list
        Up to ``top`` column names, ordered by decreasing correlation
        magnitude.
    """
    print("Calcul des corrélations entre la target et les autres variables\n")
    # Exclude the target itself by label rather than by testing `!= 1`.
    target_corr = df.corr()[target_label].drop(labels=target_label)
    corr_tot = target_corr.sort_values(ascending=False).to_frame()

    # top positive correlations
    corr_pos = corr_tot.loc[corr_tot[target_label] >= threshold, :]
    print("Il y a", corr_pos.shape[0], "variables participant positivement à la target. Voici le top", min(top, corr_pos.shape[0]), ":")
    display(corr_pos.head(top))

    # top negative correlations, most negative first
    corr_neg = corr_tot.loc[corr_tot[target_label] <= -threshold, :].sort_values(by=target_label, ascending=True)
    print("Il y a", corr_neg.shape[0], "variables participant négativement à la target. Voici le top", min(top, corr_neg.shape[0]), ":")
    display(corr_neg.head(top))

    # merge both sides; the negative side is sign-flipped into a new frame
    # (no write into the filtered slice) so both are ordered by magnitude
    concat_list = pd.concat([corr_pos, -corr_neg]).sort_values(by=target_label, ascending=False)
    print("Et voici le top", min(top, concat_list.shape[0]), "de la liste complète :")
    display(concat_list.head(top))
    return concat_list.index.tolist()[:top]

In [40]:
# Names of the 15 columns of `data` most correlated with TARGET (the default target_label).
col_corr_target = get_corr_target_var(df=data, top=15)

Calcul des corrélations entre la target et les autres variables

Il y a 33 variables participant positivement à la target. Voici le top 15 :


Unnamed: 0,TARGET
CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,0.107692
CC_CNT_DRAWINGS_CURRENT_MAX,0.101389
CC_AMT_BALANCE_MEAN,0.087177
CC_AMT_TOTAL_RECEIVABLE_MEAN,0.08649
CC_AMT_RECIVABLE_MEAN,0.086478
CC_AMT_RECEIVABLE_PRINCIPAL_MEAN,0.086062
CC_CNT_DRAWINGS_CURRENT_MEAN,0.08252
CC_AMT_INST_MIN_REGULARITY_MEAN,0.073724
CC_CNT_DRAWINGS_POS_CURRENT_MAX,0.068942
CC_AMT_BALANCE_MAX,0.068798


Il y a 13 variables participant négativement à la target. Voici le top 13 :


Unnamed: 0,TARGET
EXT_SOURCE_3,-0.178926
EXT_SOURCE_2,-0.160471
EXT_SOURCE_1,-0.155317
Y_BIRTH,-0.078242
CC_COUNT,-0.060481
CC_NAME_CONTRACT_STATUS_Active_SUM,-0.059376
CC_MONTHS_BALANCE_VAR,-0.058817
NAME_EDUCATION_TYPE_Higher education,-0.056593
Y_ID_PUBLISH,-0.051457
Y_EMPLOYED,-0.04605


Et voici le top 15 de la liste complète :


Unnamed: 0,TARGET
EXT_SOURCE_3,0.178926
EXT_SOURCE_2,0.160471
EXT_SOURCE_1,0.155317
CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,0.107692
CC_CNT_DRAWINGS_CURRENT_MAX,0.101389
CC_AMT_BALANCE_MEAN,0.087177
CC_AMT_TOTAL_RECEIVABLE_MEAN,0.08649
CC_AMT_RECIVABLE_MEAN,0.086478
CC_AMT_RECEIVABLE_PRINCIPAL_MEAN,0.086062
CC_CNT_DRAWINGS_CURRENT_MEAN,0.08252
