In [1]:
import pandas as pd
import numpy as np

In [2]:
# On charge nos deux jeux de données
df_test = pd.read_csv('application_test.csv')
df = pd.read_csv('application_train.csv')
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
ccb = pd.read_csv('credit_card_balance.csv')
install_pay = pd.read_csv('installments_payments.csv')
pos_cash = pd.read_csv('POS_CASH_balance.csv')
prev_app = pd.read_csv('previous_application.csv')

# Fonctions

In [3]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [4]:
# Remplissage des valeurs manquantes pour les aggrégations
def data_fill_nan(data):
    
    for col in data.columns:
        if 'BURO' in col:
            data[col].fillna(0, inplace=True)

        elif 'ACTIVE' in col:
            data[col].fillna(0, inplace=True)

        elif 'CLOSED' in col:
            data[col].fillna(0, inplace=True)

        elif 'EXT_SOURCE' in col:
            data[col].fillna(data[col].median(), inplace=True)

        elif 'PREV' in col:
            data[col].fillna(data[col].median(), inplace=True)

        elif 'REFUSED' in col:
            data[col].fillna(0, inplace=True)

        elif 'APPROVED' in col:
            data[col].fillna(0, inplace=True)

        elif 'POS' in col:
            data[col].fillna(data[col].median(), inplace=True)

        elif 'CC' in col:
            data[col].fillna(0, inplace=True)

        elif 'INSTAL' in col:
            data[col].fillna(data[col].median(), inplace=True)

In [5]:
# Pour les valeurs infinies
def del_inf(data, drop=False):

    print('Shape avant :', data.shape, '\n')
    if drop == True:
        for col in data.columns.drop('TARGET'):
            data[col].replace([np.inf, -np.inf], np.nan, inplace=True)
            data[col].dropna(inplace=True)
    else:
        for col in data.columns:
            data[col].replace([np.inf, -np.inf], np.nan, inplace=True)
            data[col].dropna(inplace=True)

    print('Shape après :', data.shape, '\n')

In [6]:
# On supprime les avertissements nous indiquant que l'on change les valeurs de notre jeu de données d'origine
pd.options.mode.chained_assignment = None

Maintenant, nous allons réaliser un preprocessing sur tous nos jeux de données avant de les joindre.

## Application (1/6)

In [7]:
df.shape

(307511, 122)

In [8]:
print("Taille initiale du jeu de données :", df.shape)

# On supprime quelques colonnes
df_cols_drop = ['REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION','DAYS_ID_PUBLISH',
                'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
                'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
                'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
                'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG',
                'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG',
                'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG',
                'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
                'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE',
                'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE',
                'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
                'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI',
                'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
                'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE',
                'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
                'NAME_TYPE_SUITE', 'CNT_CHILDREN', 'AMT_GOODS_PRICE','NAME_FAMILY_STATUS', 'OWN_CAR_AGE',
                'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START',
                'HOUR_APPR_PROCESS_START', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
                'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE',
                'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
                'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']

# On supprime les colonnes qui ne nous intéressent pas
df = df.drop(columns = df_cols_drop)
df_test = df_test.drop(columns = df_cols_drop)

# Taille des jeux de données après les changements
print("\nTaille actuelle du jeu de données :", df.shape)

Taille initiale du jeu de données : (307511, 122)

Taille actuelle du jeu de données : (307511, 41)


On joint directement nos jeux principaux d'entraînement et de test. Cela permet de faire le même preprocessing sur les deux jeux de données. Les jeux seront redivisés par la suite.

In [9]:
print("Taille Train: {}, taille test: {}".format(len(df), len(df_test)))
df = df.append(df_test).reset_index()

# On supprime 4 lignes où le genre n'est pas renseigné
df = df[df['CODE_GENDER'] != 'XNA']

# 3 variables sont binaires, on les change directement
for binary_feat in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
    df[binary_feat], uniques = pd.factorize(df[binary_feat])

# On encode nos variables catégorielles
df, cat_cols = one_hot_encoder(df, nan_as_category=False)

# On remplace les valeurs aberrantes par np.nan
df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

# On crée quelques variables
df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

Taille Train: 307511, taille test: 48744


In [10]:
# Regardons les valeurs manquantes
df_nan = df.isnull().mean()
df_nan = df_nan.sort_values(ascending=False)
liste_nan = []

for x, y in zip(df_nan.index, df_nan):
    liste_nan.append((x, y))

liste_nan[:15]

[('EXT_SOURCE_1', 0.5443016300305122),
 ('EXT_SOURCE_3', 0.19546050397051518),
 ('DAYS_EMPLOYED', 0.18146756079281182),
 ('DAYS_EMPLOYED_PERC', 0.18146756079281182),
 ('TARGET', 0.1368248790880587),
 ('EXT_SOURCE_2', 0.0018750824559088956),
 ('PAYMENT_RATE', 0.0001010523479232339),
 ('ANNUITY_INCOME_PERC', 0.0001010523479232339),
 ('AMT_ANNUITY', 0.0001010523479232339),
 ('CNT_FAM_MEMBERS', 5.61401932906855e-06),
 ('INCOME_PER_PERSON', 5.61401932906855e-06),
 ('ORGANIZATION_TYPE_Industry: type 8', 0.0),
 ('ORGANIZATION_TYPE_Industry: type 7', 0.0),
 ('ORGANIZATION_TYPE_Industry: type 1', 0.0),
 ('ORGANIZATION_TYPE_Industry: type 9', 0.0)]

In [11]:
# On remplace par la valeur médiane pour quelques variables
cols = ['DAYS_EMPLOYED', 'DAYS_EMPLOYED_PERC', 'AMT_ANNUITY',
        'PAYMENT_RATE', 'ANNUITY_INCOME_PERC', 'INCOME_PER_PERSON',
        'CNT_FAM_MEMBERS']

for col in cols:
    df[col] = df[col].fillna(df[col].median())

In [12]:
df.isnull().mean()

index                  0.000000
SK_ID_CURR             0.000000
TARGET                 0.136825
CODE_GENDER            0.000000
FLAG_OWN_CAR           0.000000
                         ...   
DAYS_EMPLOYED_PERC     0.000000
INCOME_CREDIT_PERC     0.000000
INCOME_PER_PERSON      0.000000
ANNUITY_INCOME_PERC    0.000000
PAYMENT_RATE           0.000000
Length: 138, dtype: float64

In [13]:
# Enfin, on supprime les éventuelles valeurs infinies
del_inf(df, drop=True)

Shape avant : (356251, 138) 

Shape après : (356251, 138) 



## Bureau & bureau_balance (2/6)

In [14]:
# On supprime quelques colonnes
bb_cols_drop = ['CREDIT_CURRENCY', 'DAYS_CREDIT_UPDATE', 'DAYS_ENDDATE_FACT']
bureau = bureau.drop(columns = bb_cols_drop)

In [15]:
bureau.isnull().sum()

SK_ID_CURR                      0
SK_ID_BUREAU                    0
CREDIT_ACTIVE                   0
DAYS_CREDIT                     0
CREDIT_DAY_OVERDUE              0
DAYS_CREDIT_ENDDATE        105553
AMT_CREDIT_MAX_OVERDUE    1124488
CNT_CREDIT_PROLONG              0
AMT_CREDIT_SUM                 13
AMT_CREDIT_SUM_DEBT        257669
AMT_CREDIT_SUM_LIMIT       591780
AMT_CREDIT_SUM_OVERDUE          0
CREDIT_TYPE                     0
AMT_ANNUITY               1226791
dtype: int64

In [16]:
for col in bureau.columns:
    bureau[col].fillna(bureau[col], inplace=True)

In [17]:
bureau_balance.isnull().sum()

SK_ID_BUREAU      0
MONTHS_BALANCE    0
STATUS            0
dtype: int64

In [18]:
# On gère nos variables catégorielles
bureau_balance, bb_cat = one_hot_encoder(bureau_balance, nan_as_category=False)
bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category=False)
    
# On fait une aggrégation sur bureau_balance et on joint avec bureau
bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}

for col in bb_cat:
    bb_aggregations[col] = ['mean']

bb_agg = bureau_balance.groupby('SK_ID_BUREAU').agg(bb_aggregations)
bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    
# Ce que l'on va utiliser pour l'aggrégation
num_aggregations = {
    'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'AMT_ANNUITY': ['max', 'mean'],
    'CNT_CREDIT_PROLONG': ['sum'],
    'MONTHS_BALANCE_MIN': ['min'],
    'MONTHS_BALANCE_MAX': ['max'],
    'MONTHS_BALANCE_SIZE': ['mean', 'sum']
}

# On prend la moyenne des variables catégorielles
cat_aggregations = {}
for cat in bureau_cat: cat_aggregations[cat] = ['mean']
for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']

# On agrège
bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])

# On sépare en crédits actifs / crédits terminées pour les variables numériques
active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')

closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')

In [19]:
# On vérifie s'il y a des valeurs manquantes
bureau_agg.isnull().sum()

BURO_DAYS_CREDIT_MIN                    0
BURO_DAYS_CREDIT_MAX                    0
BURO_DAYS_CREDIT_MEAN                   0
BURO_DAYS_CREDIT_VAR                41520
BURO_DAYS_CREDIT_ENDDATE_MIN         2585
                                    ...  
CLOSED_CNT_CREDIT_PROLONG_SUM       37886
CLOSED_MONTHS_BALANCE_MIN_MIN      187316
CLOSED_MONTHS_BALANCE_MAX_MAX      187316
CLOSED_MONTHS_BALANCE_SIZE_MEAN    187316
CLOSED_MONTHS_BALANCE_SIZE_SUM      37886
Length: 105, dtype: int64

In [20]:
# On s'occupe des valeurs manquantes
data_fill_nan(bureau_agg)

In [21]:
# On vérifie
bureau_agg.isnull().sum().max()

0

In [22]:
# Enfin, on supprime les éventuelles valeurs infinies
del_inf(bureau_agg)

Shape avant : (305811, 105) 

Shape après : (305811, 105) 



## Previous application (3/6)

In [23]:
# On supprime quelques colonnes
prev_cols_drop = ['NAME_SELLER_INDUSTRY', 'AMT_GOODS_PRICE', 'CHANNEL_TYPE','HOUR_APPR_PROCESS_START',
                  'NAME_TYPE_SUITE','NFLAG_LAST_APPL_IN_DAY','SELLERPLACE_AREA',
                  'WEEKDAY_APPR_PROCESS_START']

prev_app = prev_app.drop(columns = prev_cols_drop)

In [24]:
# On gère nos variables catégorielles
prev_app, cat_cols = one_hot_encoder(prev_app, nan_as_category=False)

# On remplace les valeurs aberrantes par np.nan
prev_app['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
prev_app['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev_app['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev_app['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev_app['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)

# On rajoute une variable
prev_app['APP_CREDIT_PERC'] = prev_app['AMT_APPLICATION'] / prev_app['AMT_CREDIT']

# Ce que l'on va utiliser pour l'aggrégation
num_aggregations = {
    'AMT_ANNUITY': ['min', 'max', 'mean'],
    'AMT_APPLICATION': ['min', 'max', 'mean'],
    'AMT_CREDIT': ['min', 'max', 'mean'],
    'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
}

# On prend la moyenne des variables catégorielles
cat_aggregations = {}
for cat in cat_cols:
    cat_aggregations[cat] = ['mean']

# On agrège
prev_agg = prev_app.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

# On sépare en applications actives / applications closes pour les variables numériques
approved = prev_app[prev_app['NAME_CONTRACT_STATUS_Approved'] == 1]
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')

refused = prev_app[prev_app['NAME_CONTRACT_STATUS_Refused'] == 1]
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')

In [25]:
# On vérifie s'il y a des valeurs manquantes
prev_agg.isnull().sum()

PREV_AMT_ANNUITY_MIN             480
PREV_AMT_ANNUITY_MAX             480
PREV_AMT_ANNUITY_MEAN            480
PREV_AMT_APPLICATION_MIN           0
PREV_AMT_APPLICATION_MAX           0
                               ...  
REFUSED_DAYS_DECISION_MIN     220580
REFUSED_DAYS_DECISION_MAX     220580
REFUSED_DAYS_DECISION_MEAN    220580
REFUSED_CNT_PAYMENT_MEAN      230761
REFUSED_CNT_PAYMENT_SUM       220580
Length: 182, dtype: int64

In [26]:
# On s'occupe des valeurs manquantes
data_fill_nan(prev_agg)

In [27]:
# On vérifie
prev_agg.isnull().sum().max()

0

In [28]:
# Enfin, on supprime les éventuelles valeurs infinies
del_inf(prev_agg)

Shape avant : (338857, 182) 

Shape après : (338857, 182) 



## POS balance (4/6)

In [29]:
# On gère nos variables catégorielles
pos_cash, cat_cols = one_hot_encoder(pos_cash, nan_as_category=False)

# Ce que l'on va utiliser pour l'aggrégation
aggregations = {
    'MONTHS_BALANCE': ['max', 'mean', 'size'],
    'SK_DPD': ['max', 'mean'],
    'SK_DPD_DEF': ['max', 'mean']
}

# On prend la moyenne des variables catégorielles
for cat in cat_cols:
    aggregations[cat] = ['mean']

# On agrège
pos_agg = pos_cash.groupby('SK_ID_CURR').agg(aggregations)
pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])

# On compte le nombre de compte POS
pos_agg['POS_COUNT'] = pos_cash.groupby('SK_ID_CURR').size()

In [30]:
# On vérifie s'il y a des valeurs manquantes
pos_agg.isnull().sum()

POS_MONTHS_BALANCE_MAX                                 0
POS_MONTHS_BALANCE_MEAN                                0
POS_MONTHS_BALANCE_SIZE                                0
POS_SK_DPD_MAX                                         0
POS_SK_DPD_MEAN                                        0
POS_SK_DPD_DEF_MAX                                     0
POS_SK_DPD_DEF_MEAN                                    0
POS_NAME_CONTRACT_STATUS_Active_MEAN                   0
POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN           0
POS_NAME_CONTRACT_STATUS_Approved_MEAN                 0
POS_NAME_CONTRACT_STATUS_Canceled_MEAN                 0
POS_NAME_CONTRACT_STATUS_Completed_MEAN                0
POS_NAME_CONTRACT_STATUS_Demand_MEAN                   0
POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN    0
POS_NAME_CONTRACT_STATUS_Signed_MEAN                   0
POS_NAME_CONTRACT_STATUS_XNA_MEAN                      0
POS_COUNT                                              0
dtype: int64

In [31]:
# Enfin, on supprime les éventuelles valeurs infinies
del_inf(pos_agg)

Shape avant : (337252, 17) 

Shape après : (337252, 17) 



## Installments payments (5/6)

In [32]:
# On vérifie s'il y a des valeurs manquantes
install_pay.isnull().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2905
AMT_INSTALMENT               0
AMT_PAYMENT               2905
dtype: int64

In [33]:
# On remplace par la médiane
for col in install_pay.columns:
    install_pay[col] = install_pay[col].fillna(install_pay[col].median())

In [34]:
# On gère nos variables catégorielles
install_pay, cat_cols = one_hot_encoder(install_pay, nan_as_category=False)

# On crée de nouvelles variables
install_pay['PAYMENT_PERC'] = install_pay['AMT_PAYMENT'] / install_pay['AMT_INSTALMENT']
install_pay['PAYMENT_DIFF'] = install_pay['AMT_INSTALMENT'] - install_pay['AMT_PAYMENT']

# Et on veut des valeurs positives pour les nombres de jours de nos nouvelles variables
install_pay['DPD'] = install_pay['DAYS_ENTRY_PAYMENT'] - install_pay['DAYS_INSTALMENT']
install_pay['DBD'] = install_pay['DAYS_INSTALMENT'] - install_pay['DAYS_ENTRY_PAYMENT']
install_pay['DPD'] = install_pay['DPD'].apply(lambda x: x if x > 0 else 0)
install_pay['DBD'] = install_pay['DBD'].apply(lambda x: x if x > 0 else 0)

# Ce que l'on va utiliser pour l'aggrégation
aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DPD': ['max', 'mean', 'sum'],
    'DBD': ['max', 'mean', 'sum'],
    'PAYMENT_PERC': ['max', 'mean', 'sum'],
    'PAYMENT_DIFF': ['max', 'mean', 'sum'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
}

# On prend la moyenne des variables catégorielles
for cat in cat_cols:
    aggregations[cat] = ['mean']

# On agrège
ins_agg = install_pay.groupby('SK_ID_CURR').agg(aggregations)
ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])

# On compte le nombre de compte
ins_agg['INSTAL_COUNT'] = install_pay.groupby('SK_ID_CURR').size()

In [35]:
# On vérifie s'il y a des valeurs manquantes
ins_agg.isnull().sum()

INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE    0
INSTAL_DPD_MAX                           0
INSTAL_DPD_MEAN                          0
INSTAL_DPD_SUM                           0
INSTAL_DBD_MAX                           0
INSTAL_DBD_MEAN                          0
INSTAL_DBD_SUM                           0
INSTAL_PAYMENT_PERC_MAX                  0
INSTAL_PAYMENT_PERC_MEAN                 0
INSTAL_PAYMENT_PERC_SUM                  0
INSTAL_PAYMENT_DIFF_MAX                  0
INSTAL_PAYMENT_DIFF_MEAN                 0
INSTAL_PAYMENT_DIFF_SUM                  0
INSTAL_AMT_INSTALMENT_MAX                0
INSTAL_AMT_INSTALMENT_MEAN               0
INSTAL_AMT_INSTALMENT_SUM                0
INSTAL_AMT_PAYMENT_MIN                   0
INSTAL_AMT_PAYMENT_MAX                   0
INSTAL_AMT_PAYMENT_MEAN                  0
INSTAL_AMT_PAYMENT_SUM                   0
INSTAL_DAYS_ENTRY_PAYMENT_MAX            0
INSTAL_DAYS_ENTRY_PAYMENT_MEAN           0
INSTAL_DAYS_ENTRY_PAYMENT_SUM            0
INSTAL_COUN

In [36]:
# On s'occupe des valeurs manquantes
data_fill_nan(ins_agg)

In [37]:
# Enfin, on supprime les éventuelles valeurs infinies
del_inf(ins_agg)

Shape avant : (339587, 24) 

Shape après : (339587, 24) 



## Credit card balance (6/6)

In [38]:
# On vérifie s'il y a des valeurs manquantes
ccb.isnull().sum()

SK_ID_PREV                         0
SK_ID_CURR                         0
MONTHS_BALANCE                     0
AMT_BALANCE                        0
AMT_CREDIT_LIMIT_ACTUAL            0
AMT_DRAWINGS_ATM_CURRENT      749816
AMT_DRAWINGS_CURRENT               0
AMT_DRAWINGS_OTHER_CURRENT    749816
AMT_DRAWINGS_POS_CURRENT      749816
AMT_INST_MIN_REGULARITY       305236
AMT_PAYMENT_CURRENT           767988
AMT_PAYMENT_TOTAL_CURRENT          0
AMT_RECEIVABLE_PRINCIPAL           0
AMT_RECIVABLE                      0
AMT_TOTAL_RECEIVABLE               0
CNT_DRAWINGS_ATM_CURRENT      749816
CNT_DRAWINGS_CURRENT               0
CNT_DRAWINGS_OTHER_CURRENT    749816
CNT_DRAWINGS_POS_CURRENT      749816
CNT_INSTALMENT_MATURE_CUM     305236
NAME_CONTRACT_STATUS               0
SK_DPD                             0
SK_DPD_DEF                         0
dtype: int64

In [39]:
# On remplace par 0 les valeurs manquantes
for col in ccb.columns:
    ccb[col] = ccb[col].fillna(0)

In [40]:
# On gère nos variables catégorielles
ccb, cat_cols = one_hot_encoder(ccb, nan_as_category=False)

# Ce que l'on va utiliser pour l'aggrégation
ccb.drop(['SK_ID_PREV'], axis= 1, inplace = True)

# On agrège
cc_agg = ccb.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum'])
cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])

# On compte le nombre de cartes de crédit
cc_agg['CC_COUNT'] = ccb.groupby('SK_ID_CURR').size()

In [41]:
# On vérifie qu'il n'y a pas de valeurs manquantes
cc_agg.isnull().sum().max()

0

In [42]:
# Enfin, on supprime les éventuelles valeurs infinies
del_inf(cc_agg)

Shape avant : (103558, 109) 

Shape après : (103558, 109) 



## Join

In [43]:
# On joint nos jeux de données
df = df.join(bureau_agg, how='left', on='SK_ID_CURR')
df = df.join(prev_agg, how='left', on='SK_ID_CURR')
df = df.join(pos_agg, how='left', on='SK_ID_CURR')
df = df.join(ins_agg, how='left', on='SK_ID_CURR')
df = df.join(cc_agg, how='left', on='SK_ID_CURR')

On réalise un dernier preprocessing :

In [44]:
# Valeurs manquantes
data_fill_nan(df)

# Les doublons
df.duplicated().sum()

# Les valeurs infinies
del_inf(df, drop=True)

Shape avant : (356251, 575) 

Shape après : (356251, 575) 



In [45]:
# On vérifie qu'il n'y a pas de valeurs manquantes
df_nan = df.isnull().mean()
df_nan = df_nan.sort_values(ascending=False)
liste_nan = []

for x, y in zip(df_nan.index, df_nan):
    liste_nan.append((x, y))

liste_nan[:15]

[('TARGET', 0.1368248790880587),
 ('index', 0.0),
 ('APPROVED_APP_CREDIT_PERC_MIN', 0.0),
 ('APPROVED_AMT_APPLICATION_MIN', 0.0),
 ('APPROVED_AMT_APPLICATION_MAX', 0.0),
 ('APPROVED_AMT_APPLICATION_MEAN', 0.0),
 ('APPROVED_AMT_CREDIT_MIN', 0.0),
 ('APPROVED_AMT_CREDIT_MAX', 0.0),
 ('APPROVED_AMT_CREDIT_MEAN', 0.0),
 ('APPROVED_APP_CREDIT_PERC_MAX', 0.0),
 ('APPROVED_AMT_ANNUITY_MAX', 0.0),
 ('APPROVED_APP_CREDIT_PERC_MEAN', 0.0),
 ('APPROVED_APP_CREDIT_PERC_VAR', 0.0),
 ('APPROVED_AMT_DOWN_PAYMENT_MIN', 0.0),
 ('APPROVED_AMT_DOWN_PAYMENT_MAX', 0.0)]

In [46]:
# On enregistre notre jeu de données avec le preprocessing réalisé
df_csv = df.to_csv('data_preprocessed.csv')

## Corrélations

On cherche à savoir les corrélations entre nos variables et notre cible ("TARGET").

In [47]:
# Nos corrélations :
correlations = df.corr()['TARGET'].sort_values()

In [48]:
# Top 10 des corrélations positives et négatives
print('Corrélations positives :\n')
print(correlations.tail(10), '\n\n')
print('Corrélations négatives:\n')
print(correlations.head(10))

Corrélations positives :

DAYS_BIRTH                                          0.078242
BURO_DAYS_CREDIT_MEAN                               0.083961
TARGET                                              1.000000
PREV_NAME_GOODS_CATEGORY_House Construction_MEAN         NaN
CC_SK_DPD_MIN                                            NaN
CC_SK_DPD_DEF_MIN                                        NaN
CC_NAME_CONTRACT_STATUS_Approved_MIN                     NaN
CC_NAME_CONTRACT_STATUS_Demand_MIN                       NaN
CC_NAME_CONTRACT_STATUS_Refused_MIN                      NaN
CC_NAME_CONTRACT_STATUS_Sent proposal_MIN                NaN
Name: TARGET, dtype: float64 


Corrélations négatives:

EXT_SOURCE_2                              -0.160295
EXT_SOURCE_3                              -0.156026
EXT_SOURCE_1                              -0.098875
BURO_CREDIT_ACTIVE_Closed_MEAN            -0.076500
PREV_CODE_REJECT_REASON_XAP_MEAN          -0.074356
PREV_NAME_CONTRACT_STATUS_Approved_MEAN   -0.06

Il est normal d'avoir autant de NaNs pour certaines variables, même sans avoir de valeurs manquantes dans notre jeu de données : nous n'avons quasiment pas variation dans certaines colonnes. En effet, pour certaines, nous avons remplacer beaucoup de valeurs par 0 ou par la médiane. Pour information :

https://stackoverflow.com/questions/22655667/dataframe-correlation-produces-nan-although-its-values-are-all-integers