# I. Importation

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import re
import cv2
from PIL import Image
import shap
import time 
import pickle

from yellowbrick.classifier import ClassificationReport
from sklearn import preprocessing
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedKFold, KFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.utils import resample
from sklearn import preprocessing
from imblearn.under_sampling import RandomUnderSampler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import xgboost


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# II. File loading

In [None]:
# For clarification, display the image that explains the datasets
plt.figure(figsize=(20, 15))
# Open the image in a context manager so the file handle is released as soon as
# the pixels have been copied into the NumPy array
with Image.open('/kaggle/input/explanation-datasets/Capture dcran (247).png') as explanation_image:
    img_array = np.array(explanation_image)
plt.imshow(img_array)

In [None]:
# Load every raw table; all CSV files live in the same Kaggle input folder
DATA_DIR = "/kaggle/input/p7-all-files"
application_test = pd.read_csv(f"{DATA_DIR}/application_test.csv")
application_train = pd.read_csv(f"{DATA_DIR}/application_train.csv")
bureau = pd.read_csv(f"{DATA_DIR}/bureau.csv")
bureau_balance = pd.read_csv(f"{DATA_DIR}/bureau_balance.csv")
credit_card_balance = pd.read_csv(f"{DATA_DIR}/credit_card_balance.csv")
HomeCredit_columns_description = pd.read_csv(f"{DATA_DIR}/HomeCredit_columns_description.csv")
installments_payments = pd.read_csv(f"{DATA_DIR}/installments_payments.csv")
POS_CASH_balance = pd.read_csv(f"{DATA_DIR}/POS_CASH_balance.csv")
previous_application = pd.read_csv(f"{DATA_DIR}/previous_application.csv")

In [None]:
# Report the shape of every loaded dataset, one line per table
for _dataset_name, _dataset in (
    ('application_test', application_test),
    ('application_train', application_train),
    ('bureau', bureau),
    ('bureau_balance', bureau_balance),
    ('credit_card_balance', credit_card_balance),
    ('HomeCredit_columns_description', HomeCredit_columns_description),
    ('installments_payments', installments_payments),
    ('POS_CASH_balance', POS_CASH_balance),
    ('previous_application', previous_application),
):
    print('Shape of dataset {} : {}'.format(_dataset_name, _dataset.shape))

# III. File cleaning 

## A - Application_train/application_test

In [None]:
# Re-read both application tables from disk, replacing the frames loaded earlier
# (presumably so the cleaning cells below can be re-run from raw data — TODO confirm)
application_train = pd.read_csv("/kaggle/input/p7-all-files/application_train.csv")
application_test = pd.read_csv("/kaggle/input/p7-all-files/application_test.csv")

In [None]:
# Delete the rows with unknown gender ('XNA'); .copy() gives an independent frame
# so the column assignments below never write into a view of the raw table
application_train = application_train[application_train['CODE_GENDER'] != 'XNA'].copy()
# Drop useless column: NAME_TYPE_SUITE
application_train = application_train.drop('NAME_TYPE_SUITE', axis=1)
application_test = application_test.drop('NAME_TYPE_SUITE', axis=1)
# Drop FLAG_MOBIL because there is only one client without a mobile phone
application_train = application_train.drop('FLAG_MOBIL', axis=1)
application_test = application_test.drop('FLAG_MOBIL', axis=1)
# DAYS_EMPLOYED contains the erroneous positive value 365243: turn it into NaN.
# The result is assigned back to the column: a chained `df[col].replace(..., inplace=True)`
# acts on a temporary Series and is not guaranteed to update the frame
# (it has no effect under pandas Copy-on-Write).
application_train['DAYS_EMPLOYED'] = application_train['DAYS_EMPLOYED'].replace(365243, np.nan)
application_test['DAYS_EMPLOYED'] = application_test['DAYS_EMPLOYED'].replace(365243, np.nan)

In [None]:
# Label encoder on binary variables
le = preprocessing.LabelEncoder()
# Transform Y to 1 and N to 0.
# The encoder is fitted on the training column only and then re-used on the test
# column, so both sets are guaranteed to share the same label -> code mapping
# (a label unseen in training raises instead of silently getting a shifted code).
for _flag_column in ('FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    application_train[_flag_column] = le.fit_transform(application_train[_flag_column])
    application_test[_flag_column] = le.transform(application_test[_flag_column])
# Replace the unknown family status with the most observed category.
# Assigned back instead of a chained inplace replace, which may act on a temporary Series.
application_train['NAME_FAMILY_STATUS'] = application_train['NAME_FAMILY_STATUS'].replace(
    'Unknown', 'Married')

In [None]:
# Label male = 1, female = 0.
# Fit on the training column and only transform the test column so that both
# sets use exactly the same label -> code mapping.
application_train['CODE_GENDER'] = le.fit_transform(application_train['CODE_GENDER'])
application_test['CODE_GENDER'] = le.transform(application_test['CODE_GENDER'])

In [None]:
# Create next feature: total number of documents provided.
# The FLAG_DOCUMENT_* columns are selected by label, once per frame, and the very
# same selection is used for the sum and for the drop. This replaces hard-coded
# positional ranges that silently pointed at other columns whenever the column
# layout changed.
doc_columns_train = application_train.loc[:, 'FLAG_DOCUMENT_2':'FLAG_DOCUMENT_21'].columns
doc_columns_test = application_test.loc[:, 'FLAG_DOCUMENT_2':'FLAG_DOCUMENT_21'].columns
# Row-wise sum of the document flags
temp_sum1 = application_train[doc_columns_train].sum(axis=1)
temp_sum2 = application_test[doc_columns_test].sum(axis=1)
application_train['FLAG_TOTAL_DOC_NUM'] = temp_sum1
application_test['FLAG_TOTAL_DOC_NUM'] = temp_sum2
# Drop the individual flags we don't need anymore
application_train.drop(columns=doc_columns_train, inplace=True)
application_test.drop(columns=doc_columns_test, inplace=True)

In [None]:
# Change the string 'nan' to real NaN values (np.nan: the np.NaN alias was removed in NumPy 2.0)
application_train['EMERGENCYSTATE_MODE'] = application_train['EMERGENCYSTATE_MODE'].replace('nan', np.nan)
application_test['EMERGENCYSTATE_MODE'] = application_test['EMERGENCYSTATE_MODE'].replace('nan', np.nan)
# Change yes and no to values 1 and 0, keeping NaN where the information is missing.
# The levels are taken from the training set in sorted order (the same order a
# LabelEncoder would use) and applied to both sets so the codes always agree.
# The result is a float column: the previous np.where(...) mix of NaN and integer
# codes produced an object column, which the one-hot encoder then treated as a
# categorical feature again.
_emergency_levels = sorted(application_train['EMERGENCYSTATE_MODE'].dropna().unique())
for _frame in (application_train, application_test):
    # Categorical codes are -1 for missing values and for levels unseen in training
    _codes = pd.Categorical(_frame['EMERGENCYSTATE_MODE'], categories=_emergency_levels).codes
    _frame['EMERGENCYSTATE_MODE'] = pd.Series(
        _codes, index=_frame.index, dtype='float64').where(_codes >= 0)

In [None]:
# Look at the categories of ORGANIZATION_TYPE and how many rows each one has
application_train['ORGANIZATION_TYPE'].value_counts()

In [None]:
# Rename the type of work clients do to have fewer categories.
# Every numbered sub-type is folded into its parent category.
organization_type_groups = {
    **{f'Business Entity Type {n}': 'Business Entity' for n in range(1, 4)},
    **{f'Transport: type {n}': 'Transport' for n in range(1, 5)},
    **{f'Trade: type {n}': 'Trade' for n in range(1, 8)},
    **{f'Industry: type {n}': 'Industry' for n in range(1, 14)},
    'Realtor': 'Housing',
    'Mobile': 'Telecom',
    'Kindergarten': 'School',
    'University': 'School',
}
application_train['ORGANIZATION_TYPE'] = application_train['ORGANIZATION_TYPE'].replace(
    organization_type_groups)

# For NAME_FAMILY_STATUS: a civil marriage is counted as married
application_train['NAME_FAMILY_STATUS'] = application_train['NAME_FAMILY_STATUS'].replace(
    {'Civil marriage': 'Married'})

# Replace rare categories of NAME_INCOME_TYPE with similar, larger categories.
# Pensioner is the category for people who don't earn money while working.
income_type_groups = {
    'Maternity leave': 'Pensioner',
    'Unemployed': 'Pensioner',
    'Businessman': 'Working',
    'Student': 'Pensioner',
}
application_train['NAME_INCOME_TYPE'] = application_train['NAME_INCOME_TYPE'].replace(
    income_type_groups)

In [None]:
# We do the same for the testing set: fold every numbered sub-type of
# ORGANIZATION_TYPE into its parent category.
organization_type_groups_test = {
    **{f'Business Entity Type {n}': 'Business Entity' for n in range(1, 4)},
    **{f'Transport: type {n}': 'Transport' for n in range(1, 5)},
    **{f'Trade: type {n}': 'Trade' for n in range(1, 8)},
    **{f'Industry: type {n}': 'Industry' for n in range(1, 14)},
    'Realtor': 'Housing',
    'Mobile': 'Telecom',
    'Kindergarten': 'School',
    'University': 'School',
}
application_test['ORGANIZATION_TYPE'] = application_test['ORGANIZATION_TYPE'].replace(
    organization_type_groups_test)

# For NAME_FAMILY_STATUS: a civil marriage is counted as married
application_test['NAME_FAMILY_STATUS'] = application_test['NAME_FAMILY_STATUS'].replace(
    {'Civil marriage': 'Married'})

# Replace rare categories of NAME_INCOME_TYPE with similar, larger categories.
# Pensioner is the category for people who don't earn money while working.
# 'Maternity leave' is mapped here as well, exactly like on the training set, so a
# test row with that value cannot produce a dummy column the training set lacks.
income_type_groups_test = {
    'Maternity leave': 'Pensioner',
    'Unemployed': 'Pensioner',
    'Businessman': 'Working',
    'Student': 'Pensioner',
}
application_test['NAME_INCOME_TYPE'] = application_test['NAME_INCOME_TYPE'].replace(
    income_type_groups_test)

In [None]:
# Feature engineering for the term of the credit:
# credit amount divided by the annuity, scaled by 365
application_train['Credit_Duration'] = application_train['AMT_CREDIT'].div(
    application_train['AMT_ANNUITY']).mul(365)
application_test['Credit_Duration'] = application_test['AMT_CREDIT'].div(
    application_test['AMT_ANNUITY']).mul(365)

In [None]:
#application_train.drop(['WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START'], axis=1, inplace=True)
#application_test.drop(['WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START'], axis=1, inplace=True)

In [None]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``: when true, an extra
        indicator column is created for missing values of each encoded column.

    Returns
    -------
    tuple
        ``(encoded_frame, new_columns)`` where ``new_columns`` lists, in frame
        order, the columns of the result that did not exist in ``df``.
    """
    columns_before = set(df.columns)
    # Only columns stored with the object dtype are treated as categorical
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

In [None]:
# Apply the one-hot encoder to application_train and application_test
application_train_ohe, cat_cols = one_hot_encoder(application_train, nan_as_category=True)
application_test_ohe, cat_cols_test = one_hot_encoder(application_test, nan_as_category=True)
# The two frames are encoded independently, so a category present only in the
# training data yields a dummy column the test frame lacks. Create those columns
# filled with 0 (category never observed) so that selecting the training feature
# list on the test frame later on cannot fail on a missing column.
for _train_only_dummy in [c for c in cat_cols if c not in application_test_ohe.columns]:
    application_test_ohe[_train_only_dummy] = 0

## B - Bureau/Bureau_balance

In [None]:
# Display summary statistics of the bureau table
bureau.describe()

In [None]:
# Rename a few categories of CREDIT_TYPE to have fewer features.
# Note that the equipment loan label is listed both with and without a trailing space.
credit_type_groups = {
    'Unknown type of loan': 'Another type of loan',
    'Mobile operator loan': 'Another type of loan',
    'Interbank credit': 'Another type of loan',
    'Loan for purchase of shares (margin lending)': 'Another type of loan',
    'Loan for the purchase of equipment ': 'Another type of loan',
    'Real estate loan': 'Mortgage',
    'Cash loan (non-earmarked)': 'Another type of loan',
    'Loan for the purchase of equipment': 'Another type of loan',
}
bureau['CREDIT_TYPE'] = bureau['CREDIT_TYPE'].replace(credit_type_groups)

In [None]:
# Are there negative values for some features? Print the number of negative entries
print(bureau['AMT_CREDIT_SUM_LIMIT'].lt(0).sum())
print(bureau['AMT_CREDIT_SUM_DEBT'].lt(0).sum())

In [None]:
# Change negative values to positive ones by taking the absolute value
# of the amount and day columns listed below
_bureau_abs_columns = [
    'AMT_CREDIT_SUM_LIMIT',
    'AMT_CREDIT_SUM_DEBT',
    'DAYS_CREDIT_ENDDATE',
    'DAYS_CREDIT',
    'DAYS_CREDIT_UPDATE',
    'DAYS_ENDDATE_FACT',
]
bureau[_bureau_abs_columns] = bureau[_bureau_abs_columns].abs()

In [None]:
# Drop the columns considered to carry no information, in a single call
bureau.drop(['CREDIT_CURRENCY', 'DAYS_ENDDATE_FACT'], axis=1, inplace=True)

In [None]:
# Apply one-hot encoding to the categorical columns of both bureau tables
# (missing values get their own indicator column, the encoder's default)
bureau_ohe, bureau_cat = one_hot_encoder(bureau)
bureau_balance_ohe, bb_cat = one_hot_encoder(bureau_balance)

In [None]:
# Apply aggregation to bureau_balance: number of monthly records per bureau credit
bb_agg = bureau_balance.groupby('SK_ID_BUREAU').agg('size')
#bureau_balance = temp.groupby('SK_ID_BUREAU').agg('mean') #sum
# bb_agg is indexed by SK_ID_BUREAU whereas bureau_balance is indexed by row
# number, so the counts must be looked up through the SK_ID_BUREAU column.
# A direct assignment would align on the row index and attach each count to an
# unrelated row (or leave NaN).
bureau_balance['NUMB_MONTH_SINCE_APPLIC'] = bureau_balance['SK_ID_BUREAU'].map(bb_agg)

In [None]:
# Create the aggregation mode for categorical and numeric columns.
# Every one-hot column of bureau is averaged.
cat_aggregations = {cat: ['mean'] for cat in bureau_cat}  # sum

# Define mean or sum to aggregate each numeric column (insertion order is kept)
num_aggregations = {
    column: [how]
    for column, how in (
        ('DAYS_CREDIT', 'mean'),
        ('DAYS_CREDIT_ENDDATE', 'mean'),
        ('DAYS_CREDIT_UPDATE', 'mean'),
        ('CREDIT_DAY_OVERDUE', 'mean'),
        ('AMT_CREDIT_MAX_OVERDUE', 'mean'),
        ('AMT_CREDIT_SUM', 'mean'),
        ('AMT_CREDIT_SUM_DEBT', 'sum'),
        ('AMT_CREDIT_SUM_OVERDUE', 'sum'),
        ('AMT_CREDIT_SUM_LIMIT', 'sum'),
        ('AMT_ANNUITY', 'mean'),
        ('CNT_CREDIT_PROLONG', 'sum'),
    )
}

In [None]:
# Merge bureau and bureau_balance, then apply aggregation.
# bureau_balance holds one row per month for each bureau credit. Merging it as is
# repeats every bureau row once per month, which multiplies the 'sum' aggregates
# (debt, overdue, limit, prolongations) by the length of the history and weights
# the means by it. The monthly rows are therefore averaged down to ONE row per
# SK_ID_BUREAU before the merge.
bb_per_bureau = bureau_balance_ohe.groupby('SK_ID_BUREAU', as_index=False).mean()
# Inner join kept from the original: only bureau credits with a balance history remain
agg = bureau_ohe.merge(bb_per_bureau, how='inner', on='SK_ID_BUREAU')
print(agg.shape)

In [None]:
# Aggregate to one row per client (SK_ID_CURR) using the numeric and
# categorical specifications together
bureau_agg_spec = {**num_aggregations, **cat_aggregations}
bureau_clean = agg.groupby('SK_ID_CURR').agg(bureau_agg_spec)
print(bureau_clean.shape)
print(application_train_ohe.shape)

In [None]:
# Left-join the per-client bureau aggregates onto the encoded application tables,
# matching the SK_ID_CURR column against the index of bureau_clean.
# NOTE(review): bureau_clean comes from a dict-of-lists aggregation and so has
# (column, statistic) headers while the application frames have flat headers —
# joining across different column depths is pandas-version sensitive; confirm
# on the target version.
agg_left1 = application_train_ohe.join(bureau_clean, on='SK_ID_CURR', how='left')
agg_left_test = application_test_ohe.join(bureau_clean, on='SK_ID_CURR', how='left')

## C - Previous_application

In [None]:
# Display column names, dtypes and non-null counts of previous_application
previous_application.info()

In [None]:
# Change Y to 1 and N to 0 with the shared label encoder
_last_appl_flag = 'FLAG_LAST_APPL_PER_CONTRACT'
previous_application[_last_appl_flag] = le.fit_transform(previous_application[_last_appl_flag])

In [None]:
# Columns removed from previous_application, grouped by the reason for dropping them
_previous_application_dropped = [
    # many categories of unknown classification
    'NAME_CASH_LOAN_PURPOSE',
    'CODE_REJECT_REASON',
    'PRODUCT_COMBINATION',
    # too many unknown values
    'NAME_GOODS_CATEGORY',
    'NAME_PRODUCT_TYPE',
    'DAYS_FIRST_DRAWING',
    # no utility
    'WEEKDAY_APPR_PROCESS_START',
    'HOUR_APPR_PROCESS_START',
    # same description as another feature
    'RATE_INTEREST_PRIVILEGED',
    'NAME_PORTFOLIO',
    'DAYS_LAST_DUE_1ST_VERSION',
]
# Keep every column that is not listed above (a listed column that is absent is simply ignored)
previous_application = previous_application.loc[
    :, ~previous_application.columns.isin(_previous_application_dropped)]

In [None]:
# Rename a few categories to have fewer features
previous_application['NAME_TYPE_SUITE'] = previous_application['NAME_TYPE_SUITE'].replace({
    'Group of people': 'Other',
    'Other_B': 'Other',
    'Other_A': 'Other',
})
previous_application['CHANNEL_TYPE'] = previous_application['CHANNEL_TYPE'].replace({
    'Channel of corporate sales': 'Sales',
    'Car dealer': 'Sales',
})

In [None]:
# Replace the aberrant value 365243 with NaN in the day columns.
# Each result is assigned back to its column: a chained
# `df[col].replace(..., inplace=True)` works on a temporary Series and is not
# guaranteed to update the frame (it has no effect under pandas Copy-on-Write).
for _days_column in ('DAYS_FIRST_DUE', 'DAYS_LAST_DUE', 'DAYS_TERMINATION'):
    previous_application[_days_column] = previous_application[_days_column].replace(365243, np.nan)

In [None]:
# Use positive numbers: take the absolute value of the down-payment
# and day columns listed below
_previous_abs_columns = [
    'AMT_DOWN_PAYMENT',
    'RATE_DOWN_PAYMENT',
    'DAYS_DECISION',
    'DAYS_FIRST_DUE',
    'DAYS_TERMINATION',
]
previous_application[_previous_abs_columns] = previous_application[_previous_abs_columns].abs()

In [None]:
# Add new features for future analysis (short alias: same object, columns are added in place)
prev = previous_application
# credit requested / credit given ratio (an infinite ratio is set to 0)
prev['NEW_PREV_APP_CREDIT_RATIO'] = (prev['AMT_APPLICATION'] / prev['AMT_CREDIT']).replace(np.inf, 0)
# loan installment / credit amount ratio
prev['NEW_ANNUITY_CREDIT_RATIO'] = prev['AMT_ANNUITY'].div(prev['AMT_CREDIT'])
# credit amount / goods price ratio (an infinite ratio is set to 0)
prev['NEW_CREDIT_GOODS_RATIO'] = (prev['AMT_CREDIT'] / prev['AMT_GOODS_PRICE']).replace(np.inf, 0)
# interest amount: total of the payments minus the credit amount
prev['NEW_AMT_INTEREST'] = prev['CNT_PAYMENT'] * prev['AMT_ANNUITY'] - prev['AMT_CREDIT']
# interest ratio
prev['NEW_INTEREST_RATIO'] = prev['NEW_AMT_INTEREST'] / prev['AMT_CREDIT']
# needed amount / credit amount
prev['NEW_AMT_NEEDED_CREDIT_RATIO'] = (
    prev['AMT_GOODS_PRICE'] - prev['AMT_DOWN_PAYMENT']) / prev['AMT_CREDIT']

In [None]:
# One-hot encode previous_application (missing values get their own
# indicator column, the encoder's default)
previous_application_ohe, previous_application_cat = one_hot_encoder(previous_application)

In [None]:
 # define method for aggregation
num_aggregations = {
    column: ['mean']
    for column in (
        'AMT_ANNUITY',
        'AMT_APPLICATION',
        'AMT_CREDIT',
        'AMT_DOWN_PAYMENT',
        'AMT_GOODS_PRICE',
        'RATE_DOWN_PAYMENT',
        'DAYS_DECISION',
        'CNT_PAYMENT',
    )
}
# Previous applications categorical features: every one-hot column is averaged
cat_aggregations = {cat: ['mean'] for cat in previous_application_cat}

In [None]:
# Display the columns whose name contains 'NAME_CONTRACT_STATUS'
previous_application_ohe.filter(regex='NAME_CONTRACT_STATUS')

In [None]:
# Aggregate previous applications to one row per client (SK_ID_CURR)
previous_application_agg_spec = {**num_aggregations, **cat_aggregations}
previous_application_clean = previous_application_ohe.groupby('SK_ID_CURR').agg(
    previous_application_agg_spec)
previous_application_clean.shape

In [None]:
# Left-join the per-client previous-application aggregates onto the frames that
# already hold the bureau features; 'previous' is the suffix pandas applies to
# right-hand columns whose name collides with an existing one.
# NOTE(review): both aggregates contain an ('AMT_ANNUITY', 'mean') column — check
# how the overlap is named on the pandas version in use.
agg_left2 = agg_left1.join(previous_application_clean, how='left', on='SK_ID_CURR',rsuffix='previous')
agg_left_test = agg_left_test.join(previous_application_clean, how='left', on='SK_ID_CURR',rsuffix='previous')

## D - Installment_payments

In [None]:
# Display column names, dtypes and non-null counts of installments_payments
installments_payments.info()

In [None]:
# Share of each installment that was actually paid (amount paid / amount due)
installments_payments['PERC_PAYM_AMT'] = installments_payments['AMT_PAYMENT'].div(
    installments_payments['AMT_INSTALMENT'])
# Difference between the actual payment day and the scheduled installment day
installments_payments['DAYS_PAST_DUE'] = installments_payments['DAYS_ENTRY_PAYMENT'].sub(
    installments_payments['DAYS_INSTALMENT'])

In [None]:
# Drop the columns already used to build the features above,
# plus NUM_INSTALMENT_VERSION
_installment_columns_to_drop = [
    'DAYS_ENTRY_PAYMENT',
    'DAYS_INSTALMENT',
    'AMT_PAYMENT',
    'AMT_INSTALMENT',
    'NUM_INSTALMENT_VERSION',
]
installments_payments = installments_payments.drop(columns=_installment_columns_to_drop)

In [None]:
# Define the aggregation per client: total of the day differences and
# average share paid
num_aggregations = dict(DAYS_PAST_DUE=['sum'], PERC_PAYM_AMT=['mean'])
installments_payments_clean = installments_payments.groupby('SK_ID_CURR').agg(num_aggregations)

In [None]:
# New join with the previous frames: add the per-client installment aggregates
agg_left3 = agg_left2.join(installments_payments_clean, how='left', on='SK_ID_CURR')
agg_left_test = agg_left_test.join(installments_payments_clean, how='left', on='SK_ID_CURR')
print(agg_left_test.shape)
# Report the frame that was just built (the previous version printed agg_left2,
# i.e. the shape from before this join)
print(agg_left3.shape)

## E - POS_CASH_balance

In [None]:
# Display summary statistics of POS_CASH_balance
POS_CASH_balance.describe()

In [None]:
# Display column names, dtypes and non-null counts of POS_CASH_balance
POS_CASH_balance.info()

In [None]:
# Replace the unknown contract status 'XNA' with the most common one
POS_CASH_balance['NAME_CONTRACT_STATUS'] = POS_CASH_balance['NAME_CONTRACT_STATUS'].replace(
    'XNA', 'Active')

In [None]:
# Apply one-hot encoding; missing values get their own indicator column
POS_CASH_balance_ohe, POS_CASH_balance_cat_cols = one_hot_encoder(
    POS_CASH_balance, nan_as_category=True)

In [None]:
# Define the aggregation per client: number of monthly records, then the
# average of the numeric columns and of every one-hot column
aggregations = {'MONTHS_BALANCE': ['size']}
aggregations.update({
    column: ['mean']
    for column in ('SK_DPD', 'SK_DPD_DEF', 'CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE')
})
aggregations.update({cat: ['mean'] for cat in POS_CASH_balance_cat_cols})  # sum
POS_CASH_balance_clean = POS_CASH_balance_ohe.groupby('SK_ID_CURR').agg(aggregations)

In [None]:
# New join with the previous frames: add the per-client POS_CASH aggregates;
# 'previous_POS_' is the suffix pandas applies to right-hand columns whose name
# collides with an existing one
agg_left4 = agg_left3.join(POS_CASH_balance_clean, how='left', on='SK_ID_CURR', rsuffix='previous_POS_')
agg_left_test = agg_left_test.join(POS_CASH_balance_clean, how='left', on='SK_ID_CURR', rsuffix='previous_POS_')
# Shapes after the join (test first, then train)
print(agg_left_test.shape)
print(agg_left4.shape)

## F - credit_card_balance

In [None]:
# Display column names, dtypes and non-null counts of credit_card_balance
credit_card_balance.info()

In [None]:
# Drop the contract status and the two days-past-due columns
_credit_card_columns_to_drop = ['NAME_CONTRACT_STATUS', 'SK_DPD', 'SK_DPD_DEF']
credit_card_balance = credit_card_balance.drop(columns=_credit_card_columns_to_drop)

In [None]:
# Print how many rows hold a strictly positive value in each receivable column
print(credit_card_balance['AMT_RECEIVABLE_PRINCIPAL'].gt(0).sum())
print(credit_card_balance['AMT_TOTAL_RECEIVABLE'].gt(0).sum())
print(credit_card_balance['AMT_RECIVABLE'].gt(0).sum())

In [None]:
# Change negative values to positive ones in the three receivable columns
_receivable_columns = ['AMT_RECEIVABLE_PRINCIPAL', 'AMT_TOTAL_RECEIVABLE', 'AMT_RECIVABLE']
credit_card_balance[_receivable_columns] = credit_card_balance[_receivable_columns].abs()

In [None]:
# Aggregation specification for credit_card_balance: every one-hot column and
# every non-object column is averaged per client
credit_card_balance_ohe, credit_card_balance_cat_cols = one_hot_encoder(credit_card_balance)
# Identifier columns are not features: SK_ID_CURR is the grouping key itself and
# the mean of SK_ID_PREV is a meaningless number, so neither is aggregated
_identifier_columns = {'SK_ID_CURR', 'SK_ID_PREV'}
aggregations = {}
for cat in credit_card_balance_cat_cols:
    aggregations[cat] = ['mean'] #sum
for col in credit_card_balance.select_dtypes(exclude=['object']).columns:
    if col not in _identifier_columns:
        aggregations[col] = ['mean']

In [None]:
# Aggregate the credit card balances to one row per client (SK_ID_CURR), then
# left-join the result onto the previous frames; 'credit_card' is the suffix
# pandas applies to right-hand columns whose name collides with an existing one
credit_card_balance_clean = credit_card_balance_ohe.groupby('SK_ID_CURR').agg(aggregations)
agg_left5 = agg_left4.join(credit_card_balance_clean, how='left', on='SK_ID_CURR', rsuffix='credit_card')
agg_left_test = agg_left_test.join(credit_card_balance_clean, how='left', on='SK_ID_CURR', rsuffix='credit_card')

In [None]:
# Shapes of the fully merged test and train frames.
# The test frame built by the joins above is agg_left_test; agg_test is only
# created later, when the saved CSV is read back.
print(agg_left_test.shape)
print(agg_left5.shape)

In [None]:
# Save both merged frames (train, then test) as CSV files in the working
# directory, without writing the row index
agg_left5.to_csv('agg 05-05.csv', index = False)
agg_left_test.to_csv('agg_test.csv', index = False)

## F - Missing values

In [None]:
# Read the merged train and test tables back from Kaggle input datasets.
# NOTE(review): `agg` is rebound here — earlier in the notebook it held the
# bureau / bureau_balance merge.
agg = pd.read_csv('/kaggle/input/agg1305/agg 05-05.csv')
agg_test = pd.read_csv('/kaggle/input/agg-test/agg_test.csv')

In [None]:
# function to look at missing values
def missing_data(data):
    """Summarise the missing values of every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data``, ordered from the most to the least
        missing values, with two columns: ``Total`` (number of missing values)
        and ``Percent`` (share of missing values, in percent of the row count).
    """
    # The null mask is computed a single time; the percentage is derived from the
    # already sorted counts so both columns share one consistent ordering
    total = data.isnull().sum().sort_values(ascending=False)
    percent = total / len(data) * 100
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [None]:
# many features have a lot of missing values
# Displays the per-column 'Total' / 'Percent' missing-value summary as the cell output.
missing_data(agg)

In [None]:
# Distribution of the per-feature percentage of missing values
plt.style.use('ggplot')
missing_percent = missing_data(agg)['Percent']
sns.displot(missing_percent)
plt.xlabel('Percentage of missing values')
plt.ylabel('Number of features')
plt.show()

In [None]:
# Keep only the features whose share of missing values is below the 48% threshold
missing_summary = missing_data(agg)
kept_features = missing_summary.loc[missing_summary['Percent'] < 48]
index = kept_features.index
agg = agg[index]

In [None]:
# Apply the same column selection to the test frame, which has no TARGET column
index_test = index.drop(['TARGET'])
agg_test = agg_test.loc[:, index_test]

In [None]:
# create a copy before feature engineering and replace infinite values with NaN
copy = agg.copy()
copy_test = agg_test.copy()
# Replace both signs of infinity: a remaining -inf would be turned into +inf by
# the sign flip applied in the next cell.
copy = copy.replace([np.inf, -np.inf], np.nan)
copy_test = copy_test.replace([np.inf, -np.inf], np.nan)

## G - Feature engineering

In [None]:
# change negative values for days for positive values
# Every column of the training frame that contains at least one negative value
# is multiplied by -1 (this flips the sign of the whole column).
negative_columns = copy.columns[(copy < 0).any()].tolist()
copy[negative_columns] = copy[negative_columns] * -1
# The test frame reuses the columns selected on the training frame, so a feature
# always carries the same sign convention in both sets (TARGET and any other
# column absent from the test frame is skipped).
negative_columns_test = [col for col in negative_columns if col in copy_test.columns]
copy_test[negative_columns_test] = copy_test[negative_columns_test] * -1

In [None]:
# Ratio features built from the aggregated "(..., 'mean')" columns of the training frame
credit_mean = copy["('AMT_CREDIT', 'mean')"]
annuity_mean = copy["('AMT_ANNUITY', 'mean').1"]
goods_price_mean = copy["('AMT_GOODS_PRICE', 'mean')"]

# credit requested / credit given; infinite ratios are set to 0
copy['new_app_credit_ratio'] = (
    copy["('AMT_APPLICATION', 'mean')"] / credit_mean).replace(np.inf, 0)
# loan installment / credit amount
copy['NEW_ANNUITY_CREDIT_RATIO'] = annuity_mean / credit_mean
# credit amount / goods price; infinite ratios are set to 0
copy['NEW_CREDIT_GOODS_RATIO'] = (credit_mean / goods_price_mean).replace(np.inf, 0)
# interest amount: number of payments times the installment, minus the credit
copy['NEW_AMT_INTEREST'] = copy["('CNT_PAYMENT', 'mean')"] * annuity_mean - credit_mean
# interest amount / credit amount
copy['NEW_INTEREST_RATIO'] = copy['NEW_AMT_INTEREST'] / credit_mean
# (goods price - down payment) / credit amount
copy['NEW_AMT_NEEDED_CREDIT_RATIO'] = (
    goods_price_mean - copy["('AMT_DOWN_PAYMENT', 'mean')"]) / credit_mean

In [None]:
# Same ratio features on the test frame so both sets expose identical columns
credit_mean_test = copy_test["('AMT_CREDIT', 'mean')"]
annuity_mean_test = copy_test["('AMT_ANNUITY', 'mean').1"]
goods_price_mean_test = copy_test["('AMT_GOODS_PRICE', 'mean')"]

# credit requested / credit given; infinite ratios are set to 0
copy_test['new_app_credit_ratio'] = (
    copy_test["('AMT_APPLICATION', 'mean')"] / credit_mean_test).replace(np.inf, 0)
# loan installment / credit amount
copy_test['NEW_ANNUITY_CREDIT_RATIO'] = annuity_mean_test / credit_mean_test
# credit amount / goods price; infinite ratios are set to 0
copy_test['NEW_CREDIT_GOODS_RATIO'] = (
    credit_mean_test / goods_price_mean_test).replace(np.inf, 0)
# interest amount: number of payments times the installment, minus the credit
copy_test['NEW_AMT_INTEREST'] = (
    copy_test["('CNT_PAYMENT', 'mean')"] * annuity_mean_test - credit_mean_test)
# interest amount / credit amount
copy_test['NEW_INTEREST_RATIO'] = copy_test['NEW_AMT_INTEREST'] / credit_mean_test
# (goods price - down payment) / credit amount
copy_test['NEW_AMT_NEEDED_CREDIT_RATIO'] = (
    goods_price_mean_test - copy_test["('AMT_DOWN_PAYMENT', 'mean')"]) / credit_mean_test

In [None]:
# Delete the weekday columns, selected by position.
# The test frame is shifted by one position relative to the training frame.
copy.drop(columns=copy.columns[204:212], inplace=True)
copy_test.drop(columns=copy_test.columns[203:211], inplace=True)

In [None]:
# delete columns without values
# Columns whose name contains 'nan' or 'XNA' (case-insensitive substring match)
# only encode missing information and are removed.
# NOTE(review): the match is a plain substring search, so any other column whose
# name happens to contain "nan" is dropped as well -- confirm this is intended.
series_temp = pd.Series(copy.columns.values)
keep_mask = ~series_temp.str.contains('nan|XNA', flags=re.I, regex=True, na=False)
# .copy() makes df an independent frame, so the feature columns assigned to it
# later do not write into a slice of `copy` (SettingWithCopyWarning).
df = copy[series_temp[keep_mask]].copy()

series_temp = pd.Series(copy_test.columns.values)
keep_mask = ~series_temp.str.contains('nan|XNA', flags=re.I, regex=True, na=False)
df_test = copy_test[series_temp[keep_mask]].copy()

In [None]:
# Shapes of the training and test frames after the column clean-up
for frame in (df, df_test):
    print(frame.shape)

In [None]:
# New ratio features, added identically to the training set and the testing set
for frame in (df, df_test):
    # Ratio of working time during lifetime
    frame['DAYS_EMPLOYED_PERC'] = frame['DAYS_EMPLOYED'] / frame['DAYS_BIRTH']
    # Income/credit amount ratio
    frame['INCOME_CREDIT_PERC'] = frame['AMT_INCOME_TOTAL'] / frame['AMT_CREDIT']
    # Income per person in families
    frame['INCOME_PER_PERSON'] = frame['AMT_INCOME_TOTAL'] / frame['CNT_FAM_MEMBERS']
    # Loan installment/income ratio
    frame['ANNUITY_INCOME_PERC'] = frame['AMT_ANNUITY'] / frame['AMT_INCOME_TOTAL']

## H - Dimensionality reduction

In [None]:
# calculate correlation between features
# Pairwise correlation matrix of the columns of df (pandas default method).
df_corr = df.corr()

In [None]:
# Annotated heatmap restricted to the first 20 rows/columns of the correlation matrix
corr_subset = df_corr.iloc[:20, :20]
subset_labels = df_corr.columns.values[:20]
plt.figure(figsize=(20, 12))
sns.heatmap(corr_subset,
            cbar=True, annot=True, square=True,
            fmt='.2f', annot_kws={'size': 10},
            yticklabels=subset_labels,
            xticklabels=subset_labels)
plt.show()

In [None]:
# Features removed because of their correlation (above 0.5 or under -0.5);
# the same list is applied to the training and the test frame.
correlated_features = [
    "('CNT_INSTALMENT_FUTURE', 'mean')",
    "('NAME_CONTRACT_STATUS_Completed', 'mean')",
    "('NAME_CONTRACT_STATUS_Canceled', 'mean')",
    "('NAME_CONTRACT_TYPE_Consumer loans', 'mean')",
    "('NAME_CONTRACT_TYPE_Cash loans', 'mean')",
    "('NAME_CLIENT_TYPE_New', 'mean')",
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
    'CNT_FAM_MEMBERS',
    'REGION_POPULATION_RELATIVE',
    'FLAG_EMP_PHONE',
    'REG_REGION_NOT_WORK_REGION',
    'REGION_RATING_CLIENT_W_CITY',
    'NAME_CONTRACT_TYPE_Revolving loans',
    'FONDKAPREMONT_MODE_reg oper account',
]

df = df.drop(columns=correlated_features)
df_test = df_test.drop(columns=correlated_features)

In [None]:
# look at shape of final df (307507, 187)
# (rows, columns) of the training frame after feature selection, shown as cell output.
df.shape

In [None]:
# we save these df to reuse them
# Both files go to the current working directory, without the row index.
df.to_csv('df_train.csv', index = False)
df_test.to_csv('df_test.csv', index = False)

# IV - Exploratory analysis

## A - First steps

In [None]:
# load the clean df
# NOTE(review): presumably the df_train.csv written at the end of the previous
# section, uploaded as a Kaggle dataset -- confirm.
df = pd.read_csv('/kaggle/input/df-final/df_train.csv')

In [None]:
# look at columns names
# Displays the current column labels as a NumPy array (cell output).
df.columns.values

In [None]:
# rename the columns for clean df
new_columns = ['EXT_SOURCE_3', 'DAYS_EMPLOYED', 'AMT_REQ_CREDIT_BUREAU_YEAR',
       'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_MON',
       'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_DAY',
       'AMT_REQ_CREDIT_BUREAU_HOUR', 'RATE_DOWN_PAYMENT',
       'AMT_DOWN_PAYMENT', 'CNT_INSTALMENT',
       'SK_DPD_DEF', 'SK_DPD', 'NAME_CONTRACT_STATUS_Signed',
       'NAME_CONTRACT_STATUS_Returned to the store',
       'NAME_CONTRACT_STATUS_Demand', 'NAME_CONTRACT_STATUS_Canceled_1',
       'NAME_CONTRACT_STATUS_Approved_1',
       'NAME_CONTRACT_STATUS_Amortized debt',
       'NAME_CONTRACT_STATUS_Active', 'MONTHS_BALANCE',
       'AMT_GOODS_PRICE_previous', 'AMT_ANNUITY_previous', 'CNT_PAYMENT',
       'NAME_PAYMENT_TYPE_Non-cash from your account',
       'NAME_PAYMENT_TYPE_Cashless from the account of the employer',
       'NAME_PAYMENT_TYPE_Cash through the bank',
       'NAME_CONTRACT_STATUS_Unused offer',
       'NAME_CONTRACT_STATUS_Refused', 'NAME_YIELD_GROUP_low_normal',
       'NAME_CONTRACT_STATUS_Approved', 'NAME_YIELD_GROUP_middle',
       'NAME_CONTRACT_TYPE_Revolving loans', 'DAYS_DECISION',
       'AMT_CREDIT_prev', 'AMT_APPLICATION', 'NAME_TYPE_SUITE_Children',
       'NAME_SELLER_INDUSTRY_Jewelry', 'NAME_SELLER_INDUSTRY_Clothing',
       'NAME_SELLER_INDUSTRY_Connectivity',
       'NAME_SELLER_INDUSTRY_Construction', 'NAME_TYPE_SUITE_Family',
       'NAME_SELLER_INDUSTRY_Furniture', 'NAME_SELLER_INDUSTRY_Industry',
       'NAME_SELLER_INDUSTRY_MLM partners',
       'NAME_SELLER_INDUSTRY_Tourism', 'NAME_YIELD_GROUP_high',
       'NAME_YIELD_GROUP_low_action',
       'NAME_SELLER_INDUSTRY_Auto technology',
       'NAME_SELLER_INDUSTRY_Consumer electronics',
       'CHANNEL_TYPE_AP+ (Cash loan)', 'CHANNEL_TYPE_Stone',
       'NAME_TYPE_SUITE_Unaccompanied', 'NAME_CLIENT_TYPE_Refreshed',
       'NAME_TYPE_SUITE_Spouse, partner', 'NAME_CLIENT_TYPE_Repeater',
       'NAME_TYPE_SUITE_Other', 'CHANNEL_TYPE_Contact center',
       'CHANNEL_TYPE_Country-wide',
       'CHANNEL_TYPE_Credit and cash offices',
       'CHANNEL_TYPE_Regional / Local', 'CHANNEL_TYPE_Sales',
       'PERC_PAYM_AMT', 'DAYS_PAST_DUE', 'OBS_30_CNT_SOCIAL_CIRCLE',
       'DEF_30_CNT_SOCIAL_CIRCLE', 'EXT_SOURCE_2', 'AMT_GOODS_PRICE',
       'Credit_Duration', 'AMT_ANNUITY', 'DAYS_LAST_PHONE_CHANGE',
       'WALLSMATERIAL_MODE_Wooden', 'EMERGENCYSTATE_MODE_1.0',
       'EMERGENCYSTATE_MODE_0.0', 'SK_ID_CURR',
       'WALLSMATERIAL_MODE_Stone, brick',
       'NAME_HOUSING_TYPE_House / apartment',
       'NAME_EDUCATION_TYPE_Academic degree',
       'NAME_EDUCATION_TYPE_Higher education',
       'NAME_EDUCATION_TYPE_Incomplete higher',
       'NAME_EDUCATION_TYPE_Lower secondary',
       'NAME_EDUCATION_TYPE_Secondary / secondary special',
       'NAME_FAMILY_STATUS_Married', 'NAME_FAMILY_STATUS_Separated',
       'NAME_FAMILY_STATUS_Single / not married',
       'NAME_FAMILY_STATUS_Widow', 'NAME_HOUSING_TYPE_Co-op apartment',
       'NAME_HOUSING_TYPE_Municipal apartment',
       'NAME_INCOME_TYPE_State servant',
       'NAME_HOUSING_TYPE_Office apartment',
       'NAME_HOUSING_TYPE_Rented apartment',
       'NAME_HOUSING_TYPE_With parents', 'OCCUPATION_TYPE_Accountants',
       'OCCUPATION_TYPE_Cleaning staff', 'OCCUPATION_TYPE_Cooking staff',
       'OCCUPATION_TYPE_Core staff', 'OCCUPATION_TYPE_Drivers',
       'OCCUPATION_TYPE_HR staff',
       'OCCUPATION_TYPE_High skill tech staff',
       'OCCUPATION_TYPE_IT staff', 'OCCUPATION_TYPE_Laborers',
       'NAME_INCOME_TYPE_Working', 'NAME_INCOME_TYPE_Pensioner',
       'WALLSMATERIAL_MODE_Panel', 'FLAG_PHONE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'DAYS_BIRTH',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_WORK_PHONE',
       'FLAG_CONT_MOBILE', 'FLAG_EMAIL',
       'NAME_INCOME_TYPE_Commercial associate', 'REGION_RATING_CLIENT',
       'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
       'FLAG_TOTAL_DOC_NUM', 'NAME_CONTRACT_TYPE_Cash loans',
       'OCCUPATION_TYPE_Low-skill Laborers', 'OCCUPATION_TYPE_Managers',
       'OCCUPATION_TYPE_Medicine staff', 'ORGANIZATION_TYPE_Other',
       'ORGANIZATION_TYPE_Police', 'ORGANIZATION_TYPE_Postal',
       'ORGANIZATION_TYPE_Religion', 'ORGANIZATION_TYPE_Restaurant',
       'ORGANIZATION_TYPE_School', 'ORGANIZATION_TYPE_Security',
       'ORGANIZATION_TYPE_Security Ministries',
       'ORGANIZATION_TYPE_Self-employed', 'ORGANIZATION_TYPE_Services',
       'ORGANIZATION_TYPE_Telecom', 'ORGANIZATION_TYPE_Trade',
       'ORGANIZATION_TYPE_Transport',
       'OCCUPATION_TYPE_Private service staff',
       'FONDKAPREMONT_MODE_not specified',
       'FONDKAPREMONT_MODE_org spec account',
       'FONDKAPREMONT_MODE_reg oper spec account',
       'HOUSETYPE_MODE_block of flats', 'HOUSETYPE_MODE_specific housing',
       'HOUSETYPE_MODE_terraced house', 'WALLSMATERIAL_MODE_Block',
       'WALLSMATERIAL_MODE_Mixed', 'WALLSMATERIAL_MODE_Monolithic',
       'WALLSMATERIAL_MODE_Others', 'ORGANIZATION_TYPE_Military',
       'TARGET', 'ORGANIZATION_TYPE_Legal Services',
       'ORGANIZATION_TYPE_Insurance', 'OCCUPATION_TYPE_Realty agents',
       'OCCUPATION_TYPE_Sales staff', 'OCCUPATION_TYPE_Secretaries',
       'OCCUPATION_TYPE_Security staff',
       'OCCUPATION_TYPE_Waiters/barmen staff',
       'ORGANIZATION_TYPE_Advertising', 'ORGANIZATION_TYPE_Agriculture',
       'ORGANIZATION_TYPE_Bank', 'ORGANIZATION_TYPE_Business Entity',
       'ORGANIZATION_TYPE_Cleaning', 'ORGANIZATION_TYPE_Construction',
       'ORGANIZATION_TYPE_Culture', 'ORGANIZATION_TYPE_Electricity',
       'ORGANIZATION_TYPE_Emergency', 'ORGANIZATION_TYPE_Government',
       'ORGANIZATION_TYPE_Hotel', 'ORGANIZATION_TYPE_Housing',
       'ORGANIZATION_TYPE_Industry', 'ORGANIZATION_TYPE_Medicine',
       'new_app_credit_ratio', 'NEW_ANNUITY_CREDIT_RATIO',
       'NEW_CREDIT_GOODS_RATIO', 'NEW_AMT_INTEREST', 'NEW_INTEREST_RATIO',
       'NEW_AMT_NEEDED_CREDIT_RATIO', 'DAYS_EMPLOYED_PERC',
       'INCOME_CREDIT_PERC', 'INCOME_PER_PERSON', 'ANNUITY_INCOME_PERC']

In [None]:
# create array with new names and add to df
# The array form allows the boolean filtering applied to new_columns later on.
new_columns = np.array(new_columns)
# Positional rename: the i-th name replaces the i-th column of df, so the list
# must follow the exact column order of the loaded file.
df.columns = new_columns

In [None]:
# we create a dataframe with explanation of all features
HomeCredit_columns_description = pd.read_csv("/kaggle/input/p7-all-files/HomeCredit_columns_description.csv")

# Take the 3rd and 4th columns of the description file and transpose them, so
# that each described feature becomes a column.
explanation = HomeCredit_columns_description.iloc[:, 2:4].T
# Relabel the columns with the values of the first transposed row.
explanation = explanation.rename(columns=explanation.iloc[0])
# That row ('Row') is now redundant with the column labels.
explanation = explanation.drop('Row', axis=0)
# drop duplicated explanations 
# (only the first column is kept for each repeated label)
explanation = explanation.loc[:, ~explanation.columns.duplicated()]

In [None]:
# Definitions of the engineered features, appended as new explanation columns
new_feature_definitions = {
    'FLAG_TOTAL_DOC_NUM': 'Total number of documents presented (document 1 to 21)',
    'new_app_credit_ratio': 'Credit requested/credit given ratio',
    'NEW_ANNUITY_CREDIT_RATIO': 'Loan installment/credit amount ratio',
    'NEW_CREDIT_GOODS_RATIO': 'Credit amount/goods price ratio',
    'NEW_AMT_INTEREST': 'Interest amount',
    'NEW_INTEREST_RATIO': 'Interest ratio',
    'NEW_AMT_NEEDED_CREDIT_RATIO': 'needed amount/credit amount ratio',
    'DAYS_EMPLOYED_PERC': 'Ratio of working time during lifetime',
    'INCOME_CREDIT_PERC': 'Income/credit amount ratio',
    'INCOME_PER_PERSON': 'Income per person in family',
    'ANNUITY_INCOME_PERC': 'Loan installment/income ratio',
    'Credit_Duration': 'Duration of the credit (days)',
    'PERC_PAYM_AMT': 'Percentage of the amount paid in each installment',
}
for feature_name, definition in new_feature_definitions.items():
    explanation[feature_name] = definition

In [None]:
# drop the features we deleted in df
# Columns are removed by position, one slice after the other: each slice is
# evaluated on the frame as left by the previous drop, so the statement order matters.
explanation.drop(explanation.iloc[:, 96:116], axis=1, inplace=True)
explanation.drop(explanation.iloc[:, 32:34], axis=1, inplace=True)
explanation.drop(explanation.iloc[:, 42:87], axis=1, inplace=True)

In [None]:
# put titles in lower case for readability
explanation.columns = explanation.columns.str.lower()
# arrange the columns in alphabetical order
explanation = explanation.sort_index(ascending=True, axis=1)

In [None]:
# we have now 209 features explained 
# (rows, columns) of the explanation frame, shown as the cell output.
explanation.shape

In [None]:
# store for the future app
# Written to the current working directory without the row index.
explanation.to_csv('explanations.csv', index=False)

## B - Analysis

In [None]:
# Number of clients in each TARGET class
plt.style.use('ggplot')
temp = df["TARGET"].value_counts()
df_class = pd.DataFrame({'Classes': temp.index, 'Values': temp.values})

# Bar chart of the class counts with readable tick labels
plt.figure(figsize=(6, 6))
plt.title('Application loans repayed')
sns.barplot(data=df_class, x='Classes', y='Values')
class_names = ["Others", "Client with payment difficulties"]
locs, labels = plt.xticks(ticks=[0, 1], labels=class_names)
plt.show()

In [None]:
# Pie chart of the class imbalance; the second wedge is pulled out of the pie
labels = 'Others', 'Clients with payment difficulties'
plt.pie(df_class['Values'],
        labels=labels,
        explode=[0, 0.3],
        autopct='%1.1f%%',
        shadow=True,
        startangle=15)
plt.title('Pie with percentage of default payment', loc='center')
plt.show()

# Count stored in the second row of df_class
difficulty_count = df_class.iloc[1, 1]
print(f'Number of person with payment difficulties : {difficulty_count}')

In [None]:
"""We have a problem of imbalance classes, only 8% represent our target 
that our model would have to predict"""

In [None]:
# we now look at the 10 most correlated features to the target
# nlargest keeps the 10 highest (most positive) correlations with TARGET; the
# TARGET column itself is part of the correlation matrix it selects from.
corr_10 = df.corr().nlargest(10, 'TARGET').index
corr_10

In [None]:
# plot heatmap for most correlated features
# DataFrame.corr ignores missing values pair by pair; np.corrcoef would return
# NaN for every feature that still contains a NaN (nulls are only imputed in
# the classification section).
corrmat = df[corr_10].corr()
sns.set(font_scale=2.0)
plt.figure(figsize=(25, 25))
# Label both axes with the feature names so each cell can be identified.
sns.heatmap(corrmat, cbar=True, annot=True, square=True, fmt='.2f', linewidths=2.5,
            annot_kws={'size': 21}, xticklabels=corr_10.values,
            yticklabels=corr_10.values)
plt.show()

### Existing features

In [None]:
# Let's now look at the distribution between gender
# Bar height is the value of TARGET aggregated per CODE_GENDER value
# (seaborn's default estimator).
sns.barplot(x = 'CODE_GENDER', y='TARGET', data=df)
plt.xlabel('Gender')
# NOTE(review): assumes CODE_GENDER 0 encodes women and 1 men -- confirm against the encoding step.
plt.xticks([0, 1], ['Women', 'Men'], rotation=20)
plt.show()
# There are more men with default payment

In [None]:
# look at the categorical variable : education_type
# Sum the education dummies per TARGET class, then order the columns by their
# count in the TARGET == 1 row (largest first).
education_columns = ['NAME_EDUCATION_TYPE_Incomplete higher',
                     'NAME_EDUCATION_TYPE_Higher education',
                     'NAME_EDUCATION_TYPE_Academic degree',
                     'NAME_EDUCATION_TYPE_Lower secondary',
                     'NAME_EDUCATION_TYPE_Secondary / secondary special']
# Several columns must be selected with a list: selecting them with a bare
# tuple of names is no longer supported on a groupby in recent pandas.
df_education = df.groupby('TARGET')[education_columns].sum().sort_values(by=1,
                                                                         axis=1,
                                                                         ascending=False)

In [None]:
# plot % of default payment based on education
# Share of clients with payment difficulties per education level:
# count in the second row (TARGET == 1) divided by the count over all classes.
default_share = (100 * df_education.iloc[1, :] / df_education.sum(axis=0)).sort_values()
# Labels are derived from the sorted data, so each bar is always named after
# the education level it actually represents.
education_labels = default_share.index.str.replace('NAME_EDUCATION_TYPE_', '', regex=False)
plt.barh(education_labels, default_share.values)
plt.title('Percentage of client with payment difficulties based on education type')
plt.xlabel('%')
plt.show()

In [None]:
# let's look at number of days past due by loan
sns.displot(df['DAYS_PAST_DUE'])
# Restrict both axes to a fixed window of the distribution
plt.xlim(-500, 2500)
plt.ylim(0,7500)
plt.show()

In [None]:
"""Plot of KDE plots when the documents are old, the clients 
have a better repayment record. Clients with newly acquired papers 
seem to default more"""

In [None]:
plt.figure(figsize=(10, 8))
# One density curve per repayment outcome (TARGET 0 first, then TARGET 1)
for target_value, curve_label in ((0, 'Payment on time'),
                                  (1, 'Payment difficulties')):
    sns.kdeplot(df.loc[df['TARGET'] == target_value, 'DAYS_ID_PUBLISH'],
                label=curve_label)

# Axis labels and title
plt.xlabel('How many days before the application did client'
           ' change the identity document with which he applied for the loan')
plt.ylabel('Density')
plt.title('Distribution')

plt.legend(shadow=True)
plt.show()

In [None]:
# Age distribution (DAYS_BIRTH converted to years) per repayment outcome
sns.set(font_scale=1.0)
plt.figure(figsize=(10, 8))
age_curves = (
    (0, 'Payment on time', 0.5, '#022282'),
    (1, 'Payment difficulties', 0.3, '#FF511A'),
)
for target_value, curve_label, opacity, colour in age_curves:
    ages = df.loc[df['TARGET'] == target_value, 'DAYS_BIRTH'] / 365
    sns.kdeplot(ages, label=curve_label, fill=True, alpha=opacity, color=colour)
# Axis labels, title and legend
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
plt.legend(shadow=True)
plt.show()

In [None]:
"""We see that above 40 years old, clients are significantly
less inclined to default payment"""

### New features

In [None]:
"""Some new feature show different patterns between client with
default payments and others.
For annuity income percentage, above a 0.16 ratio, clients are more prone 
to have payment difficulties.
For days employed percentage, above a 0.14 ratio, clients are less prone
to have payment difficulties.
No useful information can be read from credit_duration plot"""

In [None]:
# Credit_Duration feature: density per repayment outcome
plt.figure(figsize=(10, 8))
duration_curves = (
    (0, 'Payment on time', 0.5, '#022282'),
    (1, 'Payment difficulties', 0.3, '#FF511A'),
)
for target_value, curve_label, opacity, colour in duration_curves:
    sns.kdeplot(df.loc[df['TARGET'] == target_value, 'Credit_Duration'],
                label=curve_label, fill=True, alpha=opacity, color=colour)

# Axis labels and title
plt.xlabel('Duration (days)')
plt.ylabel('Density')
plt.title('Distributionof of credit term')

plt.legend(shadow=True)
plt.show()

In [None]:
# ANNUITY_INCOME_PERC feature: density per repayment outcome
plt.figure(figsize=(10, 8))
annuity_curves = (
    (0, 'Payment on time', 0.5, '#022282'),
    (1, 'Payment difficulties', 0.3, '#FF511A'),
)
for target_value, curve_label, opacity, colour in annuity_curves:
    sns.kdeplot(df.loc[df['TARGET'] == target_value, 'ANNUITY_INCOME_PERC'],
                label=curve_label, fill=True, alpha=opacity, color=colour)

# Axis labels, title and visible x range
plt.xlabel('Loan annuity/income ratio')
plt.ylabel('Density')
plt.title('Distributionof ratio loan installment/total income')
plt.xlim(-0.05, 0.4)

# Dashed vertical line marking the threshold discussed in the notes above
plt.axvline(x=0.159, color='k', linestyle='--', label='Threshold r=0.16')
plt.legend(shadow=True)
plt.show()

In [None]:
# DAYS_EMPLOYED_PERC feature: density per repayment outcome
fig = plt.figure(figsize=(10, 8))
employment_curves = (
    (0, 'Payment on time', 0.5, '#022282'),
    (1, 'Payment difficulties', 0.3, '#FF511A'),
)
for target_value, curve_label, opacity, colour in employment_curves:
    sns.kdeplot(df.loc[df['TARGET'] == target_value, 'DAYS_EMPLOYED_PERC'],
                label=curve_label, fill=True, alpha=opacity, color=colour)

# Axis labels, title and visible x range
plt.xlabel('Ratio of working time during lifetime')
plt.ylabel('Density')
plt.title('Distribution of ratio working time/living time')
plt.xlim(-0.02, 0.7)
# Dashed vertical line marking the default-payment threshold
plt.axvline(x=0.14, color='k', linestyle='--', label='Threshold r=0.14')

plt.legend(shadow=True)
plt.show()

# V - Classification

## A - Upsampling

In [None]:
"""For the imbalance class problem we try several methods including upsampling and downsampling.
The first one is shown below but rejected because of poor results (predictions (macro_avg) around 55%)"""

In [None]:
# SMOTE: nearest-neighbour based oversampling (the cell below uses random oversampling with replacement instead)

In [None]:
# Random oversampling of the minority class, applied to the training rows only.
X = df.loc[:, df.columns != 'TARGET']
y = df['TARGET']

# Split the frame into train and test sets. The test set is held out BEFORE any
# resampling, so it keeps the real class balance and cannot contain copies of
# training rows.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.3, random_state=42)
# The validation set is carved out before oversampling for the same reason.
X_fit, X_val, y_fit, y_val = train_test_split(X_train1, y_train1, train_size=.8, random_state=0)

# concatenate the training features and target back together
train_data = pd.concat([X_fit, y_fit], axis=1)
# separate minority and majority classes
not_default = train_data[train_data.TARGET == 0]
default = train_data[train_data.TARGET == 1]
# upsample minority
default_upsampled = resample(default,
                             replace=True,  # sample with replacement
                             n_samples=len(not_default),  # match number in majority class
                             random_state=42)  # reproducible results

# combine majority and upsampled minority, shuffled so the classes are interleaved
upsampled = pd.concat([not_default, default_upsampled]).sample(frac=1, random_state=42)
# new X and y
X_upsampled = upsampled.loc[:, upsampled.columns != 'TARGET']
y_upsampled = upsampled['TARGET']
X_train, y_train = X_upsampled, y_upsampled
X_test, y_test = X_test1, y_test1

# fill null values with statistics learned on the training rows only
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_val = X_val.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardization with RobustScaler: fitted on the training rows, then applied
# unchanged to the validation and test rows
rb_sc = RobustScaler()
X_train_scaled = rb_sc.fit_transform(X_train)
X_val_scaled = rb_sc.transform(X_val)
X_test_scaled = rb_sc.transform(X_test)

In [None]:
# Shapes of the scaled training/validation features and of their targets
for split in (X_train_scaled, X_val_scaled, y_train, y_val):
    print(split.shape)

## B - Undersampling

In [None]:
X = df.loc[:, df.columns != 'TARGET']
y = df['TARGET']
# Undersampling with imblearn; the fixed seed makes the retained rows (and all
# the scores computed from them) reproducible from one run to the next
rUs = RandomUnderSampler(random_state=42)
X_ru, y_ru = rUs.fit_resample(X, y)
# Split the sets into training and testing sets
# NOTE(review): the resampling happens before the split, so the test set is
# class-balanced too and does not reflect the real default rate -- confirm this
# is intended for the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(X_ru, y_ru, test_size=0.3, random_state=42)

### Univariate imputation

In [None]:
"""We fill the null values after the split to have more independant values"""

In [None]:
# Apply the readable feature names (every new name except TARGET) to both splits
feature_names = new_columns[new_columns != 'TARGET']
X_train.columns = feature_names
X_test.columns = feature_names

In [None]:
# new split for validation set, done before imputing so the validation rows do
# not contribute to the imputation statistics
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=.8, random_state=0)
# fillna: column means are learned on the training rows only and reused for the
# validation and test rows
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_val = X_val.fillna(train_means)
X_test = X_test.fillna(train_means)

In [None]:
# save X_test for the future app
# The row index is written as the first column of the file (no index=False here).
X_test.to_csv('X_test.csv')

### Multivariate imputation

In [None]:
# use of iterative imputer
# NOTE(review): the univariate cells above already fill the nulls and split off
# a validation set; presumably only one of the two imputation variants is meant
# to be run -- confirm.
imp = IterativeImputer(max_iter=10, random_state=0)
# The imputer is fitted on the training rows only, then applied to train and test.
imp.fit(X_train)
X_train = imp.transform(X_train)
X_test = imp.transform(X_test)
# Validation split taken from the already-imputed training rows.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=.8, random_state=0)

## C - Xgboost model

In [None]:
"""Use of XGBoost from high performance, SHAP compatibility, and for the hyperparameter scale_pos_weight made for imbalanced datasets. 
The use of this hyperparameter didn't yield good results, so I don't use it in the final model.
It was interesting to have a good recall, but the precision was too low (0.14)"""

In [None]:
# Hyperparameters of the final XGBoost classifier
# (scale_pos_weight=100 was tried and is intentionally left out)
xgb_params = dict(
    random_state=0,
    eta=0.1,
    max_depth=10,
    max_leaves=2,
    n_estimators=100,
    reg_alpha=0.65,
    reg_lambda=0.1,
    subsample=0.8,
    gamma=10,
    min_child_weight=1,
)
xgb_model = xgboost.XGBClassifier(**xgb_params)

### Grid search for hyperparameter tuning

In [None]:
"""We determine first the number of estimators and the learning rate, then a few selected hyperparameters 
with a cross validation of 5."""

In [None]:
# Search 1: number of boosting rounds (n_estimators)
param_test1 = {'n_estimators': [5, 100, 500, 1000, 5000]}

# All other hyperparameters stay fixed during this search
fixed_params1 = dict(random_state=0, eta=0.5, max_depth=5, max_leaves=5, gamma=1,
                     reg_alpha=1, reg_lambda=1, subsample=1, min_child_weight=1)
gsearch1 = GridSearchCV(estimator=xgboost.XGBClassifier(**fixed_params1),
                        param_grid=param_test1,
                        scoring='f1_macro',
                        n_jobs=-1,
                        cv=5)

# 5-fold search on the training split
gsearch1 = gsearch1.fit(X_train, y_train)

# Best value and its cross-validated macro F1 (cell output)
gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Search 2: learning rate (eta)
param_test2 = {'eta': [0.0001, 0.001, 0.01, 0.1, 0.3]}

# All other hyperparameters stay fixed during this search
fixed_params2 = dict(random_state=0, gamma=1, max_depth=5, max_leaves=5,
                     n_estimators=100, reg_alpha=1, reg_lambda=1, subsample=1,
                     min_child_weight=1)
gsearch2 = GridSearchCV(estimator=xgboost.XGBClassifier(**fixed_params2),
                        param_grid=param_test2,
                        scoring='f1_macro',
                        n_jobs=-1,
                        cv=5)

# 5-fold search on the training split
gsearch2 = gsearch2.fit(X_train, y_train)

# Best value and its cross-validated macro F1 (cell output)
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# param: gamma
param_test3 = {'gamma':range(5, 15, 1)}

# Grid search 3: every other hyperparameter is held fixed while gamma varies
gsearch3 = GridSearchCV(estimator = xgboost.XGBClassifier(random_state=0,
                                  eta=0.1,
                                  max_depth=5,
                                  max_leaves=5,
                                  n_estimators=100,
                                  reg_alpha=1,
                                  reg_lambda=1,
                                  subsample=1,
                                 min_child_weight=1),
                        param_grid = param_test3, scoring='f1_macro',n_jobs=-1, cv=5)

# fit the gamma search itself: fitting gsearch1 here would re-run the
# n_estimators search and report its result under the gsearch3 name
gsearch3 = gsearch3.fit(X_train ,y_train)

# print best params and score
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Search 4: L2 regularisation strength (reg_lambda)
param_test4 = {'reg_lambda': np.arange(0.05, 0.95, 0.05)}

# All other hyperparameters stay fixed during this search
fixed_params4 = dict(random_state=0, eta=0.1, max_depth=5, max_leaves=5, gamma=10,
                     reg_alpha=1, subsample=1, min_child_weight=1, n_estimators=100)
gsearch4 = GridSearchCV(estimator=xgboost.XGBClassifier(**fixed_params4),
                        param_grid=param_test4,
                        scoring='f1_macro',
                        n_jobs=-1,
                        cv=5)

# 5-fold search on the training split
gsearch4 = gsearch4.fit(X_train, y_train)

# Best value and its cross-validated macro F1 (cell output)
gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Search 5: maximum tree depth (max_depth)
param_test5 = {'max_depth': range(5, 50, 3)}

# All other hyperparameters stay fixed during this search
fixed_params5 = dict(random_state=0, eta=0.1, reg_lambda=0.1, max_leaves=5, gamma=10,
                     reg_alpha=1, subsample=1, min_child_weight=1, n_estimators=100)
gsearch5 = GridSearchCV(estimator=xgboost.XGBClassifier(**fixed_params5),
                        param_grid=param_test5,
                        scoring='f1_macro',
                        n_jobs=-1,
                        cv=5)

# 5-fold search on the training split
gsearch5 = gsearch5.fit(X_train, y_train)

# Best value and its cross-validated macro F1 (cell output)
gsearch5.best_params_, gsearch5.best_score_

In [None]:

# Search 6: maximum number of leaves (max_leaves)
param_test6 = {'max_leaves': range(2, 10, 1)}

# All other hyperparameters stay fixed during this search
fixed_params6 = dict(random_state=0, eta=0.1, reg_lambda=0.1, max_depth=10, gamma=10,
                     reg_alpha=1, subsample=1, min_child_weight=1, n_estimators=100)
gsearch6 = GridSearchCV(estimator=xgboost.XGBClassifier(**fixed_params6),
                        param_grid=param_test6,
                        scoring='f1_macro',
                        n_jobs=-1,
                        cv=5)

# 5-fold search on the training split
gsearch6 = gsearch6.fit(X_train, y_train)

# Best value and its cross-validated macro F1 (cell output)
gsearch6.best_params_, gsearch6.best_score_

In [None]:
# Grid search 7: tune subsample while every other hyper-parameter stays fixed.
# Candidates start at 0.1 and advance in steps of 0.1, stopping before 1
# (exact endpoints are subject to floating-point rounding in np.arange).
param_test7 = {'subsample': np.arange(0.1, 1, 0.1)}

# Hyper-parameters held constant for this search
fixed_params7 = dict(
    random_state=0,
    eta=0.1,
    reg_lambda=0.1,
    max_depth=10,
    gamma=10,
    reg_alpha=1,
    max_leaves=2,
    min_child_weight=1,
    n_estimators=100,
)

# Cross-validated search (cv=5) scored on macro-averaged F1, run with n_jobs=-1
gsearch7 = GridSearchCV(
    estimator=xgboost.XGBClassifier(**fixed_params7),
    param_grid=param_test7,
    scoring='f1_macro',
    n_jobs=-1,
    cv=5,
)

# fit() returns the search object itself, so re-assigning it is unnecessary
gsearch7.fit(X_train, y_train)

# Cell output: best parameter value found and its mean cross-validated score
gsearch7.best_params_, gsearch7.best_score_

In [None]:
# Grid search 8: tune min_child_weight while every other hyper-parameter stays fixed.
param_test8 = {'min_child_weight': range(1, 5, 1)}  # candidate values 1 through 4

# Hyper-parameters held constant for this search
fixed_params8 = dict(
    random_state=0,
    eta=0.1,
    reg_lambda=0.1,
    max_depth=10,
    gamma=10,
    reg_alpha=0.65,
    max_leaves=2,
    subsample=0.8,
    n_estimators=100,
)

# Cross-validated search (cv=5) scored on macro-averaged F1, run with n_jobs=-1
gsearch8 = GridSearchCV(
    estimator=xgboost.XGBClassifier(**fixed_params8),
    param_grid=param_test8,
    scoring='f1_macro',
    n_jobs=-1,
    cv=5,
)

# fit() returns the search object itself, so re-assigning it is unnecessary
gsearch8.fit(X_train, y_train)

# Cell output: best parameter value found and its mean cross-validated score
gsearch8.best_params_, gsearch8.best_score_

In [None]:
# Grid search 9: tune reg_alpha while every other hyper-parameter stays fixed.
# Candidates start at 0.05 and advance in steps of 0.05, stopping before 0.95
# (exact endpoints are subject to floating-point rounding in np.arange).
param_test9 = {'reg_alpha': np.arange(0.05, 0.95, 0.05)}

# Hyper-parameters held constant for this search
# NOTE(review): max_depth / max_leaves / subsample here differ from the values
# fixed in the earlier searches of this section — confirm this is intentional.
fixed_params9 = dict(
    random_state=0,
    eta=0.1,
    max_depth=5,
    max_leaves=5,
    gamma=10,
    reg_lambda=0.1,
    subsample=1,
    min_child_weight=1,
    n_estimators=100,
)

# Cross-validated search (cv=5) scored on macro-averaged F1, run with n_jobs=-1
gsearch9 = GridSearchCV(
    estimator=xgboost.XGBClassifier(**fixed_params9),
    param_grid=param_test9,
    scoring='f1_macro',
    n_jobs=-1,
    cv=5,
)

# fit() returns the search object itself, so re-assigning it is unnecessary
gsearch9.fit(X_train, y_train)

# Cell output: best parameter value found and its mean cross-validated score
gsearch9.best_params_, gsearch9.best_score_

### Model predictions

In [None]:
# Train the final model on the training set and report the wall-clock duration.
# NOTE(review): recent xgboost releases deprecate passing early_stopping_rounds
# and eval_metric to fit() (they belong on the estimator instead) — confirm the
# installed version still accepts them here.
start_time = time.time()

fit_options = dict(
    # Both sets are evaluated each round; the validation set is listed last
    eval_set=[(X_train, y_train), (X_val, y_val)],
    early_stopping_rounds=10,
    verbose=True,
    eval_metric='aucpr',  # area under the precision-recall curve
)
xgb_model = xgb_model.fit(X_train, y_train, **fit_options)

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

In [None]:
# Predict on the held-out test set with the fitted model.
# y_pred_xgb holds the predicted class labels ...
y_pred_xgb = xgb_model.predict(X_test)
# ... and proba the per-class predicted probabilities for the same rows.
proba = xgb_model.predict_proba(X_test)

In [None]:
# Persist the trained model to the working directory.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() inside the call would leak the handle.
with open('final_model.pickle', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

In [None]:
# Display names for the target classes shown in the report
# (presumably label 0 = on-time payment, label 1 = default — TODO confirm ordering)
classes = ["Payment on time", "Default payment"]

# Wrap the XGBoost model in a yellowbrick classification-report visualizer;
# support=True also shows the number of samples per class
visualizer = ClassificationReport(xgb_model, classes=classes, support=True)
# Fit the visualizer on the training data
# NOTE(review): whether this re-trains the already-fitted xgb_model depends on
# yellowbrick's fitted-estimator detection — confirm
visualizer.fit(X_train, y_train) 
# Evaluate the model on the test data
visualizer.score(X_test, y_test)        
# Render the report figure
visualizer.show()  

In [None]:
# Look at the most important features: pair each training column with the
# importance the fitted model assigns it, then rank from highest to lowest.
coef = pd.Series(data=xgb_model.feature_importances_, index=X_train.columns)
coef = coef.sort_values(ascending=False)

In [None]:
# Bar chart of the first 20 entries of `coef`
# (already sorted by descending importance where `coef` is built).
plt.style.use('ggplot')
coef[:20].plot(kind='bar')
plt.title('Feature importance du modèle', fontsize=15)

# Emphasise the two top-ranked features on the x axis (colour, size, weight).
# Slicing with [:2] keeps this safe when fewer than two bars are drawn,
# where indexing [0] / [1] directly would raise IndexError.
for tick_label in plt.gca().get_xticklabels()[:2]:
    tick_label.set_color('#547894')
    tick_label.set_fontsize(13)
    tick_label.set_fontweight('bold')

# Uncomment to export the figure:
# plt.savefig('Feature Importance', dpi=300, bbox_inches='tight')

plt.show()

# VI - Scoring

In [None]:
"""We use shap to compute a score for each client that can be explained
by the weight given to each feature"""

In [None]:
# Build a SHAP tree explainer for the fitted model and compute SHAP values
# for the test set, timing the whole computation.
start_time = time.time()

# Feature names are passed to the explainer in lower case
lowercase_feature_names = X_test.columns.str.lower().values
explainer = shap.TreeExplainer(xgb_model, feature_names=lowercase_feature_names)
shap_values = explainer.shap_values(X_test, y=y_test)

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

In [None]:
# Also build a generic shap.Explainer for the same model; unlike the tree
# explainer in this section, it receives the original (non-lowercased) column names.
shap_explainer = shap.Explainer(xgb_model, feature_names=X_test.columns.values)
# Calling the explainer on the test set yields the object consumed by the
# shap.plots.bar / shap.plots.waterfall calls later in this section
shap_explainer_values = shap_explainer(X_test, y=y_test)

In [None]:
# Computing the SHAP objects takes time, so both are stored with pickle for reuse.
# NOTE(review): the destination is an absolute, machine-specific Windows path —
# confirm it exists in the environment where this notebook is run.
for pickle_path, payload in (
    (r'C:\Users\Antoine\Projet 7\shap_explainer.pickle', shap_explainer),
    (r'C:\Users\Antoine\Projet 7\shap_explainer_values.pickle', shap_explainer_values),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Score each test-set client with the model held inside the SHAP tree explainer.
# NOTE(review): presumably this is the raw (untransformed) model output rather
# than a class label or probability — confirm against the shap documentation.
score = explainer.model.predict(X_test)

In [None]:
# Wrap the scores in a single-column DataFrame ('score X_test') that reuses
# the index of X_test, so each score stays attached to its client row.
score = pd.DataFrame({'score X_test': score}, index=X_test.index)

In [None]:
# Rescale the scores with a min-max scaler (default output range [0, 1]) so
# they can be read on a probability-like scale.
# The transformed result is what the scaler returns, not the `score` DataFrame.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(score)
norm_score = min_max_scaler.transform(score)

In [None]:
# Store both the normalized and the raw scores as CSV files.
# MinMaxScaler.fit_transform returns a NumPy array, which has no to_csv method,
# so the normalized scores are wrapped in a DataFrame that mirrors the index
# and column name of `score` before being written.
pd.DataFrame(norm_score, index=score.index, columns=score.columns).to_csv(
    'normalized_scoring.csv', index=False
)
score.to_csv('scoring.csv', index=False)

In [None]:
# SHAP summary plot of the tree-explainer values computed on the test set,
# with X_test supplying the feature values
shap.summary_plot(shap_values, X_test)

In [None]:
# Overall feature-importance bar plot built from the shap.Explainer output
# computed on the test set
shap.plots.bar(shap_explainer_values)

In [None]:
"""Unfortunately, we don't know what the two top features are.
We can see for other features the importance they have in predictions."""

In [None]:
# Waterfall plot showing how each feature contributes to one client's prediction.
# Index 1 selects the second observation of the test set.
shap.plots.waterfall(shap_explainer_values[1]) 

In [None]:
# Force plot for the same client (index 1), using the tree explainer's outputs.
# initjs() loads the JavaScript needed to render the interactive plot in the notebook.
shap.initjs()
# NOTE(review): assumes shap_values is a 2-D array so that [1] selects a row
# (one client); if it is a per-class list, [1] would select a class — confirm.
shap.plots.force(explainer.expected_value, shap_values[1])

In [None]:
"""When the shap score is negative, this is a prediction for payment.
When it's positive, the model predicts a default in payment"""