In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv
/kaggle/input/american-express-default-prediction/american-express-prediction.ipynb
/kaggle/input/american-express-default-prediction/submission (2).csv


**Problem Statement:**

* Whether out at a restaurant or buying tickets to a concert, modern life counts on the convenience of a credit card to make daily purchases. It saves us from carrying large amounts of cash and also can advance a full purchase that can be paid over time.
* How do card issuers know we’ll pay back what we charge? That’s a complex problem with many existing solutions—and even more potential improvements, to be explored in this competition.
* Credit default prediction is central to managing risk in a consumer lending business. 
 * Credit default prediction allows lenders to optimize lending decisions, which leads to a better customer experience and sound business economics. 
* Current models exist to help manage risk. But it's possible to create better models that can outperform those currently in use.

* The objective of this competition is to predict the probability that a customer does not pay back their credit card balance amount in the future based on their monthly customer profile. 
 * The binary target variable is calculated by observing an 18-month performance window after the latest credit card statement; if the customer does not pay the amount due within 120 days of their latest statement date, it is considered a default event.

In [2]:
# Import the Libraries
import numpy as np
import pandas as pd
import dask.dataframe as dd # parallel, larger-than-memory DataFrames
import matplotlib.pyplot as plt
import seaborn as sns # Data Visualization
from datetime import datetime 
import re

In [3]:
#importing SimpleImputer for handling missing value
from sklearn.impute import SimpleImputer

# importing MissingIndicator for handling missing value
from sklearn.impute import MissingIndicator

# importing StandardScaler for standardization
from sklearn.preprocessing import StandardScaler

# importing OneHotEncoder for encoding categorical variable
from sklearn.preprocessing import OneHotEncoder

# importing for transformation
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# importing PCA for handling dimensionality reduction
from sklearn.decomposition import PCA

# importing pipeline for chaining model building activities
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline as mp

# importing FeatureUnion for combining transformers
from sklearn.pipeline import FeatureUnion

# importing samplers for handling data imbalance
from imblearn.combine import SMOTEENN 
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
from imblearn.under_sampling import RandomUnderSampler 

# importing train_test_split for train and validation split
from sklearn.model_selection import train_test_split

# importing SelectFromModel to select features from model 
from sklearn.feature_selection import SelectFromModel               

In [4]:
# importing classifiers to try with
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# importing metrics required for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# importing RepeatedKFold for cross validation
from sklearn.model_selection import RepeatedKFold
# importing for model evaluation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve

# importing RepeatedStratifiedKFold for model evaluation
from sklearn.model_selection import RepeatedStratifiedKFold

# importing GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from yellowbrick.model_selection import ValidationCurve

**Dataset:**

* **train_data.csv** - training data with multiple statement dates per customer_ID
* **train_labels.csv** - target label for each customer_ID
* **test_data.csv** - corresponding test data; objective is to predict the target label for each customer_ID
* **sample_submission.csv** - a sample submission file in the correct format

---

In [5]:
# calculate file size in KB, MB, GB
def convert_bytes(size):
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 and is then
    rendered with one decimal place followed by the matching unit, e.g.
    ``convert_bytes(1536)`` returns ``'1.5 KB'``.

    Args:
        size: Size in bytes (int or float).

    Returns:
        str: The scaled value followed by one of ``bytes``, ``KB``, ``MB``,
        ``GB`` or ``TB``. Values of 1024 TB or more are reported in TB
        (e.g. ``'1024.0 TB'``) rather than falling out of the loop and
        returning ``None``.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    for unit in units[:-1]:
        if size < 1024.0:
            return "%3.1f %s" % (size, unit)
        size /= 1024.0
    # Largest known unit: always format, however large the remaining value is
    return "%3.1f %s" % (size, units[-1])

# display every file under /kaggle/input together with its human-readable size
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        # convert_bytes() already appends the unit (KB, MB, ...), so no extra
        # 'bytes' suffix is printed after it
        filesize = convert_bytes(os.path.getsize(filepath))
        print(f'{filepath} size is', filesize)

/kaggle/input/amex-default-prediction/sample_submission.csv size is 59.1 MB bytes
/kaggle/input/amex-default-prediction/train_data.csv size is 15.3 GB bytes
/kaggle/input/amex-default-prediction/test_data.csv size is 31.5 GB bytes
/kaggle/input/amex-default-prediction/train_labels.csv size is 29.3 MB bytes
/kaggle/input/american-express-default-prediction/american-express-prediction.ipynb size is 16.2 KB bytes
/kaggle/input/american-express-default-prediction/submission (2).csv size is 93.5 MB bytes


In [6]:
# Read only the first 100,000 rows of train_data.csv
train_data = pd.read_csv(
    '/kaggle/input/amex-default-prediction/train_data.csv',
    nrows=100000,
)

# Report (rows, columns) of the loaded frame
print('Shape of dataset is:', train_data.shape)

# Index range, dtype counts and memory usage
train_data.info()

Shape of dataset is: (100000, 190)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 145.0+ MB


There are 190 variables in total: 185 are float64, 1 is int64, and 4 are of object dtype.

In [7]:
# Read the full train_labels.csv (customer_ID plus target)
train_label_data = pd.read_csv(
    '../input/amex-default-prediction/train_labels.csv'
)

# Report (rows, columns) of the loaded frame
print('Shape of dataset is:', train_label_data.shape)

# Per-column non-null counts and dtypes
train_label_data.info()

Shape of dataset is: (458913, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


* There are 458,913 entries in total, each pairing a customer_ID with its target label
* The customer_ID column has dtype object and the target column has dtype int64

In [8]:
# Read only the first 100,000 rows of test_data.csv, using customer_ID as the index
test_data = pd.read_csv(
    '../input/amex-default-prediction/test_data.csv',
    nrows=100000,
    index_col='customer_ID',
)

# Report (rows, columns) of the loaded frame
print('Shape of dataset is:', test_data.shape)

# Index range, dtype counts and memory usage
test_data.info()

Shape of dataset is: (100000, 189)
<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7 to 0246c7eb137ed9b08014d66e29caf1772b0512becef11a1eda0948b8b8908576
Columns: 189 entries, S_2 to D_145
dtypes: float64(185), int64(1), object(3)
memory usage: 145.0+ MB


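* There are 185 variables (columns) of dtype float64, 1 of dtype int64 and 3 of dtype object. This is the same structure as train_data.csv, except that customer_ID (the 4th object column there) is used as the index here and is therefore not counted among the columns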

In [9]:
# Attach the target label to every row of train_data, using customer_ID as the key.
# validate="many_to_one" makes pandas raise a MergeError if a customer_ID occurs
# more than once in train_label_data; without the check such a repeat would
# silently duplicate rows in the merged frame.
train_data = pd.merge(
    train_data,
    train_label_data,
    how="inner",
    on=["customer_ID"],
    validate="many_to_one",
)


# get shape of dataframe
print('Shape of dataset is:', train_data.shape)

# print summary of dataframe
train_data.info()

Shape of dataset is: (100000, 191)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Columns: 191 entries, customer_ID to target
dtypes: float64(185), int64(2), object(4)
memory usage: 146.5+ MB


In [10]:
# customer_ID and S_2 are not required for model building: remove both from train_data
train_data.drop(columns=['customer_ID', 'S_2'], inplace=True)

In [11]:
# S_2 is not required for model building: remove it from test_data
test_data.drop(columns=['S_2'], inplace=True)

In [12]:
# Print "Yes" when at least one row of train_data repeats an earlier row, else "No"
print("Yes" if train_data.duplicated().any() else "No")

No


In [13]:
# Print "Yes" when at least one row of test_data repeats an earlier row, else "No"
print("Yes" if test_data.duplicated().any() else "No")

No


In [14]:
# Print "Yes" when any column of train_data holds at least one missing value, else "No"
print("Yes" if train_data.isna().any().any() else "No")

Yes


In [15]:
# Print "Yes" when any column of test_data holds at least one missing value, else "No"
print("Yes" if test_data.isna().any().any() else "No")

Yes


In [16]:
# Remove every column of the train dataframe whose share of missing values is >= 75%
missing_pct = train_data.isnull().sum() / len(train_data) * 100
sparse_cols = missing_pct[missing_pct >= 75].index.tolist()

for name in sparse_cols:
    print("Dropping column", name)
train_data.drop(labels=sparse_cols, axis=1, inplace=True)

print("Total number of columns dropped in train dataframe", len(sparse_cols))

Dropping column D_42
Dropping column D_49
Dropping column D_66
Dropping column D_73
Dropping column D_76
Dropping column R_9
Dropping column B_29
Dropping column D_87
Dropping column D_88
Dropping column D_106
Dropping column R_26
Dropping column D_108
Dropping column D_110
Dropping column D_111
Dropping column B_39
Dropping column B_42
Dropping column D_132
Dropping column D_134
Dropping column D_135
Dropping column D_136
Dropping column D_137
Dropping column D_138
Dropping column D_142
Total number of columns dropped in train dataframe 23


In [17]:
# Drop from the test dataframe exactly the feature columns that are no longer
# present in the train dataframe (those removed above for having >=75% missing
# values). Re-applying the 75% threshold to the test data itself could select a
# different set of columns and leave train and test with mismatched features.
cols_to_drop = [col for col in test_data.columns if col not in train_data.columns]
for col in cols_to_drop:
    print("Dropping column", col)
test_data.drop(columns=cols_to_drop, inplace=True)

print("Total number of columns dropped in test dataframe", len(cols_to_drop))

Dropping column D_42
Dropping column D_49
Dropping column D_66
Dropping column D_73
Dropping column D_76
Dropping column R_9
Dropping column B_29
Dropping column D_87
Dropping column D_88
Dropping column D_106
Dropping column R_26
Dropping column D_108
Dropping column D_110
Dropping column D_111
Dropping column B_39
Dropping column B_42
Dropping column D_132
Dropping column D_134
Dropping column D_135
Dropping column D_136
Dropping column D_137
Dropping column D_138
Dropping column D_142
Total number of columns dropped in test dataframe 23


* The following features are categorical:

**['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']**

In [18]:
# B and D categorical variables that are re-typed as strings so they are
# handled as categories rather than as numbers.
categorical_cols = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_68"]

# A plain astype('str') turns NaN into the literal string 'nan', which hides
# missing values from isna() and from any NaN-based missing-value handling.
# Convert the values to strings, then put NaN back wherever the original value
# was missing.
# training set
train_data = train_data.assign(
    **{col: train_data[col].astype('str').where(train_data[col].notna())
       for col in categorical_cols}
)
# test set
test_data = test_data.assign(
    **{col: test_data[col].astype('str').where(test_data[col].notna())
       for col in categorical_cols}
)

In [19]:
# Split train_data into the feature matrix X (everything except 'target')
# and the label vector y (the 'target' column)
y = train_data['target']
X = train_data.drop(columns=['target'])

print("Shape of X", X.shape)
print("Shape of y", y.shape)

Shape of X (100000, 165)
Shape of y (100000,)
