In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import matplotlib
from datetime import datetime
import plotly.express as px
import missingno as msno
import re
from collections import Counter
import plotly.graph_objs as go

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

from sklearn.decomposition import PCA

from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline as mp
from sklearn.pipeline import FeatureUnion

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

In [4]:
# importing classifiers to try with
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# importing metrics required for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# importing RepeatedKFold for cross validation
from sklearn.model_selection import RepeatedKFold
# importing for model evaluation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
# importing RepeatedStratifiedKFold for model evaluation
from sklearn.model_selection import RepeatedStratifiedKFold
# importing GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from yellowbrick.model_selection import ValidationCurve

In [5]:
# calculate file size in KB, MB, GB
def convert_bytes(size):
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (TB) is reached, then rendered with one decimal place,
    e.g. ``1536`` -> ``"1.5 KB"``.

    Parameters
    ----------
    size : int or float
        Size in bytes.

    Returns
    -------
    str
        The scaled value followed by its unit ('bytes', 'KB', 'MB', 'GB' or 'TB').
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    for unit in units[:-1]:
        if size < 1024.0:
            return "%3.1f %s" % (size, unit)
        size /= 1024.0
    # Anything still >= 1024 here is reported in the largest unit instead of
    # falling off the loop and implicitly returning None.
    return "%3.1f %s" % (size, units[-1])

# Walk the Kaggle input directory and print every file with its human-readable size.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        csvfile = os.path.join(dirname, filename)
        filesize = convert_bytes(os.path.getsize(csvfile))
        # convert_bytes already appends the unit, so no extra "bytes" suffix is printed.
        print(f'{csvfile} size is', filesize)

/kaggle/input/amex-default-prediction/sample_submission.csv size is 59.1 MB bytes
/kaggle/input/amex-default-prediction/train_data.csv size is 15.3 GB bytes
/kaggle/input/amex-default-prediction/test_data.csv size is 31.5 GB bytes
/kaggle/input/amex-default-prediction/train_labels.csv size is 29.3 MB bytes


In [6]:
from pathlib import Path
input_path = Path('/kaggle/input/amex-default-prediction/')

In [7]:
# Load only the first 100,000 rows of the training data.
# Uses the absolute input_path defined above so the read does not depend on the working directory.
train_df_sample = pd.read_csv(input_path / 'train_data.csv', nrows=100000)

In [8]:
print('Shape of dataset is:', train_df_sample.shape)
train_df_sample.info()

Shape of dataset is: (100000, 190)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 145.0+ MB


In [9]:
# Load the customer_ID -> target labels.
# Uses the absolute input_path defined above so the read does not depend on the working directory.
train_label_df = pd.read_csv(input_path / 'train_labels.csv')

In [10]:
print('Shape of dataset is:', train_label_df.shape)
train_label_df.info()

Shape of dataset is: (458913, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


In [11]:
# Load the first 100,000 rows of the test data, indexed by customer_ID.
# Uses the absolute input_path defined above so the read does not depend on the working directory.
test_df = pd.read_csv(input_path / 'test_data.csv', nrows=100000, index_col='customer_ID')

In [12]:
print('Shape of dataset is:', test_df.shape)
test_df.info()

Shape of dataset is: (100000, 189)
<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7 to 0246c7eb137ed9b08014d66e29caf1772b0512becef11a1eda0948b8b8908576
Columns: 189 entries, S_2 to D_145
dtypes: float64(185), int64(1), object(3)
memory usage: 145.0+ MB


In [13]:
# Attach the target label to every sampled row via an inner join on customer_ID.
train_df = train_df_sample.merge(train_label_df, on="customer_ID", how="inner")

In [14]:
train_df['target'].value_counts()

0    74613
1    25387
Name: target, dtype: int64

In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Columns: 191 entries, customer_ID to target
dtypes: float64(185), int64(2), object(4)
memory usage: 146.5+ MB


In [16]:
# Remove the customer identifier and the S_2 column from the training frame.
train_df = train_df.drop(columns=['customer_ID', 'S_2'])

In [17]:
# Remove the S_2 column from the test frame (customer_ID stays as the index).
test_df = test_df.drop(columns=['S_2'])

In [18]:
train_df.duplicated().sum()

0

In [19]:
test_df.duplicated().sum()

0

In [20]:
train_df.isna().sum()

P_2         786
D_39          0
B_1           0
B_2          37
R_1           0
          ...  
D_142     82728
D_143      1813
D_144       733
D_145      1813
target        0
Length: 189, dtype: int64

In [21]:
test_df.isna().sum()

P_2        482
D_39         0
B_1          0
B_2         24
R_1          0
         ...  
D_141      814
D_142    82413
D_143      814
D_144      204
D_145      814
Length: 188, dtype: int64

# Drop variables with >=75% missing values in the train dataset

In [22]:
# Percentage of missing values per column, computed once for the whole frame.
train_missing_pct = train_df.isnull().sum() / len(train_df) * 100
# Columns at or above the 75% missing threshold, in their original column order.
train_sparse_cols = train_missing_pct[train_missing_pct >= 75].index.tolist()

for sparse_col in train_sparse_cols:
    print("Dropping column", sparse_col)
train_df.drop(columns=train_sparse_cols, inplace=True)

print("Total number of columns dropped in train dataframe", len(train_sparse_cols))

Dropping column D_42
Dropping column D_49
Dropping column D_66
Dropping column D_73
Dropping column D_76
Dropping column R_9
Dropping column B_29
Dropping column D_87
Dropping column D_88
Dropping column D_106
Dropping column R_26
Dropping column D_108
Dropping column D_110
Dropping column D_111
Dropping column B_39
Dropping column B_42
Dropping column D_132
Dropping column D_134
Dropping column D_135
Dropping column D_136
Dropping column D_137
Dropping column D_138
Dropping column D_142
Total number of columns dropped in train dataframe 23


# Drop variables with >=75% missing values in the test dataset

In [23]:
# Percentage of missing values per column, computed once for the whole frame.
# NOTE(review): the threshold is evaluated on the test frame independently of the
# train frame; if the two ever disagree, the train and test column sets diverge.
test_missing_pct = test_df.isnull().sum() / len(test_df) * 100
# Columns at or above the 75% missing threshold, in their original column order.
test_sparse_cols = test_missing_pct[test_missing_pct >= 75].index.tolist()

for sparse_col in test_sparse_cols:
    print("Dropping column", sparse_col)
test_df.drop(columns=test_sparse_cols, inplace=True)

print("Total number of columns dropped in test dataframe", len(test_sparse_cols))

Dropping column D_42
Dropping column D_49
Dropping column D_66
Dropping column D_73
Dropping column D_76
Dropping column R_9
Dropping column B_29
Dropping column D_87
Dropping column D_88
Dropping column D_106
Dropping column R_26
Dropping column D_108
Dropping column D_110
Dropping column D_111
Dropping column B_39
Dropping column B_42
Dropping column D_132
Dropping column D_134
Dropping column D_135
Dropping column D_136
Dropping column D_137
Dropping column D_138
Dropping column D_142
Total number of columns dropped in test dataframe 23


In [26]:
train_df.columns

Index(['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_43',
       'D_44',
       ...
       'D_131', 'D_133', 'R_28', 'D_139', 'D_140', 'D_141', 'D_143', 'D_144',
       'D_145', 'target'],
      dtype='object', length=166)

In [27]:
# B_* and D_* categorical features that were parsed as numbers. They are cast to
# strings so that select_dtypes('object') later routes them to the categorical pipeline.
numeric_coded_categoricals = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_68"]


def stringify_keep_missing(frame, columns):
    """Return a copy of `frame` with `columns` cast to str while keeping NaN as NaN.

    A plain ``astype('str')`` turns missing values into the literal string 'nan',
    which hides them from SimpleImputer and makes 'nan' a category of its own.
    """
    selected = frame[columns]
    # Cast everything, then blank out the cells that were missing before the cast.
    as_text = selected.astype(str).where(selected.notna())
    return frame.assign(**{name: as_text[name] for name in columns})


train_df = stringify_keep_missing(train_df, numeric_coded_categoricals)
test_df = stringify_keep_missing(test_df, numeric_coded_categoricals)

In [28]:
# Separate the label column from the feature matrix.
y = train_df['target']
X = train_df.drop(columns=['target'])

In [29]:
print("Shape of X", X.shape)

Shape of X (100000, 165)


In [30]:
print("Shape of y", y.shape)

Shape of y (100000,)


In [31]:
# Column names by dtype: object columns feed the categorical pipeline, numeric columns the numerical one.
categorical = X.select_dtypes(include='object').columns.tolist()
numerical = X.select_dtypes(include='number').columns.tolist()

In [32]:
# Categorical branch: fill missing values with the most frequent category,
# one-hot encode (categories unseen during fit are ignored), then standardise.
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
cat_pipe = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('encoder', cat_encoder),
    ('scaler', StandardScaler()),
])

print(cat_pipe)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder',
                 OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('scaler', StandardScaler())])


In [33]:
# Numerical branch: fill missing values with the column mean, then standardise.
num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
num_pipe = Pipeline(steps=[
    ('imputer', num_imputer),
    ('scaler', StandardScaler()),
])
print(num_pipe)

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])


In [34]:
# Route each group of columns through its own preprocessing branch.
column_branches = [
    ('cat', cat_pipe, categorical),
    ('num', num_pipe, numerical),
]
preprocess = ColumnTransformer(transformers=column_branches)

print(preprocess)

ColumnTransformer(transformers=[('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False)),
                                                 ('scaler', StandardScaler())]),
                                 ['D_63', 'D_64', 'D_68', 'B_30', 'B_38',
                                  'D_114', 'D_116', 'D_117', 'D_120',
                                  'D_126']),
                                ('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3',
                              

In [35]:
# Hold out 25% of the rows for validation, stratified on the target so both
# splits keep the same class ratio; random_state fixes the shuffle.
# NOTE(review): the split is per row. Test-time prediction later keeps one row per
# customer_ID, which suggests customers have several rows; if so, the same customer
# can appear in both splits here. Confirm whether a group-wise split is needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [36]:
print("Shape of X_train", X_train.shape)

Shape of X_train (75000, 165)


In [37]:
print("Shape of X_test", X_test.shape)

Shape of X_test (25000, 165)


In [38]:
print("Shape of y_train", y_train.shape)

Shape of y_train (75000,)


In [39]:
print("Shape of y_test", y_test.shape)

Shape of y_test (25000,)


In [40]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return 0.5 * (G + D) for a set of labels and prediction scores.

    G is the normalized weighted Gini coefficient and D is the fraction of
    positives found among the highest-scored rows that make up 4% of the total
    weight. Rows with ``target == 0`` weigh 20, every other row weighs 1.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column; rows are ranked by it, descending.

    Returns
    -------
    float
        The mean of G and D.

    Raises
    ------
    ValueError
        If the two frames have different indexes and different lengths, so the
        rows cannot be paired.
    """
    # pd.concat pairs rows by index label. A prediction frame built from a bare
    # array carries a fresh RangeIndex that does not match the labels' index;
    # the concat would then silently produce an outer join full of NaN.
    if not y_true.index.equals(y_pred.index):
        if len(y_true) != len(y_pred):
            raise ValueError(
                "y_true and y_pred must have the same number of rows "
                "(got %d and %d)" % (len(y_true), len(y_pred))
            )
        if not y_pred.index.isin(y_true.index).all():
            # Labels do not correspond at all: pair the rows by position.
            y_pred = y_pred.copy()
            y_pred.index = y_true.index

    def ranked_with_weights(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, rank by score, and attach the per-row weight.
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = np.where(df['target'] == 0, 20, 1)
        return df

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = ranked_with_weights(y_true, y_pred)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Normalise by the Gini obtained when the labels themselves are the scores.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [42]:
def model_score(model_name):
    """Fit the global `pipe` on the training split and print its scores.

    Prints the fitted pipeline, its `score` on the training and validation
    splits, and the Amex metric on both splits.

    Parameters
    ----------
    model_name : str
        Label used in the printed header only; the estimator that is actually
        trained is whatever `pipe` currently holds.

    Notes
    -----
    Reads the module-level `pipe`, `X_train`, `y_train`, `X_test` and `y_test`.
    """
    print("Training and Evaluation using", model_name)
    print("preprocess - Categorical: Missing value Impute, OneHotEncoding and Scaling")
    print("preprocess - Numerical: Missing value Impute and Scaling")
    model = pipe.fit(X_train, y_train)
    print (model)
    print("model training score: %.3f" % pipe.score(X_train, y_train))
    print("model validation score: %.3f" % pipe.score(X_test, y_test))
    for split_name, features, labels in (("Training", X_train, y_train),
                                         ("Validation", X_test, y_test)):
        # The metric ranks rows by score, so feed it the positive-class
        # probability rather than hard 0/1 labels. The frame reuses the labels'
        # index so amex_metric pairs each score with its own label.
        scores = pd.DataFrame({'prediction': pipe.predict_proba(features)[:, 1]},
                              index=labels.index)
        print("Amex Evaluation Metric - %s: %.3f"
              % (split_name, amex_metric(pd.DataFrame(labels), scores)))
    print("#######################################################################")

In [43]:
# function for display of cross validation score
def model_cross_validation_score(model_name):
    """Cross-validate the global `pipe` on `X`, `y` and print mean train/validation scores.

    Uses 3-fold stratified cross-validation repeated twice and reports the mean
    accuracy, precision, recall, f1 and ROC AUC over all folds.

    Parameters
    ----------
    model_name : str
        Label used in the printed header only.

    Notes
    -----
    Reads the module-level `pipe`, `X` and `y`.
    """
    print("Training and Evaluation with Cross Validation using",model_name)
    # classification metrics to collect on every fold
    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
    # The fitted estimators are never used, so they are not requested; this
    # avoids keeping every fitted pipeline in memory.
    scores = cross_validate(pipe, X, y, scoring=scoring, cv=cv, n_jobs=-1, return_train_score=True)
    template = 'Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f},f1-score: {:.2f}, ROC AUC: {:.2f}'
    for label, prefix in (('Training', 'train'), ('Validation', 'test')):
        means = [np.mean(scores['%s_%s' % (prefix, metric)]) for metric in scoring]
        print('%s Score: ' % label + template.format(*means))
    print("#######################################################################")

In [44]:
# function for display of model score via RandomizedSearchCV
def model_random_search_score(model_name):
    """Fit the global `random_search` and print its best estimator and scores.

    Prints the best estimator with its best cross-validated score, followed by
    the Amex metric of that estimator on the training and validation splits.

    Parameters
    ----------
    model_name : str
        Label used in the printed messages only.

    Notes
    -----
    Reads the module-level `random_search`, `X_train`, `y_train`, `X_test`
    and `y_test`.
    """
    print("Training and Evaluation with RandomizedSearchCV using",model_name)
    random_search.fit(X_train,y_train)
    model = random_search.best_estimator_
    score = random_search.best_score_
    print ("Best Estimator for", model_name,"is", model,"with best score as",score)
    for split_name, features, labels in (("Training", X_train, y_train),
                                         ("Validation", X_test, y_test)):
        # The metric ranks rows by score, so feed it the positive-class
        # probability rather than hard 0/1 labels. The frame reuses the labels'
        # index so amex_metric pairs each score with its own label.
        scores = pd.DataFrame({'prediction': model.predict_proba(features)[:, 1]},
                              index=labels.index)
        print("Amex Evaluation Metric - %s: %.3f"
              % (split_name, amex_metric(pd.DataFrame(labels), scores)))
    print("#######################################################################")

In [45]:
# Modelling pipeline: preprocessing -> resampling -> feature selection ->
# dimensionality reduction -> classifier. The final 'model_estimator' step is
# swapped for other classifiers later via pipe.set_params.
steps = [
        ('preprocess', preprocess),
        ('over_sampler', SMOTE(random_state=42)),
        # seeded like every other stochastic step so repeated runs give the same result
        ('under_sampler', RandomUnderSampler(random_state=42)),
        ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=-1))),
        ('dimension_reduction', PCA(n_components='mle', random_state=42)),
        ('model_estimator', RandomForestClassifier(random_state=42))
    ]
pipe = Pipeline(steps, verbose=True)

# Random Forest

In [46]:
# using custom function to display model training and validation score
model_score("RandomForestClassifier")

Training and Evaluation using RandomForestClassifier
preprocess - Categorical: Missing value Impute, OneHotEncoding and Scaling
preprocess - Numerical: Missing value Impute and Scaling
[Pipeline] ........ (step 1 of 6) Processing preprocess, total=   1.4s
[Pipeline] ...... (step 2 of 6) Processing over_sampler, total=   8.8s
[Pipeline] ..... (step 3 of 6) Processing under_sampler, total=   0.1s
[Pipeline] . (step 4 of 6) Processing feature_selection, total=  10.4s
[Pipeline]  (step 5 of 6) Processing dimension_reduction, total=   0.5s
[Pipeline] ... (step 6 of 6) Processing model_estimator, total= 2.0min
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                   

# XGBoost

In [None]:
pipe.set_params(model_estimator=XGBClassifier())
model_score("XGBClassifier")

In [None]:
#using custom function to display cross validation score
model_cross_validation_score("XGBClassifier")

# LightGBM

In [None]:
pipe.set_params(model_estimator=LGBMClassifier())
model_score("LGBMClassifier")

In [None]:
model_cross_validation_score("LGBMClassifier")

# CatBoost

In [None]:
# Swap the final pipeline step for CatBoost, then fit and score it.
# NOTE(review): iterations=3 with learning_rate=1 looks like a quick smoke-test
# setting rather than a tuned configuration; confirm before comparing models.
pipe.set_params(model_estimator=CatBoostClassifier(iterations=3,learning_rate=1,depth=6))
model_score("CatBoostClassifier")

In [None]:
model_cross_validation_score("CatBoostClassifier")

In [None]:
# Search space for RandomizedSearchCV over the pipeline's final step (XGBClassifier).
param_random = {
    'model_estimator': [XGBClassifier()],
    'model_estimator__learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'model_estimator__max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
    'model_estimator__min_child_weight': [1, 3, 5, 7],
    'model_estimator__gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
    'model_estimator__colsample_bytree': [0.3, 0.4, 0.5, 0.7],
}
# A single sampled candidate (n_iter=1), scored by ROC AUC with 3-fold CV.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_random,
    n_iter=1,
    cv=3,
    scoring='roc_auc',
    verbose=3,
    random_state=42,
)

In [None]:
#using custom function to display best estimator and score
model_random_search_score("XGBClassifier")

In [None]:
# Search space for RandomizedSearchCV over the pipeline's final step (LGBMClassifier).
param_random = {
    'model_estimator': [LGBMClassifier()],
    'model_estimator__num_leaves': [20, 40, 60, 80, 100],
    'model_estimator__min_child_samples': [5, 10, 15],
    'model_estimator__max_depth': [-1, 5, 10, 20],
    'model_estimator__learning_rate': [0.05, 0.1, 0.2],
    'model_estimator__reg_alpha': [0, 0.01, 0.03],
}
# A single sampled candidate (n_iter=1), scored by ROC AUC with 3-fold CV.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_random,
    n_iter=1,
    cv=3,
    scoring='roc_auc',
    verbose=3,
    random_state=42,
)

Best Estimator for LGBMClassifier for this training sample

In [None]:
#using custom function to display best estimator and score
model_random_search_score("LGBMClassifier")

In [None]:
test_df_new=test_df.reset_index()

In [None]:
del test_df

In [None]:
X_test_predict = test_df_new.groupby('customer_ID').tail(1)

In [None]:
X_test_predict.shape

In [None]:
# Use customer_ID as the index so it stays out of the feature columns.
X_test_predict = X_test_predict.set_index('customer_ID')

In [None]:
model = random_search.best_estimator_

In [None]:
# prediction on test dataset: keep the positive-class probability (column 1 of
# predict_proba). The Amex metric ranks customers by score, so hard 0/1 labels
# from predict() would discard the ordering it evaluates.
y_test_pred = model.predict_proba(X_test_predict)[:, 1]

In [None]:
# Generate the submission file: one row per customer_ID (taken from the index of
# X_test_predict) with its prediction, written to submission.csv with a header
# row and without the DataFrame's own index.
output = pd.DataFrame({'customer_ID': X_test_predict.index,'prediction': y_test_pred})
output.to_csv('submission.csv', index=False, header=True)