In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc
   
import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv
/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr


# Training Data

In [2]:
# NOTE(review): `feather` is imported but not referenced in this cell; the file
# is read through pandas.
import feather

train_data = pd.read_feather('/kaggle/input/amexfeather/train_data.ftr')

# Keep only the last row of every customer (in file order) and index the result
# by customer_ID, sorted by that ID.
# Assumes rows are stored in statement order within each customer, so the last
# row is the latest statement -- TODO confirm.
train_dataset = (
    train_data
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

In [3]:
# The statement-level frame is no longer needed once the per-customer frame has
# been derived from it; drop the reference and run a garbage-collection pass.
del train_data
gc.collect()

23

In [4]:
# Preview the first rows of the per-customer training frame.
train_dataset.head()

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2018-03-13,0.93457,0.009117,0.009384,1.007812,0.006104,0.13501,0.001604,0.007175,,...,,,0.007187,0.004234,0.005085,,0.00581,0.00297,0.00853,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,2018-03-25,0.880371,0.178101,0.034698,1.003906,0.006912,0.165527,0.00555,0.00507,,...,,,0.002981,0.007481,0.007874,,0.003284,0.00317,0.008514,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,2018-03-12,0.880859,0.009705,0.004284,0.8125,0.006451,,0.003796,0.007195,,...,,,0.007381,0.006622,0.000965,,0.002201,0.000834,0.003445,0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,2018-03-29,0.621582,0.001082,0.012566,1.005859,0.007828,0.287842,0.004532,0.009941,,...,,,0.002705,0.006184,0.001899,,0.008186,0.005558,0.002983,0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,2018-03-30,0.87207,0.005573,0.007679,0.815918,0.001247,,0.000231,0.005527,,...,,,0.002974,0.004162,0.005764,,0.008156,0.006943,0.000905,0


# FEATURE ENGINEERING 

## * Check Null values

In [5]:
# Missing-value report for the training frame: one row per column with its
# name, number of nulls, percentage of nulls (2 decimals) and dtype.
train_null_counts = train_dataset.isnull().sum()
NaN_Val = np.array(train_null_counts)
NaN_prec = np.array((train_null_counts * 100 / len(train_dataset)).round(2))

# Build the report row-wise (one row per attribute), then flip it so that each
# feature becomes a row.
NaN_Col = pd.DataFrame(
    [
        np.array(list(train_dataset.columns)),
        NaN_Val,
        NaN_prec,
        np.array(list(train_dataset.dtypes)),
    ],
    index=['Features', 'Num of Missing values', 'Percentage', 'DataType'],
).T

# Show every row of the report instead of pandas' truncated view.
pd.set_option('display.max_rows', None)
NaN_Col

Unnamed: 0,Features,Num of Missing values,Percentage,DataType
0,S_2,0,0.0,datetime64[ns]
1,P_2,2969,0.65,float16
2,D_39,0,0.0,float16
3,B_1,0,0.0,float16
4,B_2,31,0.01,float16
5,R_1,0,0.0,float16
6,S_3,84970,18.52,float16
7,D_41,31,0.01,float16
8,B_3,31,0.01,float16
9,D_42,399003,86.95,float16


## * Drop columns having high missing rate

In [6]:
# Remove columns that will not be used as features.
# S_2 is a datetime column with no missing values in the null report; D_42 is
# ~87% missing there. The remaining names were presumably chosen from the same
# report for their high missing rate -- verify against the full report.
train_dataset = train_dataset.drop(
    columns=[
        'S_2',
        'D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88',
        'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132',
        'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142',
    ],
)

In [7]:
# Count rows that are exact duplicates of an earlier row across all columns.
col = train_dataset.columns
duplicate_row_mask = train_dataset[col].duplicated()
duplicate_row_mask.sum()

0

In [8]:
# Summary statistics of the numeric columns.
# NOTE(review): the rendered output shows NaN means and 0.0 standard deviations
# for the float16 columns -- presumably half-precision overflow in the
# reductions; confirm before relying on the mean/std rows.
train_dataset.describe()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_43,D_44,...,D_131,D_133,R_28,D_139,D_140,D_141,D_143,D_144,D_145,target
count,455944.0,458913.0,458913.0,458882.0,458913.0,373943.0,458882.0,458882.0,324591.0,436618.0,...,456083.0,458913.0,458913.0,456083.0,458913.0,456083.0,456083.0,458913.0,456083.0,458913.0
mean,,,0.0,,0.0,,0.0,,0.0,0.0,...,0.0,0.0,0.0,,0.0,,,0.0,0.0,0.258934
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43805
min,-0.458984,5.960464e-08,-3.570312,1.788139e-07,5.960464e-08,-0.508789,5.960464e-08,5.960464e-08,2e-06,0.0,...,0.0,0.0,5.960464e-08,0.0,0.0,1.192093e-07,0.0,0.0,0.0,0.0
25%,0.449219,0.00478363,0.008987,0.07092285,0.00295639,0.127563,0.003007889,0.005485535,0.043335,0.003994,...,0.002823,0.002754,0.002504349,0.003046,0.002552,0.003026962,0.003033,0.002758,0.003031,0.0
50%,0.682617,0.009559631,0.033234,0.8134766,0.00592041,0.164917,0.006011963,0.01083374,0.092407,0.008034,...,0.005646,0.00552,0.004997253,0.006077,0.005119,0.00605011,0.006065,0.005501,0.006073,0.0
75%,0.862305,0.2666016,0.153687,1.001953,0.008888245,0.264893,0.008995056,0.2106934,0.195068,0.134155,...,0.008461,0.008263,0.007499695,0.009102,0.007675,0.009101868,0.009094,0.008255,0.009102,1.0
max,1.009766,5.0,1.324219,1.009766,2.759766,3.994141,8.992188,1.59668,9.1875,4.003906,...,1.491211,1.509766,1.009766,1.009766,1.009766,1.339844,1.009766,1.34375,4.828125,1.0


## * Fill null values with median in numerical features

In [9]:
# Numerical columns whose missing values are replaced by the column median.
selected_col = np.array([
    'P_2', 'S_3', 'B_2', 'D_41', 'D_43', 'B_3', 'D_44', 'D_45', 'D_46', 'D_48',
    'D_50', 'D_53', 'S_7', 'D_56', 'S_9', 'B_6', 'B_8', 'D_52', 'P_3', 'D_54',
    'D_55', 'B_13', 'D_59', 'D_61', 'B_15', 'D_62', 'B_16', 'B_17', 'D_77', 'B_19',
    'B_20', 'D_69', 'B_22', 'D_70', 'D_72', 'D_74', 'R_7', 'B_25', 'B_26', 'D_78',
    'D_79', 'D_80', 'B_27', 'D_81', 'R_12', 'D_82', 'D_105', 'S_27', 'D_83', 'R_14',
    'D_84', 'D_86', 'R_20', 'B_33', 'D_89', 'D_91', 'S_22', 'S_23', 'S_24', 'S_25',
    'S_26', 'D_102', 'D_103', 'D_104', 'D_107', 'B_37', 'R_27', 'D_109', 'D_112', 'B_40',
    'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_128',
    'D_129', 'B_41', 'D_130', 'D_131', 'D_133', 'D_139', 'D_140', 'D_141', 'D_143', 'D_144',
    'D_145',
])

# Fill each listed column with its own median, one column at a time.
for col in selected_col:
    column_median = train_dataset[col].median()
    train_dataset[col] = train_dataset[col].fillna(column_median)

## * Filling missing values with mode in categorical features

In [10]:
# Categorical columns whose missing values are replaced by the most frequent
# value of the column.
selcted_col2 = np.array([
    'D_68', 'B_30', 'B_38', 'D_64', 'D_114',
    'D_116', 'D_117', 'D_120', 'D_126',
])

for col2 in selcted_col2:
    # mode() can return several values on ties; the first one is used.
    most_frequent = train_dataset[col2].mode()[0]
    train_dataset[col2] = train_dataset[col2].fillna(most_frequent)

In [11]:
# Print the remaining null count of every column as plain text to confirm that
# the imputation left no missing values.
print(train_dataset.isnull().sum().to_string())

P_2       0
D_39      0
B_1       0
B_2       0
R_1       0
S_3       0
D_41      0
B_3       0
D_43      0
D_44      0
B_4       0
D_45      0
B_5       0
R_2       0
D_46      0
D_47      0
D_48      0
B_6       0
B_7       0
B_8       0
D_50      0
D_51      0
B_9       0
R_3       0
D_52      0
P_3       0
B_10      0
D_53      0
S_5       0
B_11      0
S_6       0
D_54      0
R_4       0
S_7       0
B_12      0
S_8       0
D_55      0
D_56      0
B_13      0
R_5       0
D_58      0
S_9       0
B_14      0
D_59      0
D_60      0
D_61      0
B_15      0
S_11      0
D_62      0
D_63      0
D_64      0
D_65      0
B_16      0
B_17      0
B_18      0
B_19      0
B_20      0
D_68      0
S_12      0
R_6       0
S_13      0
B_21      0
D_69      0
B_22      0
D_70      0
D_71      0
D_72      0
S_15      0
B_23      0
P_4       0
D_74      0
D_75      0
B_24      0
R_7       0
D_77      0
B_25      0
B_26      0
D_78      0
D_79      0
R_8       0
S_16      0
D_80      0
R_10      0
R_11

In [12]:
# (rows, columns) of the training frame after dropping columns and imputing.
train_dataset.shape

(458913, 166)

## Do similar steps for Testing Data

In [13]:
test_dataset_ = pd.read_feather('/kaggle/input/amexfeather/test_data.ftr')

# Keep the latest statement features for each customer: the last row of every
# customer group (in file order), indexed by customer_ID and sorted by that ID.
# Assumes rows are stored in statement order within each customer -- TODO confirm.
test_dataset = (
    test_dataset_
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

In [14]:
# The statement-level test frame is no longer needed once the per-customer
# frame has been derived; drop the reference and run a garbage-collection pass.
del test_dataset_
gc.collect()

23

In [15]:
# Preview the first rows of the per-customer test frame.
test_dataset.head()

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,2019-10-12,0.568848,0.121399,0.01078,1.009766,0.006924,0.149414,0.000396,0.003576,0.10376,...,,,,0.005913,0.00125,0.006542,,0.009163,0.003691,0.00322
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,2019-04-15,0.841309,0.126465,0.016556,1.008789,0.009712,0.112183,0.006191,0.011383,,...,,,,0.004345,0.000866,0.009117,,0.002197,0.000247,0.007778
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,2019-10-16,0.697754,0.002724,0.001485,0.810059,0.002621,0.166138,0.004887,0.015945,,...,,,,1.000977,0.008896,0.895996,0.150146,1.009766,0.457764,0.092041
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,2019-04-22,0.513184,0.324707,0.149536,0.205688,0.002277,0.181152,0.005814,0.498535,,...,,,,1.007812,0.003754,0.919922,0.255371,1.007812,0.500977,0.182983
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,2019-10-22,0.254395,0.768066,0.563477,0.038025,0.50293,0.168335,0.009483,0.831055,,...,,,,0.006622,0.00114,0.009529,,0.009407,0.001557,0.000525


In [16]:
# Missing-value report for the test frame: one row per column with its name,
# number of nulls, percentage of nulls (2 decimals) and dtype.
test_null_counts = test_dataset.isnull().sum()
NaN_Val2 = np.array(test_null_counts)
NaN_prec2 = np.array((test_null_counts * 100 / len(test_dataset)).round(2))

# Build the report row-wise (one row per attribute), then flip it so that each
# feature becomes a row.
NaN_Col2 = pd.DataFrame(
    [
        np.array(list(test_dataset.columns)),
        NaN_Val2,
        NaN_prec2,
        np.array(list(test_dataset.dtypes)),
    ],
    index=['Features', 'Num of Missing values', 'Percentage', 'DataType'],
).T

# Show every row of the report instead of pandas' truncated view.
pd.set_option('display.max_rows', None)

NaN_Col2

Unnamed: 0,Features,Num of Missing values,Percentage,DataType
0,S_2,0,0.0,datetime64[ns]
1,P_2,4784,0.52,float16
2,D_39,0,0.0,float16
3,B_1,0,0.0,float16
4,B_2,43,0.0,float16
5,R_1,0,0.0,float16
6,S_3,144280,15.6,float16
7,D_41,43,0.0,float16
8,B_3,43,0.0,float16
9,D_42,827578,89.5,float16


In [17]:
# Remove from the test frame the same columns that were removed from the
# training frame (the S_2 datetime column plus the columns dropped for their
# missing rate), so both frames keep an identical feature set.
test_dataset = test_dataset.drop(
    columns=[
        'S_2',
        'D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88',
        'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132',
        'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142',
    ],
)

In [18]:
# Numerical test columns whose missing values are imputed. Compared with the
# training list this one additionally contains S_12 and S_17.
selected_column = np.array([
    'P_2', 'S_3', 'B_2', 'D_41', 'D_43', 'B_3', 'D_44', 'D_45', 'D_46', 'D_48',
    'D_50', 'D_53', 'S_7', 'D_56', 'S_9', 'S_12', 'S_17', 'B_6', 'B_8', 'D_52', 'P_3', 'D_54',
    'D_55', 'B_13', 'D_59', 'D_61', 'B_15', 'D_62', 'B_16', 'B_17', 'D_77', 'B_19',
    'B_20', 'D_69', 'B_22', 'D_70', 'D_72', 'D_74', 'R_7', 'B_25', 'B_26', 'D_78',
    'D_79', 'D_80', 'B_27', 'D_81', 'R_12', 'D_82', 'D_105', 'S_27', 'D_83', 'R_14',
    'D_84', 'D_86', 'R_20', 'B_33', 'D_89', 'D_91', 'S_22', 'S_23', 'S_24', 'S_25',
    'S_26', 'D_102', 'D_103', 'D_104', 'D_107', 'B_37', 'R_27', 'D_109', 'D_112', 'B_40',
    'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_128',
    'D_129', 'B_41', 'D_130', 'D_131', 'D_133', 'D_139', 'D_140', 'D_141', 'D_143', 'D_144',
    'D_145',
])

# Impute with the TRAINING medians rather than medians computed on the test
# frame: preprocessing statistics are learned on the training data only and then
# applied unchanged to the test data, exactly as the ordinal encoder is fitted
# on train and merely applied to test. This keeps a given raw value mapped to
# the same model input in both frames.
for column in selected_column:
    train_median = train_dataset[column].median()
    test_dataset[column] = test_dataset[column].fillna(train_median)

In [19]:
# Categorical test columns whose missing values are replaced by the most
# frequent value of the column.
# NOTE(review): the training list (selcted_col2) also contains D_64, which is
# absent here -- confirm that this is intentional.
selected_column2 = np.array([
    'D_68', 'B_30', 'B_38', 'D_114',
    'D_116', 'D_117', 'D_120', 'D_126',
])

for column2 in selected_column2:
    # mode() can return several values on ties; the first one is used.
    most_frequent = test_dataset[column2].mode()[0]
    test_dataset[column2] = test_dataset[column2].fillna(most_frequent)

In [20]:
# Print the remaining null count of every test column as plain text to confirm
# that the imputation left no missing values.
print(test_dataset.isnull().sum().to_string())

P_2      0
D_39     0
B_1      0
B_2      0
R_1      0
S_3      0
D_41     0
B_3      0
D_43     0
D_44     0
B_4      0
D_45     0
B_5      0
R_2      0
D_46     0
D_47     0
D_48     0
B_6      0
B_7      0
B_8      0
D_50     0
D_51     0
B_9      0
R_3      0
D_52     0
P_3      0
B_10     0
D_53     0
S_5      0
B_11     0
S_6      0
D_54     0
R_4      0
S_7      0
B_12     0
S_8      0
D_55     0
D_56     0
B_13     0
R_5      0
D_58     0
S_9      0
B_14     0
D_59     0
D_60     0
D_61     0
B_15     0
S_11     0
D_62     0
D_63     0
D_64     0
D_65     0
B_16     0
B_17     0
B_18     0
B_19     0
B_20     0
D_68     0
S_12     0
R_6      0
S_13     0
B_21     0
D_69     0
B_22     0
D_70     0
D_71     0
D_72     0
S_15     0
B_23     0
P_4      0
D_74     0
D_75     0
B_24     0
R_7      0
D_77     0
B_25     0
B_26     0
D_78     0
D_79     0
R_8      0
S_16     0
D_80     0
R_10     0
R_11     0
B_27     0
D_81     0
D_82     0
S_17     0
R_12     0
B_28     0
R_13     0

In [21]:
# (rows, columns) of the test frame after dropping columns and imputing; it has
# one column fewer than the training frame, which also carries `target`.
test_dataset.shape

(924621, 165)

# Convert Categorical variables to numerical

In [22]:
from sklearn.preprocessing import OrdinalEncoder

# Columns treated as categorical and replaced by ordinal codes.
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117',
    'D_120', 'D_126', 'D_63', 'D_64', 'D_68',
]

# Learn the category -> code mapping from the training frame only, then apply
# that same mapping to both frames.
# NOTE(review): with the encoder's default settings, a category that appears
# only in the test frame is expected to raise during transform -- confirm that
# the test categories are a subset of the training ones.
enc = OrdinalEncoder()
enc.fit(train_dataset[categorical_cols])

train_dataset[categorical_cols] = enc.transform(train_dataset[categorical_cols])
test_dataset[categorical_cols] = enc.transform(test_dataset[categorical_cols])

In [None]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder

# categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_cols)], remainder='passthrough')
# train_dataset = ct.fit_transform(train_dataset)
# test_dataset = ct.fit_transform(test_dataset)

# Remove highly correlated features

In [23]:
# Feature-only view of the training frame; the label must not take part in the
# redundancy check.
train_dataset_without_target = train_dataset.drop(["target"], axis=1)

cor_matrix = train_dataset_without_target.corr()

# A feature is redundant when it is strongly correlated with a feature that
# comes BEFORE it in column order, so the earlier member of each pair is kept.
# The absolute value is used because a strong negative correlation carries the
# same redundancy as a strong positive one; a signed comparison would miss it.
CORRELATION_THRESHOLD = 0.9

# Strict lower triangle (k=-1): pairs (i, j) with j < i, excluding the diagonal.
earlier_column_mask = np.tril(np.ones(cor_matrix.shape, dtype=bool), k=-1)
strong_pair_mask = (cor_matrix.abs() > CORRELATION_THRESHOLD).to_numpy() & earlier_column_mask

# Row i is flagged when at least one earlier column exceeds the threshold.
# NaN correlations compare as False and therefore never flag a column.
col_core = set(cor_matrix.columns[strong_pair_mask.any(axis=1)])
col_core

{'B_11',
 'B_13',
 'B_15',
 'B_23',
 'B_33',
 'B_37',
 'D_104',
 'D_119',
 'D_141',
 'D_143',
 'D_74',
 'D_75',
 'D_77',
 'S_24',
 'S_7'}

In [24]:
# Remove the columns flagged as highly correlated from both frames so that
# their feature sets stay identical.
train_dataset = train_dataset.drop(columns=list(col_core))
test_dataset = test_dataset.drop(columns=list(col_core))

In [25]:
# (rows, columns) of the training frame after removing the correlated columns.
train_dataset.shape

(458913, 151)

In [26]:
# (rows, columns) of the test frame after removing the correlated columns.
test_dataset.shape

(924621, 150)

In [27]:
# Every column except the label is used as a model input.
num_columns = [name for name in train_dataset.columns if name != "target"]

# Feature matrix and label vector for training.
y = train_dataset['target']
X = train_dataset[num_columns]

print(f"X shape is = {X.shape}" )
print(f"Y shape is = {y.shape}" )

X shape is = (458913, 150)
Y shape is = (458913,)


# Split the dataset into training and validation sets

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the customers for validation. The split is stratified on the
# label so both parts keep the same class ratio, and seeded for reproducibility.
# Note that x_test / y_test are this validation hold-out, not the competition
# test data.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"X_train shape is = {x_train.shape}" )
print(f"Y_train shape is = {y_train.shape}" )
print(f"X_test shape is = {x_test.shape}" )
print(f"Y_test shape is = {y_test.shape}" )

X_train shape is = (367130, 150)
Y_train shape is = (367130,)
X_test shape is = (91783, 150)
Y_test shape is = (91783,)


# KNN

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
# classifier.fit(x_train, y_train)

# SVM

In [None]:
# from sklearn.svm import SVC
# classifier = SVC(kernel = 'rbf', random_state = 0, probability=True)
# classifier.fit(x_train, y_train)

In [None]:
# from xgboost import XGBClassifier
# params = {
#             'objective': 'binary:logistic', 
#             'tree_method': 'gpu_hist', 
#             'max_depth': 7,
#             'subsample':0.88,
#             'colsample_bytree': 0.5,
#             'gamma':1.5,
#             'min_child_weight':8,
#             'lambda':70,
#             'eta':0.03,
# #             'scale_pos_weight': scale_pos_weight,
#     }
# classifier = XGBClassifier(params,num_boost_round=2600,)
# classifier.fit(x_train, y_train)

# [[63547  4470]
#  [ 4792 18974]]
# 0.8990880664175283

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier. n_estimators is only an upper bound when
# the model is fitted with early stopping.
classifier = XGBClassifier(
    learning_rate=0.03,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.9,
    colsample_bytree=0.45,
    reg_alpha=0.001,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    reg_lambda=10,
    seed=27,
)

In [None]:
# Fit on the training split, monitoring AUC on the hold-out split and stopping
# once it has not improved for 100 rounds; progress is logged every 200 rounds.
# NOTE(review): the same (x_test, y_test) hold-out is later used to report the
# confusion matrix and accuracy, so those figures are measured on data that
# already influenced the stopping point -- treat them as optimistic.
classifier.fit(
    x_train,
    y_train,
    early_stopping_rounds=100,
    eval_set=[(x_test, y_test)],
    eval_metric='auc',
    verbose=200,
)

In [None]:
# import lightgbm as lgb

# d_train = lgb.Dataset(x_train, label=y_train, categorical_feature = categorical_cols)

# params = {'objective': 'binary','n_estimators': 1200,'metric': 'binary_logloss','boosting': 'gbdt','num_leaves': 90,'reg_lambda' : 50,'colsample_bytree': 0.19,'learning_rate': 0.03,'min_child_samples': 2400,'max_bins': 511,'seed': 42,'verbose': -1}

# # trained model with 100 iterations
# classifier = lgb.train(params, d_train, 100)

In [33]:
# from sklearn.ensemble import RandomForestClassifier

# classifier = RandomForestClassifier(n_estimators=5, max_depth=2, random_state=0)
# classifier.fit(x_train, y_train)

RandomForestClassifier(max_depth=2, n_estimators=5, random_state=0)

In [34]:
# Hard class labels predicted for the hold-out split.
predictions = classifier.predict(x_test)
predictions

array([1, 0, 0, ..., 0, 0, 0])

In [35]:
# Per-class probabilities for the hold-out split; the output has one row per
# sample and two columns that sum to 1.
testing = classifier.predict_proba(x_test)
testing

array([[0.48533984, 0.51466016],
       [0.82910746, 0.17089254],
       [0.94512735, 0.05487265],
       ...,
       [0.94512735, 0.05487265],
       [0.94512735, 0.05487265],
       [0.55945064, 0.44054936]])

In [36]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix (rows: true class, columns: predicted class) and plain
# accuracy of the hard predictions on the hold-out split.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)
accuracy_score(y_true=y_test, y_pred=predictions)

[[63781  4236]
 [ 9033 14733]]


0.8554307442554722

In [37]:
# Per-class probabilities for every customer of the competition test frame,
# using the same feature columns (num_columns) the model was trained on. Rows
# follow the row order of test_dataset.
y_pred = classifier.predict_proba(test_dataset[num_columns])
y_pred

array([[0.88932158, 0.11067842],
       [0.94512735, 0.05487265],
       [0.86124708, 0.13875292],
       ...,
       [0.49920214, 0.50079786],
       [0.74943835, 0.25056165],
       [0.83901991, 0.16098009]])

In [None]:
# Second-column probability of the first test row -- presumably the probability
# of target == 1 (the submission below writes this column).
y_pred[0,1]

# K-fold cross validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training split, scored with
# the estimator's default score.
# NOTE(review): this refits the model once per fold -- expect a long runtime
# with the configured number of estimators.
accuracies = cross_val_score(estimator=classifier, X=x_train, y=y_train, cv=10)

mean_accuracy_pct = accuracies.mean()*100
std_accuracy_pct = accuracies.std()*100
print("Accuracy: {:.2f} %".format(mean_accuracy_pct))
print("Standard Deviation: {:.2f} %".format(std_accuracy_pct))


# Output

In [38]:
# The sample submission defines which customer IDs are expected and in which
# order.
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')

# y_pred rows follow the row order of test_dataset, so key every prediction by
# the customer_ID it was computed for instead of pairing it positionally with
# the IDs of a different file. Column 1 is the probability written out.
scored = pd.Series(
    y_pred[:, 1],
    index=np.asarray(test_dataset.index),
    name='prediction',
)

# Look predictions up by ID in the sample file's order. When both orders already
# agree this yields the same rows as a positional pairing; an ID missing from
# the scored frame shows up as NaN instead of silently receiving another
# customer's prediction.
output = (
    scored
    .reindex(sample_dataset['customer_ID'])
    .rename_axis('customer_ID')
    .reset_index()
)
output.to_csv('submission.csv', index=False)

In [None]:
# y_predictions = classifier.predict(test_dataset[num_columns])
# y_predictions