In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [56]:
dataset =  pd.read_csv(r"C:\Users\Duong Nguyen\Desktop\final_project\home-credit-default-risk\results\1_9_combined_dataset.csv")

In [57]:
dataset.columns = list(map(lambda x: str(x).replace(" ", "_").replace("-", "_").replace("_/_", "_").upper(), dataset.columns))
import re
dataset = dataset.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [58]:
#create stratifying field
strat_field = 'AMT_INCOME_TOTAL'

strat_column = strat_field + "_GROUP"

dataset[strat_column] = dataset[strat_field].apply(lambda x:int(  round(x)))
dataset[strat_column] = dataset[strat_column].apply(lambda x: 14 if x==15 else x)
dataset[strat_column].value_counts().sort_index()

10        84
11     64238
12    238024
13     42160
14       104
Name: AMT_INCOME_TOTAL_GROUP, dtype: int64

In [59]:
train_df = dataset[dataset.TARGET.isna() == False]
test_df = dataset[dataset.TARGET.isna() == True]

In [60]:
X_train = train_df.drop(["TARGET", "SK_ID_CURR",strat_field], axis = 1)
X_test = train_df.drop(["TARGET", "SK_ID_CURR",strat_field], axis = 1)
y_train = train_df.TARGET

In [61]:
X_train.dropna(axis='columns',inplace=True)
X_test.dropna(axis='columns',inplace=True)

In [62]:
from sklearn.model_selection import train_test_split
# split into train/test sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=54,stratify=y_train)
print("X training size: {} and y training size {}:".format(X_train.shape,y_train.shape))
print("X validation size: {} and y testing size {}:".format(X_val.shape,y_val.shape))

X training size: (266369, 38) and y training size (266369,):
X validation size: (29597, 38) and y testing size (29597,):


In [63]:
cat_col = X_train.select_dtypes('object').columns

In [65]:
cat_col

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE'],
      dtype='object')

In [66]:
for i,j in enumerate(X_train.columns):
    print(i,j)

0 AMT_CREDIT
1 AMT_ANNUITY
2 REGION_POPULATION_RELATIVE
3 DAYS_REGISTRATION
4 EXT_SOURCE_2
5 DEF_60_CNT_SOCIAL_CIRCLE
6 DAYS_LAST_PHONE_CHANGE
7 EXT_SOURCE_1
8 EXT_SOURCE_3
9 CREDIT_INCOME_PERCENT
10 ANNUITY_INCOME_PERCENT
11 CREDIT_TERM
12 DAYS_EMPLOYED_PERCENT
13 RETIRED
14 DAYS_BIRTH
15 DAYS_EMPLOYED
16 DAYS_ID_PUBLISH
17 FLAG_EMP_PHONE
18 FLAG_WORK_PHONE
19 FLAG_PHONE
20 REGION_RATING_CLIENT
21 REGION_RATING_CLIENT_W_CITY
22 HOUR_APPR_PROCESS_START
23 REG_REGION_NOT_WORK_REGION
24 REG_CITY_NOT_LIVE_CITY
25 REG_CITY_NOT_WORK_CITY
26 LIVE_CITY_NOT_WORK_CITY
27 FLAG_DOCUMENT_3
28 FLAG_DOCUMENT_6
29 FLAG_DOCUMENT_8
30 NAME_CONTRACT_TYPE
31 CODE_GENDER
32 FLAG_OWN_CAR
33 NAME_INCOME_TYPE
34 NAME_EDUCATION_TYPE
35 NAME_FAMILY_STATUS
36 NAME_HOUSING_TYPE
37 AMT_INCOME_TOTAL_GROUP


In [67]:
#Import the SMOTE-NC
from imblearn.over_sampling import SMOTENC
#Create the oversampler. 
smotenc = SMOTENC([30,31,32,33,34,35,36,37],random_state = 101)
X_oversample, y_oversample = smotenc.fit_resample(X_train, y_train)

In [68]:
X_train = X_oversample
y_train = y_oversample.astype('int')
y_val = y_val.astype('int')

In [69]:
print(X_train.shape)
print(X_val.shape)

(489566, 38)
(29597, 38)


In [79]:
train_df = pd.concat([X_train,X_val])

In [80]:
float_col = train_df.select_dtypes(['float']).columns
float_col = float_col.values.tolist() +['DAYS_BIRTH','DAYS_EMPLOYED','DAYS_ID_PUBLISH']

In [81]:
for feature in float_col:
    #first normalize training dataset
    #get ids of training datasets
    ids_train = train_df[feature].index.values
    
    #perform standarization
    vals_train = train_df[feature].values.reshape(-1, 1)
    
    scaler = StandardScaler()
    scaler.fit(vals_train)
    # scaler = StandardScaler().fit(vals_train)
    x_train = scaler.transform(vals_train)

    #update training values
    X_update_train = pd.DataFrame(x_train.flatten(), columns=[feature], index=ids_train)
    train_df.update(X_update_train)

In [83]:
int_col = train_df.select_dtypes(['integer']).columns
int_table = train_df[int_col]
#integer one-hot-encoding
train_df = pd.get_dummies(train_df, columns = int_table.columns, sparse=True)

In [85]:
other_features = train_df.select_dtypes(['object']).columns
# One hot encode all the string fields
drop_features =[]
one_hot_features = []
for f in other_features:
    #get unique values from feature
    idx = train_df.groupby(f).size().index
    mapper = {f:i for i,f in enumerate(idx)}
    
    #lets create new map fields
    one_hot = f + "_MAP"
    train_df[one_hot] = train_df[f].map(mapper)

    drop_features.append(f)
    one_hot_features.append(one_hot)

#lets drop original features 
train_df.drop(drop_features,axis=1,inplace=True)

#one-hot-encoding
train_df = pd.get_dummies(train_df, columns=one_hot_features, sparse=True)

In [87]:
X_train = train_df.iloc[:489566,:]
X_val =train_df.iloc[489566:,:]

In [88]:
#ml algorithms
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold 
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#nerual networks api
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from keras import optimizers
import tensorflow as tf

import seaborn as sns
sns.set(style="whitegrid")
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

In [89]:

final_model = LGBMClassifier(learning_rate= 0.04,
                             max_depth= 20,
                             n_estimators= 300,
                             num_leaves= 25,
                             random_state = 54)

final_model.fit(X_train, y_train ,verbose=50, 
                eval_set=[(X_train, y_train),(X_val, y_val)], 
                eval_metric= 'auc')

[50]	training's auc: 0.923974	training's binary_logloss: 0.416911	valid_1's auc: 0.63763	valid_1's binary_logloss: 0.458442
[100]	training's auc: 0.950088	training's binary_logloss: 0.317739	valid_1's auc: 0.649792	valid_1's binary_logloss: 0.380289
[150]	training's auc: 0.961181	training's binary_logloss: 0.261657	valid_1's auc: 0.660677	valid_1's binary_logloss: 0.338878
[200]	training's auc: 0.965437	training's binary_logloss: 0.233106	valid_1's auc: 0.669712	valid_1's binary_logloss: 0.318943
[250]	training's auc: 0.967507	training's binary_logloss: 0.217604	valid_1's auc: 0.674996	valid_1's binary_logloss: 0.309046
[300]	training's auc: 0.969109	training's binary_logloss: 0.205737	valid_1's auc: 0.679193	valid_1's binary_logloss: 0.301582


In [91]:
from sklearn.metrics import confusion_matrix
y_pred = final_model.predict(X_val)
confusion_matrix(y_val, y_pred)

array([[26257,   942],
       [ 2169,   229]], dtype=int64)

In [92]:
LGBM_conf= confusion_matrix(y_test.values, y_pred)
labels = ['Not Default', 'Default']

fig = plt.figure(figsize=(5,5))
plot_confusion_matrix(LGBM_conf, labels, title="LGBM\n Confusion Matrix", cmap=plt.cm.Oranges, normalize=True)

NameError: name 'y_test' is not defined