In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import impute
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [8]:
application_train = pd.read_csv('./data/application_train.csv')
application_test = pd.read_csv('./data/application_test.csv')

In [9]:
application_train.replace('XNA', np.NaN, inplace = True)
application_train['age_yrs'] = np.negative(application_train['DAYS_BIRTH'])/365
application_train['yrs_emp'] = np.negative(application_train['DAYS_EMPLOYED'])/365
application_train['yrs_registration'] = np.negative(application_train['DAYS_REGISTRATION'])/365
application_train['yrs_id_publish'] = np.negative(application_train['DAYS_ID_PUBLISH'])/365
application_train['yrs_last_phone_change'] = np.negative(application_train['DAYS_LAST_PHONE_CHANGE'])/365
application_train = application_train.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'], axis = 1)

application_test.replace('XNA', np.NaN, inplace = True)
application_test['age_yrs'] = np.negative(application_test['DAYS_BIRTH'])/365
application_test['yrs_emp'] = np.negative(application_test['DAYS_EMPLOYED'])/365
application_test['yrs_registration'] = np.negative(application_test['DAYS_REGISTRATION'])/365
application_test['yrs_id_publish'] = np.negative(application_test['DAYS_ID_PUBLISH'])/365
application_test['yrs_last_phone_change'] = np.negative(application_test['DAYS_LAST_PHONE_CHANGE'])/365
application_test = application_test.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'], axis = 1)

In [10]:
inc_anomalies = application_train[(application_train.AMT_INCOME_TOTAL - np.mean(application_train.AMT_INCOME_TOTAL))/np.std(application_train.AMT_INCOME_TOTAL) > 3]
application_train.AMT_INCOME_TOTAL.replace(inc_anomalies.AMT_INCOME_TOTAL.values, np.mean(application_train.AMT_INCOME_TOTAL), inplace = True)

# replacing one aberration with mean
application_train.OBS_30_CNT_SOCIAL_CIRCLE.replace(max(application_train.OBS_30_CNT_SOCIAL_CIRCLE), np.mean(application_train.OBS_30_CNT_SOCIAL_CIRCLE), inplace = True)
application_train.DEF_30_CNT_SOCIAL_CIRCLE.replace(max(application_train.DEF_30_CNT_SOCIAL_CIRCLE), np.mean(application_train.DEF_30_CNT_SOCIAL_CIRCLE), inplace = True)
application_train.OBS_60_CNT_SOCIAL_CIRCLE.replace(max(application_train.OBS_60_CNT_SOCIAL_CIRCLE), np.mean(application_train.OBS_60_CNT_SOCIAL_CIRCLE), inplace = True)
application_train.DEF_60_CNT_SOCIAL_CIRCLE.replace(max(application_train.DEF_60_CNT_SOCIAL_CIRCLE), np.mean(application_train.DEF_60_CNT_SOCIAL_CIRCLE), inplace = True)

In [11]:
# standardizing numerical variables in train
numerical_var = application_train.dtypes[application_train.dtypes != 'object'].drop(['SK_ID_CURR', 'TARGET'])
application_train[numerical_var.index] = application_train[numerical_var.index].apply(lambda x: (x - np.nanmean(x)) / np.nanstd(x), axis = 0)

In [12]:
# proportion of missing in categorical variables
categorical_var = application_train.dtypes[application_train.dtypes == 'object']
cat_df = application_train[categorical_var.index]
cat_missing = pd.isnull(cat_df).sum(axis = 0)[pd.isnull(cat_df).sum(axis = 0) > 0]/cat_df.shape[0]

# proportion of missing in numerical variables
numerical_var = application_train.dtypes[application_train.dtypes != 'object']
num_df = application_train[numerical_var.index]
num_missing = pd.isnull(num_df).sum(axis = 0)[pd.isnull(num_df).sum(axis = 0) > 0]/num_df.shape[0]

In [13]:
application_train1 = application_train.copy()

#### Logistic Regression with First Missingness Imputation Strategy

In [7]:
# Imputation
imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'most_frequent')
application_train[cat_missing.index] = imp.fit_transform(application_train[cat_missing.index])

imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'mean')
application_train[num_missing.index] = imp.fit_transform(application_train[num_missing.index])

# Dummification of categorical variables
logistic_train = application_train.copy()
logistic_test = application_test.copy()

dummy_train = logistic_train[categorical_var.index]
dummy_test = logistic_test[categorical_var.index]

dummy_train['code'] = 1
dummy_test['code'] = 0

dummy_joined = pd.concat([dummy_train, dummy_test])
dummy_joined = pd.get_dummies(dummy_joined, drop_first = True)

dummy_train = dummy_joined[dummy_joined.code == 1]
dummy_test = dummy_joined[dummy_joined.code == 0]

logistic_train = logistic_train.drop(categorical_var.index, axis=1)
logistic_test = logistic_test.drop(categorical_var.index, axis=1)

logistic_train = pd.concat([logistic_train, dummy_train], axis = 1)
logistic_test = pd.concat([logistic_test, dummy_test], axis = 1)

# Train-Test Split
one_index = logistic_train[logistic_train.TARGET == 1].index
zero_index = logistic_train[logistic_train.TARGET == 0].index

trainindex1 = np.random.choice(one_index, size = int(0.7*one_index.shape[0]), replace = False)
trainindex0 = np.random.choice(zero_index, size = int(0.7*zero_index.shape[0]), replace = False)
trainindex = np.concatenate([trainindex1, trainindex0])
testindex = np.delete(logistic_train.index, trainindex)

fit_df = logistic_train.iloc[trainindex]
validation_df = logistic_train.iloc[testindex]

X_fit = fit_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_fit = fit_df['TARGET']
X_validation = validation_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_validation = validation_df['TARGET']

# Model fit and prediction
roc_fit = []
roc_validation = []
for c in [0.01, 0.1, 1, 10, 1000]:
    logitmodel = linear_model.LogisticRegression(C = c, class_weight = 'balanced')
    logitmodel.fit(X_fit, y_fit)

    y_pred_fit = logitmodel.predict(X_fit)
    roc_fit.append(metrics.roc_auc_score(y_fit, y_pred_fit))

    y_pred_validation = logitmodel.predict(X_validation)
    roc_validation.append(metrics.roc_auc_score(y_validation, y_pred_validation))
print(roc_fit)
print(roc_validation)

[0.687298105792378, 0.6884280174825237, 0.6880477688578055, 0.6879875363900797, 0.6880258484948221]
[0.6751383102390284, 0.6755991221944275, 0.6766814100739129, 0.6764446388703472, 0.6764845701542155]


#### Logistic Regression with Second Missingness Imputation Strategy

In [14]:
application_train = application_train1.copy()

# Missingness imputation in train
imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'constant', fill_value = 'xan')
application_train[cat_missing.index] = imp.fit_transform(application_train[cat_missing.index])

imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'mean')
application_train[num_missing.index] = imp.fit_transform(application_train[num_missing.index])

# Missingness imputation in test
imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'constant', fill_value = 'xan')
application_test[cat_missing.index] = imp.fit_transform(application_test[cat_missing.index])

imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'mean')
application_test[num_missing.index] = imp.fit_transform(application_test[num_missing.index])

# Dummification of categorical variables
logistic_train = application_train.copy()
logistic_test = application_test.copy()

dummy_train = logistic_train[categorical_var.index]
dummy_test = logistic_test[categorical_var.index]

dummy_train['code'] = 1
dummy_test['code'] = 0

dummy_joined = pd.concat([dummy_train, dummy_test])
dummy_joined = pd.get_dummies(dummy_joined, drop_first = True)

dummy_train = dummy_joined[dummy_joined.code == 1]
dummy_test = dummy_joined[dummy_joined.code == 0]

logistic_train = logistic_train.drop(categorical_var.index, axis=1)
logistic_test = logistic_test.drop(categorical_var.index, axis=1)

logistic_train = pd.concat([logistic_train, dummy_train], axis = 1)
logistic_test = pd.concat([logistic_test, dummy_test], axis = 1)

# Train-Test Split
one_index = logistic_train[logistic_train.TARGET == 1].index
zero_index = logistic_train[logistic_train.TARGET == 0].index

trainindex1 = np.random.choice(one_index, size = int(0.7*one_index.shape[0]), replace = False)
trainindex0 = np.random.choice(zero_index, size = int(0.7*zero_index.shape[0]), replace = False)
trainindex = np.concatenate([trainindex1, trainindex0])
testindex = np.delete(logistic_train.index, trainindex)

fit_df = logistic_train.iloc[trainindex]
validation_df = logistic_train.iloc[testindex]

X_fit = fit_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_fit = fit_df['TARGET']
X_validation = validation_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_validation = validation_df['TARGET']

# Model fit and prediction
roc_fit = []
roc_validation = []
for c in [1, 10]:
    logitmodel = linear_model.LogisticRegression(C = c, class_weight = 'balanced')
    logitmodel.fit(X_fit, y_fit)

    y_pred_fit = logitmodel.predict(X_fit)
    roc_fit.append(metrics.roc_auc_score(y_fit, y_pred_fit))

    y_pred_validation = logitmodel.predict(X_validation)
    roc_validation.append(metrics.roc_auc_score(y_validation, y_pred_validation))
print(roc_fit)
print(roc_validation)

[0.6855309695036478, 0.6856081624103685]
[0.6831293102576974, 0.6829760192169692]


#### Logistic Regression with Third Missingness Imputation Strategy

In [15]:
application_train = application_train1.copy()

# # Imputation
imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'constant', fill_value = 'xan')
application_train[cat_missing.index] = imp.fit_transform(application_train[cat_missing.index])

imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'median')
application_train[num_missing.index] = imp.fit_transform(application_train[num_missing.index])

# pd.isnull(application_train).sum(axis = 0)[pd.isnull(application_train).sum(axis = 0) > 0]/application_train.shape[0]

# # Dummification of categorical variables
logistic_train = application_train.copy()
logistic_test = application_test.copy()

dummy_train = logistic_train[categorical_var.index]
dummy_test = logistic_test[categorical_var.index]

dummy_train['code'] = 1
dummy_test['code'] = 0

dummy_joined = pd.concat([dummy_train, dummy_test])
dummy_joined = pd.get_dummies(dummy_joined, drop_first = True)

dummy_train = dummy_joined[dummy_joined.code == 1]
dummy_test = dummy_joined[dummy_joined.code == 0]

logistic_train = logistic_train.drop(categorical_var.index, axis=1)
logistic_test = logistic_test.drop(categorical_var.index, axis=1)

logistic_train = pd.concat([logistic_train, dummy_train], axis = 1)
logistic_test = pd.concat([logistic_test, dummy_test], axis = 1)

# # Train-Test Split
one_index = logistic_train[logistic_train.TARGET == 1].index
zero_index = logistic_train[logistic_train.TARGET == 0].index

trainindex1 = np.random.choice(one_index, size = int(0.7*one_index.shape[0]), replace = False)
trainindex0 = np.random.choice(zero_index, size = int(0.7*zero_index.shape[0]), replace = False)
trainindex = np.concatenate([trainindex1, trainindex0])
testindex = np.delete(logistic_train.index, trainindex)

fit_df = logistic_train.iloc[trainindex]
validation_df = logistic_train.iloc[testindex]

X_fit = fit_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_fit = fit_df['TARGET']
X_validation = validation_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_validation = validation_df['TARGET']

# # Model fit and prediction
roc_fit = []
roc_validation = []
for c in [0.01, 1, 1000]:
    logitmodel = linear_model.LogisticRegression(C = c, class_weight = 'balanced')
    logitmodel.fit(X_fit, y_fit)

    y_pred_fit = logitmodel.predict(X_fit)
    roc_fit.append(metrics.roc_auc_score(y_fit, y_pred_fit))

    y_pred_validation = logitmodel.predict(X_validation)
    roc_validation.append(metrics.roc_auc_score(y_validation, y_pred_validation))
print(roc_fit)
print(roc_validation)

[0.6817085567509356, 0.6817108315398795, 0.6819721629697187]
[0.6872708597847876, 0.6889128489961278, 0.6891514978661224]
