In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import impute
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

#### Read file and make initial replacements

In [2]:
# DAYS_* source columns mapped to the name of the "years" feature that replaces
# each of them (value is negated and divided by 365). Insertion order matters:
# the new columns are appended in this order and later cells index by position.
DAYS_TO_YEARS = {
    'DAYS_BIRTH': 'age_yrs',
    'DAYS_EMPLOYED': 'yrs_emp',
    'DAYS_REGISTRATION': 'yrs_registration',
    'DAYS_ID_PUBLISH': 'yrs_id_publish',
    'DAYS_LAST_PHONE_CHANGE': 'yrs_last_phone_change',
}

# Social-circle count columns whose maximum value is treated as an aberration.
SOCIAL_CIRCLE_COLS = [
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'DEF_60_CNT_SOCIAL_CIRCLE',
]


def prepare_application(df):
    """Apply the replacements shared by the train and test application frames.

    * every 'XNA' entry becomes NaN;
    * each DAYS_* column listed in ``DAYS_TO_YEARS`` is negated, divided by 365
      and stored under its new name, then the DAYS_* column is dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Raw application frame as read from CSV.

    Returns
    -------
    pd.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # np.nan (lowercase) is the spelling that exists in every NumPy release;
    # the np.NaN alias was removed in NumPy 2.0.
    df = df.replace('XNA', np.nan)
    df = df.assign(**{yrs: -df[days] / 365 for days, yrs in DAYS_TO_YEARS.items()})
    return df.drop(columns=list(DAYS_TO_YEARS))


application_train = prepare_application(pd.read_csv('./data/application_train.csv'))
application_test = prepare_application(pd.read_csv('./data/application_test.csv'))

# imputing self-proclaimed-income anomalies (training frame only):
# incomes more than 3 population standard deviations above the mean are
# replaced by the mean computed before the replacement.
income = application_train['AMT_INCOME_TOTAL']
income_mean = income.mean()
is_income_outlier = (income - income_mean) / income.std(ddof=0) > 3
inc_anomalies = application_train[is_income_outlier]
# Assign the column back instead of calling Series.replace(inplace=True) on an
# attribute access, which does not update the frame under pandas Copy-on-Write.
application_train['AMT_INCOME_TOTAL'] = income.mask(is_income_outlier, income_mean)

# replacing one aberration with mean (training frame only):
# every entry equal to the column maximum is replaced by the column mean.
for col in SOCIAL_CIRCLE_COLS:
    counts = application_train[col]
    # Series.max() skips NaN; the builtin max() returns NaN whenever the first
    # element is NaN, which would replace the missing values instead of the peak.
    application_train[col] = counts.mask(counts == counts.max(), counts.mean())

# Snapshots of the frames before missing-value imputation.
application_train1 = application_train.copy()
application_test1 = application_test.copy()

#### Missing values imputation as per best strategy

In [3]:
# Categorical variables: every object-dtype column of the training frame
cat_var = application_train.dtypes[application_train.dtypes == 'object'].index

# Numerical variables: every non-object column except the first two
# (presumably the id and TARGET columns -- confirm against the CSV layout)
num_var = application_train.dtypes[application_train.dtypes != 'object'].index[2:]

# IMPUTATION of missing values
# Each imputer is fitted on the training frame only and then applied unchanged
# to the test frame, so both frames are filled with the same statistics.
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0.)

# Categorical columns: missing entries become the literal category 'xan'.
imp = impute.SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='xan')
application_train[cat_var] = imp.fit_transform(application_train[cat_var])
application_test[cat_var] = imp.transform(application_test[cat_var])

# Numerical columns: missing entries become the training-set median of the column.
imp = impute.SimpleImputer(missing_values=np.nan, strategy='median')
application_train[num_var] = imp.fit_transform(application_train[num_var])
application_test[num_var] = imp.transform(application_test[num_var])

# Snapshots of the frames after imputation.
application_train2 = application_train.copy()
application_test2 = application_test.copy()

#### Dummification and standardization

In [4]:
# DUMMIFICATION of categorical variables
# Train and test categoricals are stacked so that both frames receive the same
# dummy columns; the helper column 'code' (1 = train, 0 = test) is used to
# split them apart again afterwards.
logistic_train = application_train.copy()
logistic_test = application_test.copy()

# .assign builds new frames, so no value is written into a slice of the originals.
dummy_train = logistic_train[cat_var].assign(code=1)
dummy_test = logistic_test[cat_var].assign(code=0)

dummy_joined = pd.concat([dummy_train, dummy_test])
# dtype=float keeps the indicator columns numeric so they can be standardised
# without overwriting bool columns with floats.
dummy_joined = pd.get_dummies(dummy_joined, drop_first=True, dtype=float)

dummy_train = dummy_joined[dummy_joined.code == 1]
logistic_train = logistic_train.drop(cat_var, axis=1)
logistic_train = pd.concat([logistic_train, dummy_train], axis=1)
logistic_train = logistic_train.drop('code', axis=1)

dummy_test = dummy_joined[dummy_joined.code == 0]
logistic_test = logistic_test.drop(cat_var, axis=1)
logistic_test = pd.concat([logistic_test, dummy_test], axis=1)
logistic_test = logistic_test.drop('code', axis=1)


# STANDARDIZATION of variables
# Every column after the first two of the training frame is a feature
# (the first two are presumably the id and TARGET columns -- confirm).
# The same columns are selected by name in the test frame, so a layout
# mismatch raises a KeyError instead of silently scaling the wrong columns.
feature_cols = logistic_train.columns[2:]

# Mean and population standard deviation (ddof=0) come from the training frame
# and are applied to both frames, so a given raw value maps to the same scaled
# value in train and test.
feature_mean = logistic_train[feature_cols].mean()
# A zero standard deviation is replaced by 1 so a constant column becomes 0
# after centring instead of NaN (0/0). This replaces the former hard-coded
# list of test columns that had to be reset to 0.
feature_std = logistic_train[feature_cols].std(ddof=0).replace(0, 1)

logistic_train[feature_cols] = (logistic_train[feature_cols] - feature_mean) / feature_std
logistic_test[feature_cols] = (logistic_test[feature_cols] - feature_mean) / feature_std

application_train = logistic_train.copy()
application_test = logistic_test.copy()

### Applying Logistic Model

#### Searching for best parameters

#### Predicting TARGET in test as per best C