In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import impute
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

#### Read file and make initial replacements

In [2]:
application_train = pd.read_csv('./data/application_train.csv')
application_test = pd.read_csv('./data/application_test.csv')

application_train.replace('XNA', np.NaN, inplace = True)
application_train['age_yrs'] = np.negative(application_train['DAYS_BIRTH'])/365
application_train['yrs_emp'] = np.negative(application_train['DAYS_EMPLOYED'])/365
application_train['yrs_registration'] = np.negative(application_train['DAYS_REGISTRATION'])/365
application_train['yrs_id_publish'] = np.negative(application_train['DAYS_ID_PUBLISH'])/365
application_train['yrs_last_phone_change'] = np.negative(application_train['DAYS_LAST_PHONE_CHANGE'])/365
application_train = application_train.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'], axis = 1)

application_test.replace('XNA', np.NaN, inplace = True)
application_test['age_yrs'] = np.negative(application_test['DAYS_BIRTH'])/365
application_test['yrs_emp'] = np.negative(application_test['DAYS_EMPLOYED'])/365
application_test['yrs_registration'] = np.negative(application_test['DAYS_REGISTRATION'])/365
application_test['yrs_id_publish'] = np.negative(application_test['DAYS_ID_PUBLISH'])/365
application_test['yrs_last_phone_change'] = np.negative(application_test['DAYS_LAST_PHONE_CHANGE'])/365
application_test = application_test.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'], axis = 1)

# imputing self-proclaimed-income anomalies
inc_anomalies = application_train[(application_train.AMT_INCOME_TOTAL - np.mean(application_train.AMT_INCOME_TOTAL))/np.std(application_train.AMT_INCOME_TOTAL) > 3]
application_train.AMT_INCOME_TOTAL.replace(inc_anomalies.AMT_INCOME_TOTAL.values, np.mean(application_train.AMT_INCOME_TOTAL), inplace = True)

# replacing one aberration with mean
application_train.OBS_30_CNT_SOCIAL_CIRCLE.replace(max(application_train.OBS_30_CNT_SOCIAL_CIRCLE), np.mean(application_train.OBS_30_CNT_SOCIAL_CIRCLE), inplace = True)
application_train.DEF_30_CNT_SOCIAL_CIRCLE.replace(max(application_train.DEF_30_CNT_SOCIAL_CIRCLE), np.mean(application_train.DEF_30_CNT_SOCIAL_CIRCLE), inplace = True)
application_train.OBS_60_CNT_SOCIAL_CIRCLE.replace(max(application_train.OBS_60_CNT_SOCIAL_CIRCLE), np.mean(application_train.OBS_60_CNT_SOCIAL_CIRCLE), inplace = True)
application_train.DEF_60_CNT_SOCIAL_CIRCLE.replace(max(application_train.DEF_60_CNT_SOCIAL_CIRCLE), np.mean(application_train.DEF_60_CNT_SOCIAL_CIRCLE), inplace = True)

application_train1 = application_train.copy()
application_test1 = application_test.copy()

In [3]:
# NEW FEATURES
# train
application_train['ANN_CRE'] = application_train['AMT_ANNUITY']/application_train['AMT_CREDIT']
application_train['DEF_OBS_30'] = application_train['DEF_30_CNT_SOCIAL_CIRCLE']/application_train['OBS_30_CNT_SOCIAL_CIRCLE']
application_train['DEF_OBS_60'] = application_train['DEF_60_CNT_SOCIAL_CIRCLE']/application_train['OBS_60_CNT_SOCIAL_CIRCLE']
application_train['CRE_INC'] = application_train['AMT_CREDIT']/application_train['AMT_INCOME_TOTAL']
application_train['ANN_INC'] = application_train['AMT_ANNUITY']/application_train['AMT_INCOME_TOTAL']

application_train['OWN_CAR_AGE_Av'] = -application_train.OWN_CAR_AGE.isnull()
application_train['OCCUPATION_TYPE_Av'] = -application_train.OCCUPATION_TYPE.isnull()
application_train['ORGANIZATION_TYPE_Av'] = -application_train.ORGANIZATION_TYPE.isnull()
application_train['EXT_SOURCE_1_Av'] = -application_train.EXT_SOURCE_1.isnull()
application_train['EXT_SOURCE_3_Av'] = -application_train.EXT_SOURCE_3.isnull()
application_train['HOUSETYPE_Av'] = -application_train.HOUSETYPE_MODE.isnull()
application_train['AMT_REQ_CREDIT_BUREAU_Av'] = -application_train.AMT_REQ_CREDIT_BUREAU_HOUR.isnull()

# test
application_test['ANN_CRE'] = application_test['AMT_ANNUITY']/application_test['AMT_CREDIT']
application_test['DEF_OBS_30'] = application_test['DEF_30_CNT_SOCIAL_CIRCLE']/application_test['OBS_30_CNT_SOCIAL_CIRCLE']
application_test['DEF_OBS_60'] = application_test['DEF_60_CNT_SOCIAL_CIRCLE']/application_test['OBS_60_CNT_SOCIAL_CIRCLE']
application_test['CRE_INC'] = application_test['AMT_CREDIT']/application_test['AMT_INCOME_TOTAL']
application_test['ANN_INC'] = application_test['AMT_ANNUITY']/application_test['AMT_INCOME_TOTAL']

application_test['OWN_CAR_AGE_Av'] = -application_test.OWN_CAR_AGE.isnull()
application_test['OCCUPATION_TYPE_Av'] = -application_test.OCCUPATION_TYPE.isnull()
application_test['ORGANIZATION_TYPE_Av'] = -application_test.ORGANIZATION_TYPE.isnull()
application_test['EXT_SOURCE_1_Av'] = -application_test.EXT_SOURCE_1.isnull()
application_test['EXT_SOURCE_3_Av'] = -application_test.EXT_SOURCE_3.isnull()
application_test['HOUSETYPE_Av'] = -application_test.HOUSETYPE_MODE.isnull()
application_test['AMT_REQ_CREDIT_BUREAU_Av'] = -application_test.AMT_REQ_CREDIT_BUREAU_HOUR.isnull()

application_train2 = application_train.copy()
application_test2 = application_test.copy()

#### Missing values imputation as per best strategy

In [4]:
# Categorical variables having missing values
cat_var = application_train.dtypes[application_train.dtypes == 'object'].index

# Numerical variables having missing values
num_var = application_train.dtypes[application_train.dtypes != 'object'].index[2:]

# IMPUTATION of missing values train
imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'constant', fill_value = 'xan')
application_train[cat_var] = imp.fit_transform(application_train[cat_var])

imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'median')
application_train[num_var] = imp.fit_transform(application_train[num_var])

# IMPUTATION of missing values test
imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'constant', fill_value = 'xan')
application_test[cat_var] = imp.fit_transform(application_test[cat_var])

imp = impute.SimpleImputer(missing_values=np.NaN, strategy= 'median')
application_test[num_var] = imp.fit_transform(application_test[num_var])

application_train3 = application_train.copy()
application_test3 = application_test.copy()

#### Dummification and standardization

In [5]:
# DUMMIFICATION of categorical variables
logistic_train = application_train.copy()
dummy_train = logistic_train[cat_var]
dummy_train['code'] = 1

logistic_test = application_test.copy()
dummy_test = logistic_test[cat_var]
dummy_test['code'] = 0

dummy_joined = pd.concat([dummy_train, dummy_test])
dummy_joined = pd.get_dummies(dummy_joined, drop_first = True)

dummy_train = dummy_joined[dummy_joined.code == 1]
logistic_train = logistic_train.drop(cat_var, axis=1)
logistic_train = pd.concat([logistic_train, dummy_train], axis = 1)
logistic_train = logistic_train.drop('code', axis = 1)

dummy_test = dummy_joined[dummy_joined.code == 0]
logistic_test = logistic_test.drop(cat_var, axis=1)
logistic_test = pd.concat([logistic_test, dummy_test], axis = 1)
logistic_test = logistic_test.drop('code', axis = 1)


# STANDARDIZATION of variables 
logistic_train.iloc[:,2:] = logistic_train.iloc[:,2:].apply(lambda x: (x - np.mean(x)) / np.std(x), axis = 0)

logistic_test.iloc[:,1:] = logistic_test.iloc[:,1:].apply(lambda x: (x - np.mean(x)) / np.std(x), axis = 0)
logistic_test[['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12',
       'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15',
       'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_19',
       'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'CODE_GENDER_xan',
       'NAME_INCOME_TYPE_Maternity leave', 'NAME_FAMILY_STATUS_Unknown']] = 0

application_train3 = logistic_train.copy()
application_test3 = logistic_test.copy()

### Reducing Features in logistic_train

#### Finding importanr features through sklearn.feature_selection

In [6]:
X_train = logistic_train.iloc[:, 2:]
y_train = logistic_train.TARGET

from sklearn import feature_selection
logitmodel = linear_model.LogisticRegression(C = .001, class_weight = 'balanced')
sfm = feature_selection.SelectFromModel(logitmodel)
sfm.fit(X_train, y_train)
feature_index = sfm.get_support()
imp_features = X_train.columns[feature_index]
imp_features

Index(['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'OWN_CAR_AGE', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'REGION_RATING_CLIENT_W_CITY', 'REG_CITY_NOT_LIVE_CITY', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'BASEMENTAREA_AVG', 'NONLIVINGAREA_AVG',
       'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE',
       'FLOORSMAX_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
       'COMMONAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_8',
       'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_16',
       'FLAG_DOCUMENT_18', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'age_yrs', 'yrs_emp', 'yrs_registration',
       'yrs_id_publish', 'yrs_last_phone_change', 'ANN_CRE', 'CRE_INC',
       'ANN_INC', 'OWN_CAR_AGE_Av', 'EXT_SOURCE_1_Av',
       'AMT_REQ_CREDIT_BUREAU_Av', 'NAME_CONTRACT_TYPE_Revolving loans',
       'CODE_GENDER_M', 'F

In [7]:
logistic_train_red = pd.concat([logistic_train[['SK_ID_CURR', 'TARGET']], logistic_train[imp_features]], axis = 1)

In [8]:
# Train-Test Split
one_index = logistic_train_red[logistic_train_red.TARGET == 1].index
zero_index = logistic_train_red[logistic_train_red.TARGET == 0].index

trainindex1 = np.random.choice(one_index, size = int(0.7*one_index.shape[0]), replace = False)
trainindex0 = np.random.choice(zero_index, size = int(0.7*zero_index.shape[0]), replace = False)
trainindex = np.concatenate([trainindex1, trainindex0])
testindex = np.delete(logistic_train_red.index, trainindex)

fit_df = logistic_train_red.iloc[trainindex]
validation_df = logistic_train_red.iloc[testindex]

X_fit = fit_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_fit = fit_df['TARGET']
X_validation = validation_df.drop(['SK_ID_CURR', 'TARGET'], axis = 1)
y_validation = validation_df['TARGET']

# # Model fit and prediction

logitmodel = linear_model.LogisticRegression(C = .001, class_weight = 'balanced')
logitmodel.fit(X_fit, y_fit)

y_pred_fit = logitmodel.predict(X_fit)
print(metrics.roc_auc_score(y_fit, y_pred_fit))

y_pred_validation = logitmodel.predict(X_validation)
print(metrics.roc_auc_score(y_validation, y_pred_validation))

0.6851811290954184
0.6867243021179343


In [9]:
'We have reduced from {} features to {} features without significant loss in validation roc_auc_score'.format(logistic_train.shape[1]-2, logistic_train_red.shape[1]-2)

'We have reduced from 246 features to 68 features without significant loss in validation roc_auc_score'

In [10]:
logistic_test_red = pd.concat([logistic_test[['SK_ID_CURR']], logistic_test[imp_features]], axis = 1)

#### Predicting TARGET in test reduced features

In [11]:
# X_train = logistic_train_red.iloc[:, 2:]
# y_train = logistic_train_red.TARGET
# X_test = logistic_test_red.iloc[:, 1:]

# # Model fit and prediction
# logitmodel = linear_model.LogisticRegression(C = 0.1, class_weight = 'balanced')
# logitmodel.fit(X_train, y_train)
# y_pred = logitmodel.predict(X_test)
# sol_log = pd.DataFrame({'SK_ID_CURR': logistic_test.iloc[:, 0], 'TARGET': y_pred})
# sol_log.to_csv('./soln/sol_log2.csv', index = False)                   # kaggle: 0.68594 

In [12]:
# logistic_train_red.to_csv('./data/app_train_mod.csv', index=False)
# logistic_test_red.to_csv('./data/app_test_mod.csv', index=False)