In [1]:
import numpy as np
import pandas as pd 
import pickle

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load original and pickled files
app_train = pd.read_csv('data/application_train.csv')
app_test = pd.read_csv('data/application_test.csv')
app_train_poly = pickle.load( open( "data_engineered/app_train_poly", "rb" ) )
app_test_poly = pickle.load( open( "data_engineered/app_test_poly", "rb" ) )

In [3]:
# One hot encode data again
# Create a label encoder object
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(app_train[col])
            # Transform both training and testing data
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [4]:
# one-hot encoding of categorical variables
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (307511, 243)
Testing Features shape:  (48744, 239)


In [5]:
train_labels = app_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

# Add the target back in
app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (307511, 240)
Testing Features shape:  (48744, 239)


In [7]:
# Save these new only numeric dataframes
fname_1 = 'data_engineered/app_train_encoded'
fobject_1 = open(fname_1, 'wb')
pickle.dump(app_train, fobject_1)
fobject_1.close()

In [8]:
fname_2 = 'data_engineered/app_test_encoded'
fobject_2 = open(fname_2, 'wb')
pickle.dump(app_test, fobject_2)
fobject_2.close()

# Baseline model

In [9]:
from sklearn.preprocessing import Imputer, MinMaxScaler

In [10]:
# Drop target and make copy, same for test
if 'TARGET' in app_train:
    train = app_train.drop(columns = ['TARGET'])
else:
    train = app_train.copy()
test = app_test.copy()

In [11]:
%%time
# Fit and transform imputer and scaler
imputer = Imputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0,1))
train = imputer.fit_transform(train)
train = scaler.fit_transform(train)

test = imputer.fit_transform(test)
test = scaler.fit_transform(test)

print('Imputing and scaling done.')

Imputing and scaling done.
Wall time: 21.4 s


#### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
logreg = LogisticRegression(C = 0.0001) # Strongly regularized
logreg.fit(train, app_train['TARGET'])

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [14]:
logreg.predict_proba(test)                       # Probability of sample belonging to each class. We need prob of defaulting

array([[0.93363907, 0.06636093],
       [0.87200315, 0.12799685],
       [0.91657975, 0.08342025],
       ...,
       [0.94385591, 0.05614409],
       [0.92646519, 0.07353481],
       [0.91053617, 0.08946383]])

In [15]:
log_reg_prob = logreg.predict_proba(test)[:,1]

In [16]:
# Create submission file
submit_log_reg = app_test[['SK_ID_CURR']]
submit_log_reg['TARGET'] = log_reg_prob
submit_log_reg.to_csv('submissions/submit_log_reg.csv', index = False)

In [17]:
submit_log_reg.shape

(48744, 2)

In [18]:
submit_log_reg.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.066361
1,100005,0.127997
2,100013,0.08342
3,100028,0.059038
4,100038,0.127732


#### Random Forest

In [28]:
feature_list = list(app_train.drop(columns=['TARGET']).columns)

In [29]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
# Train on the training data
random_forest.fit(train, app_train['TARGET'])

# Extract feature importances
feature_importance = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': feature_list, 'importance': feature_importance})

# Make predictions on the test data
random_forest_prob = random_forest.predict_proba(test)[:, 1]

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished


In [30]:
# Create submission file
submit_random_forest = app_test[['SK_ID_CURR']]
submit_random_forest['TARGET'] = random_forest_prob
submit_random_forest.to_csv('submissions/submit_random_forest.csv', index = False)