# The Multi-Layer Perceptron Neural Network Classifier

The MLP classifier is a Neural Network algorithm that is capable of building complex models, efficiently used on large datasets (such as ours). The fact that efficiency is important for this large dataset, a MLP was thought of a good selection. This method of machine learning is sensitive to scaling so keep in mind that we will be scaling

In [1]:
# First things first, let's load our features in

In [2]:
# Import Data
# MUTATE DATAFRAMES ACCORDING TO THE EXPLORATORY DATA ANALYSIS CODE

#For data Manipulation
import numpy as np
import pandas as pd
#In order to show all columns available
pd.set_option('display.max_columns', 200)

#Sklearn imports
from sklearn.preprocessing import LabelEncoder, Imputer

#Graphing libs
import matplotlib.pyplot as plt
import seaborn as sns

apptrain = pd.read_csv('../Dataset/application_train.csv')
apptest = pd.read_csv('../Dataset/application_test.csv')

# Code that modifies dataframes
apptrain['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
apptest['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
apptrain['DAYS_BIRTH'] = abs(apptrain['DAYS_BIRTH'])
apptest['DAYS_BIRTH'] = abs(apptrain['DAYS_BIRTH'])

In [3]:
# One-hot encoding and dataframe alignment
le = LabelEncoder()
le_count = 0

# Iterate through columns
for col in apptrain:
    if apptrain[col].dtype == "object":
        if len(list(apptrain[col].unique())) <= 2:
            #train on the training data
            le.fit(apptrain[col])
            #transform both training and testing data
            apptrain[col] = le.transform(apptrain[col])
            apptest[col] = le.transform(apptest[col])
            
            le_count += 1
            

            
#One-Hot encoding
apptrain = pd.get_dummies(apptrain)
apptest = pd.get_dummies(apptest)



print('Training features shape: {}'.format(apptrain.shape))
print('Training features shape: {}'.format(apptest.shape))
print('{} columns were label encoded'.format(le_count))

Training features shape: (307511, 243)
Training features shape: (48744, 239)
3 columns were label encoded


In [4]:
# Take the labels out of the training dataset as an inner merge will erase them since the test dataset does not have the targets
train_labels = apptrain['TARGET']


#aligning the training and testing data, keep only columns present in both df's
apptrain, apptest = apptrain.align(apptest, join = 'inner', axis = 1)
apptrain['TARGET'] = train_labels

print('Training Features shape: ', apptrain.shape)
print('Testing features shape: ', apptest.shape)
print("We're back on track, remember the training dataset will have one column more since it DOES have the targets")

Training Features shape:  (307511, 240)
Testing features shape:  (48744, 239)
We're back on track, remember the training dataset will have one column more since it DOES have the targets


In [6]:
# Scaling not very much required for Random Forest models
# Creating base df's for machine learning model
training_data = apptrain.drop(columns = ['TARGET'])
testing_data = apptest.copy()

# In the dataframes we still have missing values, WE USE IMPUTATION HERE
imputer = Imputer(strategy = 'median')
imputer.fit(training_data)
imputer.fit(testing_data)
training_data = imputer.transform(training_data)
testing_data = imputer.transform(testing_data)

# We must scale the data as well.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))

training_data = scaler.fit_transform(training_data)
testing_data = scaler.transform(testing_data)


print('training data shape', training_data.shape)
print('testing data shape', testing_data.shape)

training data shape (307511, 239)
testing data shape (48744, 239)


# MLP Neural Network (BASE) model

In [7]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(activation='relu', learning_rate='adaptive', max_iter=800)
mlp.fit(training_data, train_labels)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=800, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [11]:
mlp_base_predictions = mlp.predict_proba(testing_data)[:, 1]

def format_and_submit(predictions, desired_file_name):
    submit = apptest[['SK_ID_CURR']]
    submit['TARGET'] = predictions
    submit.to_csv('../Model_Predictions/{}.csv'.format(desired_file_name), index = False)
    

format_and_submit(mlp_base_predictions, 'MLP_NN_Base_Predictions')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


# Using engineered features

In [3]:
# Import our dataframe with the most useful features
fe_training_data = pd.read_csv('rf_important_features.csv')
fe_testing_data = pd.read_csv('rftest_important_features.csv')

train_labels =  fe_training_data['TARGET']
fe_training_data = fe_training_data.drop(columns = 'TARGET')

In [4]:
from sklearn.preprocessing import Imputer, MinMaxScaler
imputer = Imputer(strategy = 'median')

fe_training_data = imputer.fit_transform(fe_training_data)
fe_testing_data = imputer.transform(fe_testing_data)

scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(fe_training_data)
fe_training_data = scaler.transform(fe_training_data)
fe_testing_data = scaler.transform(fe_testing_data)

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(activation='relu', learning_rate='adaptive', max_iter=1000, verbose=True)
mlp.fit(fe_training_data, train_labels)

Iteration 1, loss = 0.25502566
Iteration 2, loss = 0.25167207
Iteration 3, loss = 0.25083626
Iteration 4, loss = 0.25001302
Iteration 5, loss = 0.24979902
Iteration 6, loss = 0.24950077
Iteration 7, loss = 0.24905607
Iteration 8, loss = 0.24872079
Iteration 9, loss = 0.24837681
Iteration 10, loss = 0.24813010
Iteration 11, loss = 0.24794531
Iteration 12, loss = 0.24757039
Iteration 13, loss = 0.24732049
Iteration 14, loss = 0.24700897
Iteration 15, loss = 0.24670407
Iteration 16, loss = 0.24668559
Iteration 17, loss = 0.24635933
Iteration 18, loss = 0.24620464
Iteration 19, loss = 0.24581054
Iteration 20, loss = 0.24570083
Iteration 21, loss = 0.24546659
Iteration 22, loss = 0.24521398
Iteration 23, loss = 0.24522515
Iteration 24, loss = 0.24484288
Iteration 25, loss = 0.24474555
Iteration 26, loss = 0.24455575
Iteration 27, loss = 0.24434301
Iteration 28, loss = 0.24431514
Iteration 29, loss = 0.24404097
Iteration 30, loss = 0.24392111
Iteration 31, loss = 0.24388113
Iteration 32, los