**Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing.** Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

%matplotlib inline

In [12]:
try:
    file = 'C:/Users/18047/Documents/Main/3 Deeper into supervised learning/Challenges/3 Dataset.csv'
    df = pd.read_csv(file)
except:
    file = 'C:/Users/Carter Carlson/Documents/Thinkful/Coursework/3 Deeper into supervised learning/Challenges/3 Dataset.csv'
    df = pd.read_csv(file)
df.head()

Unnamed: 0,FeeID,BuildingID,BoroID,Boro,HouseNumber,StreetName,Zip,Block,Lot,LifeCycle,FeeTypeID,FeeType,FeeSourceTypeID,FeeSourceType,FeeSourceID,FeeIssuedDate,FeeAmount,DoFAccountType,DoFTransferDate
0,441,183598,3,BROOKLYN,227,ALBANY AVENUE,11213.0,1370,14,Building,1,Initial Re-inspection Fee,51,PROJECT BLDG,30305,2008-05-23T00:00:00,2000.0,236,2008-06-20T00:00:00
1,442,381566,3,BROOKLYN,232,TOMPKINS AVENUE,11216.0,1785,39,Building,1,Initial Re-inspection Fee,51,PROJECT BLDG,30305,2008-05-23T00:00:00,1500.0,236,2008-06-20T00:00:00
2,443,330335,3,BROOKLYN,786,MACON STREET,11233.0,1497,18,Building,1,Initial Re-inspection Fee,51,PROJECT BLDG,30305,2008-05-23T00:00:00,1500.0,236,2008-06-20T00:00:00
3,444,357697,3,BROOKLYN,1109,PUTNAM AVENUE,11221.0,3366,54,Building,1,Initial Re-inspection Fee,51,PROJECT BLDG,30305,2008-05-23T00:00:00,1500.0,236,2008-06-20T00:00:00
4,445,381865,3,BROOKLYN,237,TROUTMAN STREET,11237.0,3174,42,Building,1,Initial Re-inspection Fee,51,PROJECT BLDG,30305,2008-05-23T00:00:00,3000.0,236,2008-06-20T00:00:00


In [13]:
categorical = df.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    print(column.nunique())

Boro
5
HouseNumber
3049
StreetName
1729
LifeCycle
4
FeeType
7
FeeSourceType
4
FeeIssuedDate
2940
DoFTransferDate
125


In [14]:
# We'll remove any features that have over 200 unique values,
# and the Boro column that lists location by name instead of number
df = df.drop(['HouseNumber','StreetName','FeeIssuedDate','Boro'], axis=1)

# Also remove Zipcode and block number, as it will overfit the decision tree
df = df.drop(['Zip','Block'], axis=1)

In [23]:
def get_accuracy(classifier):
    accuracy = cross_val_score(classifier, X, Y, cv=5).mean()
    return accuracy

In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

X = df.drop('BoroID', axis=1)
Y = df['BoroID']
X = pd.get_dummies(X)

rfc = RandomForestClassifier()
mlp = MLPClassifier()

rfc.fit(X, Y)
mlp.fit(X, Y)

classifiers = {'Random Forest':rfc, 'Multi-layer Perceptron':mlp}

for i in classifiers:
    start_time = time.time()
    print('{} accuracy: {:0.2f}'.format(i, get_accuracy(classifiers[i])))
    print('{} time to run: {:1.2f} seconds'.format(i, time.time() - start_time))

Random Forest accuracy: 0.68
Random Forest time to run: 2.25 seconds
Multi-layer Perceptron accuracy: 0.71
Multi-layer Perceptron time to run: 18.81 seconds


In [25]:
from sklearn.model_selection import RandomizedSearchCV

# Collection of possible parameters
rfc_param = {'n_estimators': [5, 10, 20],
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 2, 4],
             'random_state': [None, 42]}

mlp_param = {'hidden_layer_sizes': [(10,), (100,)],
             'solver': ['lbfgs', 'adam'],
             'max_iter': [50, 100, 200],
             'alpha': [0.0001, 0.001, 0.01]}

# Determine the best parameters
rfc_random = RandomizedSearchCV(estimator=rfc, param_distributions=rfc_param)
mlp_random = RandomizedSearchCV(estimator=mlp, param_distributions=mlp_param)

rfc_random.fit(X, Y)
mlp_random.fit(X, Y)

print('Random Forest best parameters:\n', rfc_random.best_params_)
print('\nMulti-layer Peceptron best parameters:\n', mlp_random.best_params_)
print('--------------------------------------')

classifiers = {'Random Forest':rfc, 'Multi-layer Perceptron':mlp}

for i in classifiers:
    start_time = time.time()
    print('{} accuracy (optimized): {:0.2f}'.format(i, get_accuracy(classifiers[i])))
    print('{} time to run (optimized): {:1.2f} seconds'.format(i, time.time() - start_time))

Random Forest best parameters:
 {'random_state': 42, 'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 4}

Multi-layer Peceptron best parameters:
 {'solver': 'adam', 'max_iter': 50, 'hidden_layer_sizes': (100,), 'alpha': 0.0001}
--------------------------------------
Random Forest accuracy (optimized): 0.62
Random Forest time to run (optimized): 2.26 seconds
Multi-layer Perceptron accuracy (optimized): 0.76
Multi-layer Perceptron time to run (optimized): 18.29 seconds
