# NYC - Validation

In this notebook we take the models that were trained on CBECS and evaluate them on the PLUTO/LL84 validation dataset.

These results do not require cross-validation, because the models are trained on _all_ of CBECS and tested on _all_ of the validation dataset.

In [1]:
%matplotlib inline
import sys
import os
import time

import cPickle as pickle

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-paper')

#sklearn Utility Functions
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score

In [2]:
# Scoring functions applied to every model, each paired with the label used
# when printing results. Keeping label and callable side by side means the
# two derived lists below cannot drift out of step.
metricTable = [
    ("Mean Absolute Error", mean_absolute_error),
    ("Median Absolute Error", median_absolute_error),
    ("R2", r2_score),
]

# Parallel lists consumed by the validation loop: metrics[j] is reported
# under metricNames[j].
metrics = [entry[1] for entry in metricTable]
metricNames = [entry[0] for entry in metricTable]

numMetrics = len(metrics)
assert numMetrics == len(metricNames)

# Test Models with NYC Data

In [3]:
def load_NYC(data_dir="output/nyc", year=2016):
    """Load the NYC LL84 validation arrays from ``.npy`` files.

    Reads ``ll84_X_<year>.npy``, ``ll84_Y_<year>.npy`` and
    ``ll84_classVals_<year>.npy`` from ``data_dir``. Called with no
    arguments it reads the same three 2016 files under ``output/nyc`` that
    were previously hard-coded.

    Parameters
    ----------
    data_dir : str
        Directory that contains the three arrays.
    year : int or str
        Year suffix embedded in the file names.

    Returns
    -------
    tuple
        ``(X_val, Y_val, valClassVals)``, each exactly as returned by
        ``np.load``.

    Any exception raised by ``np.load`` (for example when a file is missing)
    propagates to the caller unchanged.
    """
    def _load(kind):
        # All three files share one naming scheme; only the middle tag differs.
        return np.load(os.path.join(data_dir, "ll84_%s_%s.npy" % (kind, year)))

    X_val = _load("X")
    Y_val = _load("Y")
    valClassVals = _load("classVals")
    return X_val, Y_val, valClassVals

# Load models from file

In [4]:
# Load every pickled, already-trained regressor from disk. The two lists are
# parallel: regressorNames[i] is the display name of regressors[i].
regressors = []
regressorNames = []

modelDir = "output/trainedModels/"

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the order of the printed results) the same on every run.
for fn in sorted(os.listdir(modelDir)):
    # Only the "<name>_trained.p" pickles are models; skip stray entries
    # (hidden files, sub-directories, ...) that would make pickle.load fail.
    if not fn.endswith(".p"):
        continue
    print(fn)

    # NOTE(security): pickle.load can execute arbitrary code, so this
    # directory must only ever contain files produced by a trusted run.
    # The with-block closes the file handle as soon as the model is loaded.
    with open(os.path.join(modelDir, fn), "rb") as modelFile:
        regressor = pickle.load(modelFile)

    # The display name is everything before the first underscore,
    # e.g. "KNN Regressor_trained.p" -> "KNN Regressor".
    regressorName = fn.split("_")[0]

    regressors.append(regressor)
    regressorNames.append(regressorName)

numRegressors = len(regressors)

KNN Regressor_trained.p
Linear Regression_trained.p
XGBoost_trained.p
Bagging_trained.p
Linear SVR_trained.p
Extra Trees Regressor_trained.p
Random Forest Regressor_trained.p
ElasticNet_trained.p
AdaBoost_trained.p
SVR_trained.p
MLP Regressor_trained.p
Lasso_trained.p
Ridge Regressor_trained.p


In [5]:
# Load the pickled scaler object; its transform() is applied to the
# validation features before prediction.
# NOTE(security): pickle.load can execute arbitrary code; only load a file
# produced by a trusted run.
# The with-block guarantees the file handle is closed after loading.
with open("output/scaler.p", "rb") as scalerFile:
    scaler = pickle.load(scalerFile)

# Run validation

In [6]:
# Pull in the NYC validation arrays and unpack them into features, targets
# and class values.
nycArrays = load_NYC()
X_val, Y_val, valClassVals = nycArrays

# Run the features through the scaler loaded from disk
# (presumably the one fitted on the training data -- confirm).
X_val_scaled = scaler.transform(X_val)

In [7]:
# Score each loaded regressor on the scaled validation features and print
# one line per metric underneath the regressor's name.
for regressor, regressorName in zip(regressors, regressorNames):
    Y_pred = regressor.predict(X_val_scaled)

    # Negative predictions are replaced with zero before scoring.
    negativeMask = Y_pred < 0
    Y_pred[negativeMask] = 0

    # One score per entry in `metrics`, in the same order as `metricNames`.
    scores = [metric(Y_val, Y_pred) for metric in metrics]

    print(regressorName)
    for metricName, score in zip(metricNames, scores):
        print("\t%s - %0.4f" % (metricName, score))

KNN Regressor
	Mean Absolute Error - 0.3287
	Median Absolute Error - 0.2063
	R2 - 0.3236
Linear Regression
	Mean Absolute Error - 0.6613
	Median Absolute Error - 0.5563
	R2 - -0.8642
XGBoost
	Mean Absolute Error - 0.2498
	Median Absolute Error - 0.1507
	R2 - 0.5146
Bagging
	Mean Absolute Error - 0.2678
	Median Absolute Error - 0.1696
	R2 - 0.4743
Linear SVR
	Mean Absolute Error - 0.6600
	Median Absolute Error - 0.5256
	R2 - -1.1203
Extra Trees Regressor
	Mean Absolute Error - 0.2756
	Median Absolute Error - 0.1724
	R2 - 0.4604
Random Forest Regressor
	Mean Absolute Error - 0.2658
	Median Absolute Error - 0.1665
	R2 - 0.4733
ElasticNet
	Mean Absolute Error - 0.9842
	Median Absolute Error - 0.9346
	R2 - -2.0461
AdaBoost
	Mean Absolute Error - 0.3720
	Median Absolute Error - 0.2998
	R2 - 0.3070
SVR
	Mean Absolute Error - 0.3585
	Median Absolute Error - 0.2072
	R2 - 0.1868
MLP Regressor
	Mean Absolute Error - 0.2947
	Median Absolute Error - 0.1635
	R2 - 0.3336
Lasso
	Mean Absolute Error - 