# NYC - Validation

In this notebook we take the models that were trained on CBECS and evaluate them on the PLUTO/LL84 validation dataset.

This reproduces the result shown in the first row of Table 7.

These results do not require cross-validation, as the models are trained on _all_ of CBECS and tested on _all_ of the validation dataset.

In [1]:
%matplotlib inline
import sys
import os
import time

import cPickle as pickle

import pandas as pd
import numpy as np

import CBECSLib

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-paper')

#sklearn Utility Functions
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score

In [2]:
# Notebook-level aliases for the metric definitions shared through CBECSLib:
# the scoring callables, their display names, and how many of them there are.
metrics, metricNames, numMetrics = (
    CBECSLib.metrics,
    CBECSLib.metricNames,
    CBECSLib.numMetrics,
)

# Train/Test Models with NYC Data

In [3]:
def load_NYC(data_dir="output/nyc", year=2016):
    """Load the NYC LL84 validation arrays from ``.npy`` files.

    Reads ``ll84_X_<year>.npy``, ``ll84_Y_<year>.npy`` and
    ``ll84_classVals_<year>.npy`` from ``data_dir``. Called with no
    arguments it reads the same three 2016 files under ``output/nyc`` that
    were previously hard-coded.

    Parameters
    ----------
    data_dir : str
        Directory that contains the saved arrays.
    year : int or str
        Year tag embedded in the three file names.

    Returns
    -------
    tuple
        ``(X_val, Y_val, valClassVals)``, each exactly as returned by
        ``np.load``.
    """
    def _load(kind):
        # All three files share the "ll84_<kind>_<year>.npy" naming scheme.
        return np.load(os.path.join(data_dir, "ll84_%s_%s.npy" % (kind, year)))

    X_val = _load("X")
    Y_val = _load("Y")
    valClassVals = _load("classVals")
    return X_val, Y_val, valClassVals

# Load models from file

In [4]:
regressors = []
regressorNames = []

# Unpickle every file found in the trained-models directory. Each file name is
# echoed as it is read, and the text before the first "_" in the file name is
# kept as the model's display name.
modelDir = "output/trainedModels/"
for fn in os.listdir(modelDir):
    print(fn)

    # NOTE: unpickling can execute arbitrary code, so this directory must only
    # contain model files produced by this project.
    # The context manager closes each file handle as soon as the model is
    # loaded instead of leaving it open until garbage collection.
    with open(os.path.join(modelDir, fn), "rb") as modelFile:
        regressor = pickle.load(modelFile)
    regressorName = fn.split("_")[0]

    regressors.append(regressor)
    regressorNames.append(regressorName)

numRegressors = len(regressors)

KNN Regressor_trained.p
Linear Regression_trained.p
XGBoost_trained.p
Bagging_trained.p
Linear SVR_trained.p
Extra Trees Regressor_trained.p
Random Forest Regressor_trained.p
ElasticNet_trained.p
AdaBoost_trained.p
SVR_trained.p
MLP Regressor_trained.p
Lasso_trained.p
Ridge Regressor_trained.p


In [5]:
# Unpickle the saved feature scaler. The context manager closes the file handle
# right after loading instead of leaving it open until garbage collection.
# NOTE: unpickling can execute arbitrary code; only load a scaler file produced
# by this project.
with open("output/scaler.p", "rb") as scalerFile:
    scaler = pickle.load(scalerFile)

# Run validation

In [6]:
# Read the three NYC validation arrays from disk via load_NYC().
X_val, Y_val, valClassVals = load_NYC()
# Transform the validation features with the unpickled scaler; the models in the
# validation loop predict on this scaled array.
# NOTE(review): presumably this scaler was fitted on the CBECS training
# features -- confirm against the notebook that writes output/scaler.p.
X_val_scaled = scaler.transform(X_val)

In [7]:
# Score every loaded regressor on the scaled NYC validation features and print
# one line per metric under the regressor's name.
for regressorName, regressor in zip(regressorNames, regressors):
    Y_pred = regressor.predict(X_val_scaled)
    # Negative predictions are floored at zero before scoring.
    Y_pred[Y_pred<0] = 0

    # One score per metric, index-aligned with metricNames.
    scores = [metrics[j](Y_val, Y_pred) for j in range(numMetrics)]

    print(regressorName)
    for j in range(numMetrics):
        print("\t%s - %0.4f" % (metricNames[j], scores[j]))

KNN Regressor
	Mean Absolute Error - 0.3270
	10^Mean AE - 2.1232
	Median Absolute Error - 0.2065
	10^Median AE - 1.6087
	$r^2$ - 0.3308
Linear Regression
	Mean Absolute Error - 0.6596
	10^Mean AE - 4.5669
	Median Absolute Error - 0.5538
	10^Median AE - 3.5794
	$r^2$ - -0.8592
XGBoost
	Mean Absolute Error - 0.2497
	10^Mean AE - 1.7772
	Median Absolute Error - 0.1505
	10^Median AE - 1.4140
	$r^2$ - 0.5147
Bagging
	Mean Absolute Error - 0.2701
	10^Mean AE - 1.8627
	Median Absolute Error - 0.1703
	10^Median AE - 1.4801
	$r^2$ - 0.4635
Linear SVR
	Mean Absolute Error - 0.6613
	10^Mean AE - 4.5848
	Median Absolute Error - 0.5252
	10^Median AE - 3.3514
	$r^2$ - -1.1295
Extra Trees Regressor
	Mean Absolute Error - 0.2753
	10^Mean AE - 1.8847
	Median Absolute Error - 0.1713
	10^Median AE - 1.4835
	$r^2$ - 0.4588
Random Forest Regressor
	Mean Absolute Error - 0.2664
	10^Mean AE - 1.8467
	Median Absolute Error - 0.1681
	10^Median AE - 1.4727
	$r^2$ - 0.4713
ElasticNet
	Mean Absolute Error - 0.984