# NYC - Run experiments

In this notebook we run the experiments that are shown in Table 7.

Note that the results shown in the first row of Table 7 are generated in `NYC - Validation.ipynb`.

In [1]:
%matplotlib inline
import sys
import os
import time

import pandas as pd
import numpy as np

import CBECSLib

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-paper')

#sklearn base
import sklearn.base

#sklearn utility
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

#IPython utilities
from IPython.display import HTML,display
def show(html):
    display(HTML(html))

In [2]:
RESULTS_DIR = "results/" # where output is written to
DATASET = "VAL" # for the extended set of features use 0, for the common set of features use 1

In [3]:
def load_NYC():
    X_val = np.load("output/nyc/ll84_X_2016.npy")
    Y_val = np.load("output/nyc/ll84_Y_2016.npy")
    valClassVals = np.load("output/nyc/ll84_classVals_2016.npy")
    return X_val, Y_val, valClassVals

In [4]:
pbaLabels = CBECSLib.pbaLabels
pbaPlusLabels = CBECSLib.pbaPlusLabels

getClassFrequencies = CBECSLib.getClassFrequencies
getDataSubset = CBECSLib.getDataSubset

## Create Regression Models

In [5]:
regressors = CBECSLib.regressors
regressorNames = CBECSLib.regressorNames
numRegressors = CBECSLib.numRegressors

metrics = CBECSLib.metrics
metricNames = CBECSLib.metricNames
numMetrics = CBECSLib.numMetrics

# Experiments Training on All Data, Testing on All Data

In [6]:
X,Y,classVals = load_NYC()
classOrdering,classFrequencies = getClassFrequencies(classVals)
numClassVals = len(classFrequencies)

numSplits = 3
numRepeats = 10
outputFn = "test_all_VAL"

results = np.zeros((numRepeats, numSplits, numRegressors, numMetrics), dtype=float)

for i in range(numRepeats):
    print "Repetition %d" % (i)

    kf = StratifiedKFold(n_splits=numSplits)
    for j, (train, test) in enumerate(kf.split(X,classVals)):
        #print "\tSplit %d" % (j)
        X_train, X_test = X[train,:], X[test,:]
        Y_train, Y_test = Y[train], Y[test]
        classVals_train, classVals_test = classVals[train].copy(), classVals[test].copy()

        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        for k in range(numRegressors):
            regressor = sklearn.base.clone(regressors[k])
            regressorName = regressorNames[k]

            #train model
            regressor.fit(X_train,Y_train)

            #predict model
            predicted = regressor.predict(X_test)
            predicted[predicted<0] = 0

            #evaluate model
            for m,metric in enumerate(metrics):
                metricName = metricNames[m]
                score = metric(Y_test,predicted)
                results[i,j,k,m] = score
        
results = np.array(results)

results = results.reshape(-1, numRegressors, numMetrics)

classNames = [pbaLabels[pbaLabel] for pbaLabel in classOrdering]

meanResults = results.mean(axis=0)
meanResultTable = pd.DataFrame(meanResults, index=regressorNames, columns=metricNames)
meanResultTable.to_csv(os.path.join(RESULTS_DIR, "%s_means.csv" % (outputFn)))

stdResults = results.std(axis=0)
stdResultTable = pd.DataFrame(stdResults, index=regressorNames, columns=metricNames)
stdResultTable.to_csv(os.path.join(RESULTS_DIR, "%s_stds.csv" % (outputFn)))

formattedResults = []
for i in range(numRegressors):
    row = []
    for j in range(numMetrics):
        row.append("%0.2f +/- %0.2f" % (meanResults[i,j], stdResults[i,j]))
    formattedResults.append(row)
formattedResults = np.array(formattedResults)
formattedResultsTable = pd.DataFrame(formattedResults, index=regressorNames, columns=metricNames)
formattedResultsTable.to_csv(os.path.join(RESULTS_DIR, "%s_formatted.csv" % (outputFn)))

display(formattedResultsTable)

Repetition 0
Repetition 1
Repetition 2




Repetition 3
Repetition 4
Repetition 5
Repetition 6
Repetition 7
Repetition 8
Repetition 9


Unnamed: 0,Mean Absolute Error,10^Mean AE,Median Absolute Error,10^Median AE,$r^2$
Linear Regression,0.29 +/- 0.02,1.95 +/- 0.10,0.19 +/- 0.01,1.56 +/- 0.05,0.44 +/- 0.08
Ridge Regressor,0.29 +/- 0.02,1.95 +/- 0.10,0.19 +/- 0.01,1.55 +/- 0.04,0.44 +/- 0.08
SVR,0.25 +/- 0.02,1.77 +/- 0.10,0.15 +/- 0.01,1.40 +/- 0.03,0.51 +/- 0.11
Lasso,0.45 +/- 0.01,2.80 +/- 0.04,0.33 +/- 0.01,2.13 +/- 0.06,-0.01 +/- 0.00
ElasticNet,0.45 +/- 0.01,2.80 +/- 0.04,0.33 +/- 0.01,2.13 +/- 0.06,-0.01 +/- 0.00
Linear SVR,0.28 +/- 0.02,1.92 +/- 0.07,0.17 +/- 0.00,1.50 +/- 0.01,0.42 +/- 0.05
AdaBoost,0.42 +/- 0.08,2.69 +/- 0.49,0.30 +/- 0.05,2.03 +/- 0.26,0.13 +/- 0.24
Bagging,0.29 +/- 0.02,1.95 +/- 0.09,0.18 +/- 0.02,1.50 +/- 0.05,0.42 +/- 0.08
XGBoost,0.24 +/- 0.02,1.76 +/- 0.09,0.15 +/- 0.01,1.40 +/- 0.03,0.53 +/- 0.09
Random Forest Regressor,0.29 +/- 0.02,1.95 +/- 0.10,0.18 +/- 0.01,1.50 +/- 0.05,0.42 +/- 0.08
