# Train models on CBECS

In this notebook we train each regression model on the "common feature" dataset, then save each trained model to disk so that it can be applied elsewhere.

In [1]:
%matplotlib inline
import sys
import os
import time

import pandas as pd
import numpy as np

import cPickle as pickle

import CBECSLib

#sklearn base
import sklearn.base

#sklearn utility
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory that receives one pickle per trained regressor, written by the
# training cell below as "<regressorName>_trained.p".
OUTPUT_DIR = "output/trainedModels/"

In [3]:
# Short local aliases for names defined in CBECSLib, so the cells below can
# refer to them without the module prefix.
pbaLabels, pbaPlusLabels = CBECSLib.pbaLabels, CBECSLib.pbaPlusLabels

getDataset, getClassFrequencies, getDataSubset = (
    CBECSLib.getDataset,
    CBECSLib.getClassFrequencies,
    CBECSLib.getDataSubset,
)

In [4]:
# Regressor prototypes from CBECSLib. The training cell indexes `regressors`
# and `regressorNames` in parallel for i in range(numRegressors).
regressors, regressorNames, numRegressors = (
    CBECSLib.regressors,
    CBECSLib.regressorNames,
    CBECSLib.numRegressors,
)

# Evaluation metrics from CBECSLib. Each entry of `metrics` is called as
# metric(y_true, y_pred); `metricNames` is indexed in parallel with it.
metrics, metricNames, numMetrics = (
    CBECSLib.metrics,
    CBECSLib.metricNames,
    CBECSLib.numMetrics,
)

## Create regression models and save to disk

In [5]:
# Train every regressor on the full, standardized dataset and pickle both the
# fitted scaler and each fitted model so they can be re-loaded elsewhere.

# Load the dataset; the column names are printed so the feature order used for
# training is recorded alongside the saved models.
# (Single-argument print(...) behaves the same as the Python 2 print statement.)
X, Y, columnNames, classVals = getDataset(1, pbaOneHot=True)
print(columnNames)
classOrdering, classFrequencies = getClassFrequencies(classVals)
numClassVals = len(classFrequencies)
Y = np.log10(Y)  # models are fit on, and scored against, the log10 target

# Standardize the features. The fitted scaler is persisted because the saved
# models were trained on scaled inputs.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Make sure the output locations exist so the dumps below cannot fail on a
# fresh checkout.
scalerPath = "output/scaler.p"
for outDir in (os.path.dirname(scalerPath), OUTPUT_DIR):
    if outDir and not os.path.isdir(outDir):
        os.makedirs(outDir)

# `with` guarantees the file is flushed and closed even if pickling raises.
with open(scalerPath, "wb") as scalerFile:
    pickle.dump(scaler, scalerFile)

for i in range(numRegressors):
    # Clone so the shared prototype in `regressors` is never fitted in place.
    regressor = sklearn.base.clone(regressors[i])
    regressorName = regressorNames[i]

    print(regressorName)

    # train model
    regressor.fit(X_scaled, Y)

    # predict model
    # NOTE(review): predictions are made on the training data itself, so the
    # scores printed below are in-sample fit, not a held-out estimate.
    predicted = regressor.predict(X_scaled)
    # Negative predictions are floored at 0 before scoring.
    # NOTE(review): Y is log10-transformed here; confirm this floor is intended.
    predicted[predicted < 0] = 0

    # evaluate model: one score per entry of `metrics`, in the same order as
    # `metricNames`
    scores = [metric(Y, predicted) for metric in metrics]
    print(scores)

    modelPath = os.path.join(OUTPUT_DIR, "%s_trained.p" % (regressorName))
    with open(modelPath, "wb") as modelFile:
        pickle.dump(regressor, modelFile)

20 classes
['SQFT' 'CDD65' 'HDD65' 'NFLOOR' 'PBA 1' 'PBA 2' 'PBA 4' 'PBA 5' 'PBA 6'
 'PBA 7' 'PBA 8' 'PBA 11' 'PBA 12' 'PBA 13' 'PBA 14' 'PBA 15' 'PBA 16'
 'PBA 17' 'PBA 18' 'PBA 23' 'PBA 24' 'PBA 25' 'PBA 26' 'PBA 91']
Linear Regression
[0.51971655364473335, 3.309150761689839, 0.43036557994203406, 2.6938014350676145, 0.53846385539177355]
Ridge Regressor
[0.51963978624712637, 3.3085658764285073, 0.42993141750412622, 2.6911097982961314, 0.53846473730738165]
SVR
[0.37184666793365806, 2.3542179566520107, 0.27371489175351105, 1.8780834765063645, 0.72203247138834037]
Lasso
[0.78991319184111575, 6.1647176721297914, 0.68863716581377776, 4.8824428199896053, 0.0]
ElasticNet
[0.75934058596935938, 5.7456687710419789, 0.67488148136559722, 4.7302215395924767, 0.09159617864103442]
Linear SVR
[0.51112992723713746, 3.2443666406839435, 0.39916825166096981, 2.5070803423514318, 0.52250078265606892]
AdaBoost
[0.44594937775692328, 2.7922183556498887, 0.37531549844048584, 2.3730970444549571, 0.6710984992377