# Train models on CBECS

In this notebook we train each regression model on the "common feature" dataset, then save each trained model to disk so that it can be applied elsewhere.

In [1]:
%matplotlib inline
import sys
import os
import time

import pandas as pd
import numpy as np

import cPickle as pickle

import CBECSLib

import itertools
from collections import defaultdict, Counter

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-paper')

#sklearn base
import sklearn.base

#sklearn utility
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

#sklearn models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import Lasso, ElasticNet, SGDRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Directory that receives one "<regressor name>_trained.p" pickle per trained model.
OUTPUT_DIR = "output/trainedModels/"

In [3]:
# Module-level shorthands for the CBECSLib attributes this notebook refers to.
pbaLabels, pbaPlusLabels = CBECSLib.pbaLabels, CBECSLib.pbaPlusLabels

getDataset, getClassFrequencies, getDataSubset = (
    CBECSLib.getDataset,
    CBECSLib.getClassFrequencies,
    CBECSLib.getDataSubset,
)

In [4]:
# Each entry pairs a display name with an unfitted estimator prototype; the
# name is also used to build the pickle filename when the model is saved.
# NOTE(review): the "XGBoost" label is attached to scikit-learn's
# GradientBoostingRegressor, not to the xgboost package. The label is left
# unchanged here because it is part of the saved file's name.
_regressorSpecs = [
    ("Linear Regression", LinearRegression(n_jobs=-1)),
    ("Ridge Regressor", Ridge()),
    ("SVR", SVR()),
    ("Lasso", Lasso()),
    ("ElasticNet", ElasticNet()),
    ("Linear SVR", LinearSVR(verbose=0)),
    ("AdaBoost", AdaBoostRegressor()),
    ("Bagging", BaggingRegressor(n_jobs=-1)),
    ("XGBoost", GradientBoostingRegressor(verbose=0)),
    ("Random Forest Regressor", RandomForestRegressor(n_jobs=-1, verbose=0)),
    ("Extra Trees Regressor", ExtraTreesRegressor(n_jobs=-1, verbose=0)),
    ("MLP Regressor", MLPRegressor()),
    ("KNN Regressor", KNeighborsRegressor()),
]

# Split the pairs back into the two index-aligned lists used by later cells.
regressorNames, regressors = map(list, zip(*_regressorSpecs))
assert len(regressors) == len(regressorNames)
numRegressors = len(regressors)

# Each entry pairs a display label with the scoring function it describes.
# The functions are called as metric(y_true, y_pred).
_metricSpecs = [
    ("Mean Absolute Error", mean_absolute_error),
    ("Median Absolute Error", median_absolute_error),
    ("$r^2$", r2_score),
]

# Split the pairs back into the two index-aligned lists used by later cells.
metricNames, metrics = map(list, zip(*_metricSpecs))
assert len(metrics) == len(metricNames)
numMetrics = len(metrics)

## Create regression models and save to disk

In [6]:
def _savePickle(obj, path):
    """Pickle ``obj`` to ``path``, creating the parent directory if it is missing.

    The file is opened in a ``with`` block so the handle is flushed and closed
    even when pickling raises, instead of being left to the garbage collector.
    """
    parentDir = os.path.dirname(path)
    if parentDir and not os.path.isdir(parentDir):
        os.makedirs(parentDir)
    with open(path, "wb") as f:
        pickle.dump(obj, f)


# Load the feature matrix, target, column names and per-row class values.
X,Y,columnNames,classVals = getDataset(1,pbaOneHot=True)
print(columnNames)
classOrdering,classFrequencies = getClassFrequencies(classVals)
numClassVals = len(classFrequencies)

# All models below are fit on, and scored against, the log10 of the target.
Y = np.log10(Y)

# Standardise the features and persist the fitted scaler so the exact same
# transform can be re-applied wherever the saved models are used.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
_savePickle(scaler, "output/scaler.p")

for regressorName, regressorPrototype in zip(regressorNames, regressors):
    # Clone so the shared prototype in `regressors` is never fitted.
    regressor = sklearn.base.clone(regressorPrototype)

    print(regressorName)

    #train model
    regressor.fit(X_scaled,Y)

    #predict model
    # Predictions are made on the same rows the model was just fitted on, so
    # the scores printed below are in-sample (training) scores.
    predicted = regressor.predict(X_scaled)
    # NOTE(review): predictions are in log10 space, so this floors them at
    # log10 == 0 (i.e. 1 in the original units), not at 0 -- confirm intended.
    predicted[predicted<0] = 0

    #evaluate model
    scores = [metric(Y,predicted) for metric in metrics]
    print(scores)

    _savePickle(regressor, os.path.join(OUTPUT_DIR, "%s_trained.p" % (regressorName)))

20 classes
['SQFT' 'CDD65' 'HDD65' 'NFLOOR' 'PBA 1' 'PBA 2' 'PBA 4' 'PBA 5' 'PBA 6'
 'PBA 7' 'PBA 8' 'PBA 11' 'PBA 12' 'PBA 13' 'PBA 14' 'PBA 15' 'PBA 16'
 'PBA 17' 'PBA 18' 'PBA 23' 'PBA 24' 'PBA 25' 'PBA 26' 'PBA 91']
Linear Regression
[0.51971655364473335, 0.43036557994203406, 0.53846385539177355]
Ridge Regressor
[0.51963978624712637, 0.42993141750412622, 0.53846473730738165]
SVR
[0.37184666793365806, 0.27371489175351105, 0.72203247138834037]
Lasso
[0.78991319184111575, 0.68863716581377776, 0.0]
ElasticNet
[0.75934058596935938, 0.67488148136559722, 0.09159617864103442]
Linear SVR
[0.51107883326645631, 0.40115581898317831, 0.52337611975820342]
AdaBoost
[0.4260757246434117, 0.35264559072645252, 0.69808947298507174]
Bagging
[0.12735688552502897, 0.084665878478815237, 0.96230346612125273]
XGBoost
[0.28082793107345938, 0.20933714692904815, 0.84102974263036745]
Random Forest Regressor
[0.1286557187819253, 0.084340734999310385, 0.96005531274259925]
Extra Trees Regressor
[2.9435790489464167