In [1]:
# Import libraries to use
import pandas as pd
import numpy as np
import os
import re
import random
import datetime
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from dateutil import parser
from tpot import TPOTRegressor
import tensorflow as tf
import feather
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.tree import export_graphviz

# Model Evaluation

In [2]:
# Baseline evaluation: for every building, fit a random forest, a linear
# regression and a support vector regressor on the training set, predict on
# the test set and write the three sets of predictions to a feather file.

# Directory with feather files
root_dir = 'feather/train_test/'
building_names = ['APS', 'CoServ', 'Kansas', 'NVE', 'PGE1', 'SDGE', 'SMUD', 'SRP']

# Create the three models with minimal hyperparameter selection.
# The same estimator objects are re-fit for every building.
rf = RandomForestRegressor(n_estimators = 200, max_depth = 40, criterion = 'mse', n_jobs = -1, verbose = 1)
lr = LinearRegression()
svr = SVR(kernel = 'rbf', verbose = True)

# Need to store RF feature importances (one column per building)
feature_importances = pd.DataFrame(columns = building_names)

# Iterate through all the buildings
for building in building_names:
    
    # Read in the training/testing sets. Each file is read exactly once; the
    # test labels are not loaded because this cell only writes predictions.
    X_train = np.array(feather.read_dataframe(root_dir + '%s_X_train.feather' % building))
    X_test = np.array(feather.read_dataframe(root_dir + '%s_X_test.feather' % building))
    y_train = np.array(feather.read_dataframe(root_dir + '%s_y_train.feather' % building)['forecast'])
    
    # Fit (train) the models and make predictions on the test data
    rf.fit(X_train, y_train)
    rf_predictions = rf.predict(X_test)
    
    # Save features importances of the random forest
    building_feature_importances = rf.feature_importances_
    feature_importances[building] = building_feature_importances
    
    lr.fit(X_train, y_train)
    lr_predictions = lr.predict(X_test)
    
    svr.fit(X_train, y_train)
    svr_predictions = svr.predict(X_test)
    
    # Collect the predictions in one dataframe; `columns` pins the column
    # order regardless of how the dict happens to be ordered
    predictions = pd.DataFrame({'rf': rf_predictions,
                                'lr': lr_predictions,
                                'svr': svr_predictions},
                               columns = ['rf', 'lr', 'svr'])
    
    # Write the predictions 
    feather.write_dataframe(predictions, 'feather/predictions/' + '%s_predictions.feather' % building)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.9s finished


[LibSVM]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.7s finished


[LibSVM]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   19.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


[LibSVM]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   51.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   53.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.4s finished


[LibSVM]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   19.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


[LibSVM]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   25.8s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


[LibSVM]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.9s finished


[LibSVM]

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   52.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.4s finished


[LibSVM]

### Write Feature Importances

In [3]:
# Persist the per-building random forest feature importances so they can be
# analyzed outside of this notebook
importances_path = 'feather/feature_importances/feature_importances.feather'
feather.write_dataframe(feature_importances, importances_path)

# Grid Search for Random Forest Optimization

In [2]:
# Set up a grid search over the random forest hyperparameters: feature
# subsampling, tree depth, number of trees and minimum leaf size
param_grid = dict(
    max_features = [None, 0.5],
    max_depth = [None, 40],
    n_estimators = [200, 400],
    min_samples_leaf = [1, 4],
)

# The search is run on NVE, a dataset with a high mape, to improve performance
nve_X_train = np.array(feather.read_dataframe('feather/train_test/NVE_X_train.feather'))
nve_X_test = np.array(feather.read_dataframe('feather/train_test/NVE_X_test.feather'))

# Only the 'forecast' column of the label files is used as the target
nve_y_train = np.array(feather.read_dataframe('feather/train_test/NVE_y_train.feather')['forecast'])
nve_y_test = np.array(feather.read_dataframe('feather/train_test/NVE_y_test.feather')['forecast'])

# Base estimator handed to the search; the grid supplies the tuned parameters
rf_test = RandomForestRegressor(
    bootstrap = True,
    verbose = 2,
    n_jobs = -1,
)

# Candidates are ranked by negative mean squared error
rf_grid = GridSearchCV(
    rf_test,
    param_grid,
    scoring = 'neg_mean_squared_error',
    n_jobs = -1,
    verbose = 2,
)

In [None]:
# Run the grid search on the NVE training data (long-running cell)
rf_grid.fit(X = nve_X_train, y = nve_y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 40.4min


In [8]:
# Display the hyperparameter combination selected by the grid search
rf_grid.best_params_

{'max_depth': None,
 'max_features': 0.5,
 'min_samples_leaf': 1,
 'n_estimators': 200}

## Implement Best Grid Search Parameters to Assess Performance

In [4]:
# Implement best grid search results: re-run the per-building evaluation with
# only the random forest, using the hyperparameters selected by the search.

# Directory with feather files
root_dir = 'feather/train_test/'
building_names = ['APS', 'CoServ', 'Kansas', 'NVE', 'PGE1', 'SDGE', 'SMUD', 'SRP']

# Create the random forest with the optimized hyperparameters
rf = RandomForestRegressor(n_estimators = 200, max_depth = None, min_samples_leaf = 1, max_features = 0.5,
                           criterion = 'mse', n_jobs = -1, verbose = 1)

# Need to store RF feature importances (one column per building)
feature_importances = pd.DataFrame(columns = building_names)

# Iterate through all the buildings
for building in building_names:
    
    # Read in the training/testing sets. Each file is read exactly once; the
    # test labels are not loaded because this cell only writes predictions.
    X_train = np.array(feather.read_dataframe(root_dir + '%s_X_train.feather' % building))
    X_test = np.array(feather.read_dataframe(root_dir + '%s_X_test.feather' % building))
    y_train = np.array(feather.read_dataframe(root_dir + '%s_y_train.feather' % building)['forecast'])
    
    # Fit (train) the model and make predictions on the test data
    rf.fit(X_train, y_train)
    rf_predictions = rf.predict(X_test)
    
    # Save features importances of the random forest
    building_feature_importances = rf.feature_importances_
    feature_importances[building] = building_feature_importances
    
    # Save the predictions to a dataframe
    predictions = pd.DataFrame({'rf': rf_predictions}, columns = ['rf'])
    
    # Write the predictions 
    feather.write_dataframe(predictions, 'feather/predictions/' + '%s_optimized_predictions.feather' % building)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   42.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   39.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   10.0s finished
[Parallel(n_jobs=4)]: Done  42 

In [5]:
# Persist the feature importances of the optimized random forest so they can
# be analyzed outside of this notebook
optimized_importances_path = 'feather/feature_importances/optimized_feature_importances.feather'
feather.write_dataframe(feature_importances, optimized_importances_path)

# Challenge Model

Predict the last six months.

In [3]:
# Challenge model: for every building, fit the tuned random forest on the
# challenge training set and write its predictions to a feather file.
root_dir = 'feather/challenge_predictions/'
building_names = ['APS', 'CoServ', 'Kansas', 'NVE', 'PGE1', 'SDGE', 'SMUD', 'SRP']

# Create the random forest with the hyperparameters found by the grid search
rf = RandomForestRegressor(n_estimators = 200, max_depth = None, min_samples_leaf = 1, max_features = 0.5,
                           criterion = 'mse', n_jobs = -1, verbose = 1)

# Need to store RF feature importances (building name -> importance array)
feature_importances = {}

# Iterate through all the buildings
for building in building_names:
    
    # Read in the training/testing sets. Each file is read exactly once; the
    # test labels are not loaded because this cell only writes predictions.
    X_train = np.array(feather.read_dataframe(root_dir + '%s_X_train.feather' % building))
    X_test = np.array(feather.read_dataframe(root_dir + '%s_X_test.feather' % building))
    y_train = np.array(feather.read_dataframe(root_dir + '%s_y_train.feather' % building)['forecast'])
    
    # Fit (train) the model and make predictions on the test data
    rf.fit(X_train, y_train)
    rf_predictions = rf.predict(X_test)
    
    # Save features importances of the random forest
    building_feature_importances = rf.feature_importances_
    feature_importances[building] = building_feature_importances
    
    # Save the predictions to a dataframe
    predictions = pd.DataFrame({'predictions': rf_predictions})
    
    # Write the predictions to a feather file for moving back to r for analysis
    feather.write_dataframe(predictions, 'feather/challenge_results/%s_preds.feather' % building)
    
    # Progress marker: this building is done
    print(building)
    

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   52.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.3s finished


APS


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   44.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


CoServ


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    4.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


Kansas


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   39.8s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


NVE


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    4.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


PGE1


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    8.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


SDGE


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   49.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.3s finished


SMUD


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   25.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


SRP


# Predicting Last 4 Months

# Predicting Energy Consumption (Forecasting)

In [13]:
# Month forecasting: for every building, standardize the features, fit the
# tuned random forest on the 'cleaned_energy' target and write its
# predictions to a feather file.
root_dir = 'feather/months/'
building_names = ['APS', 'CoServ', 'Kansas', 'NVE', 'PGE1', 'SDGE', 'SMUD', 'SRP']

# Create the random forest with the hyperparameters found by the grid search
rf = RandomForestRegressor(n_estimators = 200, max_depth = None, min_samples_leaf = 1, max_features = 0.5,
                           criterion = 'mse', n_jobs = -1, verbose = 1)

# Need to store RF feature importances (one column per building)
feature_importances = pd.DataFrame(columns = building_names)

# Iterate through all the buildings
for building in building_names:
    
    # Read in the training/testing sets. Each file is read exactly once; the
    # test labels are not loaded because this cell only writes predictions.
    X_train = np.array(feather.read_dataframe(root_dir + '%s_X_train.feather' % building))
    X_test = np.array(feather.read_dataframe(root_dir + '%s_X_test.feather' % building))
    y_train = np.array(feather.read_dataframe(root_dir + '%s_y_train.feather' % building)['cleaned_energy'])
    
    # Scaler is fit on the training data only and then applied to the test data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Fit (train) the model and make predictions on the test data
    rf.fit(X_train, y_train)
    rf_predictions = rf.predict(X_test)
    
    # Save features importances of the random forest
    building_feature_importances = rf.feature_importances_
    feature_importances[building] = building_feature_importances
    
    # Save the predictions to a dataframe
    predictions = pd.DataFrame({'predictions': rf_predictions})
    
    # Write the predictions to a feather file for moving back to r for analysis
    feather.write_dataframe(predictions, 'feather/predictions/%s_month_predictions.feather' % building)
    
    # Progress marker: this building is done
    print(building)
    
# Write the feature importances to a feather file for analysis
feather.write_dataframe(feature_importances, 'feather/feature_importances/month_feature_importances.feather')

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


APS


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


CoServ


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   25.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


Kansas


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.0min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


NVE


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 15.4min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.2s finished


PGE1


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   39.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


SDGE


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.3s finished


SMUD


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   57.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.1s finished


SRP


# Increased Temp Prediction

In [2]:
# Increased-temperature scenario: for every building, standardize the
# features, fit the tuned random forest on the 'forecast' labels and write
# its predictions for the scenario test set to a feather file.
root_dir = 'feather/increased/'
building_names = ['APS', 'CoServ', 'Kansas', 'NVE', 'PGE1', 'SDGE', 'SMUD', 'SRP']

# Random forest configured with the optimized hyperparameters
rf = RandomForestRegressor(
    n_estimators = 200,
    max_depth = None,
    min_samples_leaf = 1,
    max_features = 0.5,
    criterion = 'mse',
    n_jobs = -1,
    verbose = 1,
)

# One column of feature importances per building
feature_importances = pd.DataFrame(columns = building_names)

for building in building_names:

    # Load the training features, training labels and test features
    train = np.array(feather.read_dataframe(root_dir + '%s_train.feather' % building))
    labels = np.array(feather.read_dataframe(root_dir + '%s_labels.feather' % building)['forecast'])
    test = np.array(feather.read_dataframe(root_dir + '%s_test.feather' % building))

    # Learn the scaling from the training data only, then apply it to both sets
    scaler = StandardScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

    # Train the forest and predict on the scaled test set
    rf.fit(train, labels)
    rf_predictions = rf.predict(test)

    # Record this building's feature importances
    building_feature_importances = rf.feature_importances_
    feature_importances[building] = building_feature_importances

    # Single-column dataframe holding the predictions
    predictions = pd.DataFrame({'predictions': rf_predictions})

    # Hand the predictions back to r for analysis via a feather file
    feather.write_dataframe(predictions, 'feather/predictions/%s_increased_predictions.feather' % building)
    print(building)

# Persist the collected feature importances for analysis
feather.write_dataframe(feature_importances, 'feather/feature_importances/increased_feature_importances.feather')

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    5.2s finished


APS


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    4.4s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    4.6s finished


CoServ


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   49.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.3s finished


Kansas


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    2.9s finished


NVE


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   42.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.2s finished


PGE1


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   53.8s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    2.7s finished


SDGE


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    5.5s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    5.8s finished


SMUD


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.0min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    4.3s finished


SRP


# Visualizing One Decision Tree

In [2]:
# Load the SMUD training features and their 'forecast' labels
smud_train_frame = feather.read_dataframe('feather/increased/SMUD_train.feather')
smud_label_frame = feather.read_dataframe('feather/increased/SMUD_labels.feather')
smud_X_train = np.array(smud_train_frame)
smud_y_train = np.array(smud_label_frame['forecast'])

# Small, shallow forest (10 trees, depth 3, at least 50 samples per split)
rf = RandomForestRegressor(
    n_estimators = 10,
    max_depth = 3,
    verbose = 2,
    min_samples_split = 50,
)

# Fit on the unscaled SMUD data; the fitted estimator is the cell output
rf.fit(smud_X_train, smud_y_train)

building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.4s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=50,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=2, warm_start=False)

In [3]:
# Take one fitted tree (index 5) out of the forest to visualize
fitted_trees = rf.estimators_
vis_tree = fitted_trees[5]

In [4]:
# Column names of the SMUD training data, used to label the tree's features.
# (A bare `vis_tree` expression used to precede this line; it was not the last
# expression of the cell, so it displayed nothing and has been removed.)
smud_features = list(feather.read_dataframe('feather/increased/SMUD_train.feather'))

In [5]:
# Export the selected tree to a Graphviz dot file, labelled with the SMUD
# feature names
export_graphviz(
    vis_tree,
    out_file = 'treevis.dot',
    feature_names = smud_features,
    rounded = True,
)

In [6]:
import pydot

# The dot file is expected to hold exactly one graph; unpacking enforces that
[graph] = pydot.graph_from_dot_file('treevis.dot')

# Render the graph to a png image
graph.write_png('treevis_verysmall.png')

True

In [19]:
from subprocess import check_call

# Render the same dot file with the Graphviz `dot` command line tool;
# check_call raises if the command exits with a non-zero status
dot_command = ['dot', '-Tpng', 'treevis.dot', '-o', 'treevis.png']
check_call(dot_command)

0

In [4]:
# Re-create the random forest with the optimized hyperparameters (unfitted)
rf = RandomForestRegressor(
    n_estimators = 200,
    max_depth = None,
    min_samples_leaf = 1,
    max_features = 0.5,
    criterion = 'mse',
    n_jobs = -1,
    verbose = 1,
)

In [5]:
# Display the estimator's full parameter listing
rf

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
           oob_score=False, random_state=None, verbose=1, warm_start=False)