In [None]:
# Mount Google Drive inside the Colab runtime; the data-loading cells below
# read their inputs from paths under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!pip install scikit-learn==0.24

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
# regression model
from sklearn.ensemble import RandomForestRegressor
# metrics packages
from sklearn.metrics import make_scorer, mean_absolute_percentage_error, mean_squared_error

import warnings
# Suppress NumPy's VisibleDeprecationWarning for the whole session.
# NOTE(review): presumably raised by the ragged tuples (dict + arrays) assigned
# to DataFrame rows in the functions below -- confirm before removing.
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

# functions

In [None]:
def hyper_parameter_search(data, X, baseline_start, baseline_end, regressor):
  """Run a hyper-parameter search per account on the baseline period.

  Parameters
  ----------
  data : pandas.DataFrame
      One row per account; columns are selected with the timestamps of the
      baseline date range, so they must be labelled by those timestamps.
  X : pandas.DataFrame
      Feature matrix whose index contains the baseline timestamps.
  baseline_start, baseline_end : date-like
      Bounds of the baseline period (sampled every second month start, '2MS').
  regressor : search estimator
      Object whose ``fit`` returns a fitted estimator exposing ``predict``,
      ``best_params_`` and ``best_score_`` (e.g. ``RandomizedSearchCV``).
      The same object is re-fitted for every account.

  Returns
  -------
  pandas.DataFrame
      Indexed like ``data``, with columns ``params``, ``best_score``, ``mape``
      and ``rmse``. ``mape`` and ``rmse`` are in-sample errors: they are
      computed on the same baseline data the model was fitted on.
  """
  # get date range for baseline and select data
  time_base = pd.date_range(start=baseline_start, end=baseline_end, freq='2MS')
  data_base = data[time_base]

  # get features for the baseline period
  X_base = X.loc[time_base]

  # initialize best params matrix
  best_params = pd.DataFrame(index=data_base.index, columns=['params','best_score', 'mape', 'rmse'])

  # normalize data; the scaler is created locally so the function does not
  # depend on a global `scaler` left over from another cell
  scaler = MinMaxScaler()
  X_base_scaled = scaler.fit_transform(X_base)

  # search the hyper-parameters for each account separately
  for i, row in data_base.iterrows():
    y = row.to_numpy()
    model = regressor.fit(X_base_scaled, y)
    y_pred = model.predict(X_base_scaled)
    mape = mean_absolute_percentage_error(y_true=y, y_pred=y_pred)
    # RMSE as sqrt(MSE): identical for a single target and independent of the
    # `squared` keyword, which newer scikit-learn releases no longer accept
    rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))

    best_params.loc[i] = (model.best_params_, model.best_score_, mape, rmse)

  return best_params

In [None]:
def run_model(data, climate_data, baseline_start, baseline_end, pred_start, pred_end,  best_params):
  """Fit one random forest per account on the baseline and predict quantiles.

  Parameters
  ----------
  data : pandas.DataFrame
      One row per account; columns are selected with the timestamps of the
      baseline and prediction date ranges.
  climate_data : pandas.DataFrame
      Feature matrix whose index contains both the baseline and the
      prediction timestamps.
  baseline_start, baseline_end : date-like
      Bounds of the training (baseline) period, sampled at '2MS'.
  pred_start, pred_end : date-like
      Bounds of the prediction period, sampled at '2MS'.
  best_params : pandas.DataFrame
      Must have a ``params`` column holding, per account, the keyword
      arguments passed to ``RandomForestRegressor``.

  Returns
  -------
  pandas.DataFrame
      Indexed like ``data``, with columns ``params``, ``q0.1`` ... ``q0.9``
      (arrays of per-period quantile predictions), ``y_true`` (observed values
      over the prediction period) and ``date``.
  """
  # get date range for baseline and select data
  time_base = pd.date_range(start=baseline_start, end=baseline_end, freq='2MS')
  data_base = data[time_base]

  # get date range for prediction and select data
  time_pred = pd.date_range(start=pred_start, end=pred_end, freq='2MS')
  data_pred = data[time_pred]

  # Fit the scaler on the baseline features only and reuse that fitted scaling
  # for the prediction features. Re-fitting on the prediction period would map
  # the same raw value to a different scaled value than the one the trees were
  # trained on.
  scaler = MinMaxScaler()
  X_base_scaled = scaler.fit_transform(climate_data.loc[time_base])
  X_pred_scaled = scaler.transform(climate_data.loc[time_pred])

  # initialize quantile predictions
  quantiles = [0.1, 0.2, 0.5, 0.8, 0.9]
  quantile_columns = ['q{}'.format(q) for q in quantiles]
  prediction_rf = pd.DataFrame(index=data_base.index, columns=['params'] + quantile_columns + ['y_true', 'date'])

  # predict consumption during the prediction period for each account
  for i, row in data_base.iterrows():
    y = row.to_numpy()
    params = best_params['params'].loc[i]
    regressor = RandomForestRegressor(**params)
    regressor.fit(X_base_scaled, y)
    preds = [rf_quantile(regressor, X_pred_scaled, q) for q in quantiles]
    # NOTE(review): 'date' stores the baseline dates although the quantiles and
    # y_true cover the prediction period; time_pred looks like the intended
    # value -- confirm against downstream consumers before changing.
    prediction_rf.loc[i] = (params, *preds, data_pred.loc[i], time_base)

  return prediction_rf

In [None]:
def rf_quantile(m, X, q):
    """Return the q-quantile of the individual estimators' predictions.

    Parameters
    ----------
    m : fitted ensemble exposing ``estimators_``; each member needs ``predict``.
    X : feature matrix, handed unchanged to every estimator.
    q : quantile as a fraction (0.5 is the median); scaled to a percentile.

    Returns
    -------
    numpy.ndarray holding one quantile value per record of ``X``.
    """
    # Stack the per-estimator predictions, then flip so each row is one record.
    per_record = np.array([tree.predict(X) for tree in m.estimators_]).T
    return np.percentile(per_record, 100 * q, axis=1)

# data loading and preprocessing


In [None]:
# Read the climatic and employment feature table from Drive; the first CSV
# column becomes the index.
features = pd.read_csv(
    '/content/drive/MyDrive/Stanford-TUBerlin/CodePaper/climate_economic_features',
    index_col=0,
)

In [None]:
# process features
# Parse the index into timestamps so rows can be selected with a date range
# (run_model / hyper_parameter_search index the features with pd.date_range).
features.index= pd.to_datetime(features.index)
# Add the month name of each row as a categorical feature.
# Note: this adds the 'month' column to `features` in place.
features['month'] = features.index.month_name()
# create binary variable for month
# X keeps the original feature columns and replaces 'month' with one indicator
# column per month name present in the data.
X = pd.get_dummies(features, columns = ['month'])

# Regression

In [None]:
# Account-group codes; each one selects a consumption matrix and its matching
# hyper-parameter table on Drive.
groups = ['SB', 'SD', 'MB', 'MD']

for i in groups:
  print(i)
  # Actual-usage matrix of the group; only consumed by the disabled run_model
  # call below.
  # NOTE: read_pickle can execute arbitrary code -- load trusted files only.
  consumption_data = pd.read_pickle('/content/drive/MyDrive/Stanford-TUBerlin/CodePaper/Consumption_Matrices/{}_accounts_actual_usage'.format(i))
  # Stored hyper-parameter table of the group (presumably the output of
  # hyper_parameter_search -- it is read here for its 'rmse' column).
  best_params = pd.read_pickle('/content/drive/MyDrive/Stanford-TUBerlin/CodePaper/HyperparameterSearch/Hyperparams_{}/best_params_{}'.format(i,i))
  # Prediction run and export are currently disabled; uncomment both lines to
  # regenerate the pred_{group} pickles.
  #pred = run_model(consumption_data, X, '1/1/2002', '11/1/2007','1/1/2008', '7/1/2020',  best_params)
  #pred.to_pickle('/content/drive/MyDrive/Stanford-TUBerlin/CodePaper/RegressionModel/pred_{}'.format(i))
  # Minimum, maximum and mean of the stored RMSE values for this group.
  print(best_params.rmse.min(), best_params.rmse.max(), best_params.rmse.mean())


SB
4.283417534439648 605.0798647496375 65.68149546999275
SD
2.758927733029219 341.82421807041715 32.57791641956227
MB
8.715431339652776 2986.20487608477 161.68665257814473
MD
13.066729599256659 2688.4886489518567 465.69928042273887
