# Demand forecasting based on weather

In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

Datapath = "../Data/"

# Extracting
I use the pickle files that have already been preprocessed "Assignment 4 ETL.ipynb" and the fitted model from "Assignment 4 Weather prediction.ipynb" 

In [15]:
df_train = pd.read_pickle(Datapath+"df_train.p")
df_test = pd.read_pickle(Datapath+"df_test.p")

# Preprocessing
The goal is to predict today's and tomorrow's demand based on the weather prediction of today and tomorrow. That means our predictive model requires 4 input vectors:

 1. Predicted Temp for today
 2. Predicted Rainfall for today
 3. Predicted Temp for tomorrow
 4. Predicted Rainfall for tomorrow
 
And one output value:
 
 The predicted demand.
 
I prepare input vectors X and output values y that reflect these requirements.

In [9]:
vals_train = df_train[['Temp', 'Rainfall']].values
X_train = np.array([list(vals_train[i]) + list(vals_train[i+1]) for i in range(len(vals_train)-1)])
# We can not predict for the final day as we do not have the weather prediction for the day after.
y_train = np.array(df_train['Demand'].values)[:-1]

vals_test = df_test[['Temp', 'Rainfall']].values
X_test = np.array([list(vals_test[i]) + list(vals_test[i+1]) for i in range(len(vals_test)-1)])
# We can not predict for the final day as we do not have the weather prediction for the day after.
y_test = np.array(df_test['Demand'].values)[:-1]

We do some quick testing on Random forests...

In [10]:
m = RFR(n_estimators=80, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
m.score(X_test, y_test)

0.6900477557496679

Linear regression...

In [11]:
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

0.4078725407039579

Logistic regression...

In [12]:
log = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
log.score(X_test, y_test)

0.3983516483516483

It seems that a random forest performs best, this is actually quite common so I'll do a gridsearch in order to tune the hyperparemters. We check a variety of parameters and use 5-fold validation on the combined data set from 2014 to 2017

In [14]:
X_cv = np.concatenate((X_train, X_test))
y_cv = np.concatenate((y_train, y_test))

Warning!!! The next cell (gridsearch) takes almost 40 minutes to execute.

In [77]:
parameters = {'n_estimators' : list(range(5, 101, 5)), 'min_samples_split' : list(range(2, 10)), 
              'min_samples_leaf' : list(range(2, 10))}
m = RFR(n_jobs=-1)
clf = GridSearchCV(m, parameters, cv=5)
clf.fit(X_cv, y_cv)
m = clf.best_estimator_

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9], 'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [88]:
clf.best_score_

0.6591514824573875

The model so it can be used later without doing the expensive gridsearch

In [89]:
dump(m, Datapath+"optimal_m.joblib")

['../Data/fitted_m.joblib']

In [94]:
m =  load(Datapath+"optimal_m.joblib")

In [95]:
m.fit(X_train, y_train)
m.score(X_test, y_test)

0.680327471924683

So the model performs with an R2 of about 0.68 which is not great but definitely not bad either.

Let's save it for future use.

In [96]:
dump(m, Datapath+"fitted_m.joblib")

['../Data/fitted_m.joblib']

In [None]:
m.predi