In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
history = pd.read_csv("new_history.csv")

In [3]:
history.head()

Unnamed: 0,Hour,Temperature,Load
0,1,43.72,1384494.0
1,2,42.72,1392822.0
2,3,41.84,1407887.0
3,4,41.04,1438658.0
4,5,40.56,1484046.0


In [4]:
len(history)

35064

In [5]:
X = history.drop("Load",axis=1)
y = history["Load"]

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()

In [10]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.fit_transform(X_test)

Support Vector Machines - Regression
There are three different implementations of Support Vector Regression: SVR, NuSVR and LinearSVR. 
LinearSVR provides a faster implementation than SVR but only considers the linear kernel, 
while NuSVR implements a slightly different formulation than SVR and LinearSVR. See Implementation 
details for further details.

In [11]:
from sklearn.svm import SVR,LinearSVR

Setting C: C is 1 by default and it’s a reasonable default choice. If you have a lot of noisy observations you should decrease it: decreasing C corresponds to more regularization.

LinearSVC and LinearSVR are less sensitive to C when it becomes large, and prediction results stop improving after a certain threshold. Meanwhile, larger C values will take more time to train, sometimes up to 10 times longer

Epsilon: https://stats.stackexchange.com/questions/259018/meaning-of-epsilon-in-svm-regression

In [12]:
base_model = SVR()

In [13]:
base_model.fit(scaled_X_train,y_train)

In [14]:
base_preds = base_model.predict(scaled_X_test)

Evaluation

In [15]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [16]:
mean_absolute_error(y_test,base_preds)

337079.48441972566

In [17]:
test_RMSE = np.sqrt(mean_squared_error(y_test,base_preds))
test_RMSE

433585.2912574081

In [18]:
y_test.mean()

1340897.995674905

In [19]:
percent_RMSE = (test_RMSE/y_test.mean())*100
percent_RMSE

32.33544182003006

In [20]:
# This is imported from the supplemental .py file
# https://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane.html
from svm_margin_plot import plot_svm_boundary

Grid Search in Attempt for Better Model

In [21]:
param_grid = {'C':[0.1,0.5,1],
             'kernel':['rbf','poly'],
              'degree':[2,3],
              'epsilon':[0,0.01,0.1]}

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
svr = SVR()
grid = GridSearchCV(svr,param_grid=param_grid)

In [24]:
grid.fit(scaled_X_train,y_train)

In [25]:
grid.best_params_

{'C': 1, 'epsilon': 0.01, 'kernel': 'rbf'}