Philippe Joly
MAIS 202

This is an implementation of Support Vector Regression (SVR) on Electrical Power Output based on the Weather in Quebec

In [1]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from skopt import BayesSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime as dt
import pickle
import os
from dotenv import load_dotenv
load_dotenv()

True

# Data

In [2]:
data = pd.read_csv(os.getenv("DATA"))
# data['Date/Time (UTC)'] = pd.to_datetime(data['Date/Time (UTC)'])
data.dropna(inplace=True)
data.drop(columns=['Date/Time (UTC)','Year'], inplace=True)

X_tot = data.drop(columns=["Average Power Output (MW)"]).values
y_tot = data["Average Power Output (MW)"].values

X, X_test, y, y_test = train_test_split(X_tot, y_tot, test_size=0.1, random_state=45)

X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=45)

In [3]:
print(f'Training set size: {X.shape[0]}')
print(f'Validation set size: {X_test.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')
print(f'Number of Features: {X.shape[1]}')
print(data.drop(columns=["Average Power Output (MW)"]).keys().values)

Training set size: 31514
Validation set size: 1751
Test set size: 1751
Number of Features: 6
['Temp (°C)' 'Month' 'Day' 'Hour' 'Day of Week' 'Population']


In [4]:
y = y.reshape(len(y), 1)
y_val = y_val.reshape(len(y_val), 1)
y_test = y_test.reshape(len(y_test), 1)

x_scaler = StandardScaler().fit(X)
y_scaler = StandardScaler().fit(y)

X = x_scaler.transform(X)
y = y_scaler.transform(y)

X_val = x_scaler.transform(X_val)
y_val = y_scaler.transform(y_val)

X_test = x_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)

y = y.reshape(y.shape[0])
y_val = y_val.reshape(y_val.shape[0])
y_test = y_test.reshape(y_test.shape[0])

# Hyper-Parameter Tuning

In [5]:
svr = SVR()

In [6]:
opt_params=[{
    'C': np.logspace(-4,4,10),
    'gamma': np.logspace(-6,1,10),
    'epsilon': [0.01, 0.1, 0.2, 0.5],
    'kernel': ['rbf'],
    'max_iter':[2000]
}]

In [7]:
clf = BayesSearchCV(svr, search_spaces=opt_params, cv=3, verbose=False, n_jobs=-1, n_iter=200)
best_clf = clf.fit(X,y)



In [8]:
results = pd.DataFrame(best_clf.cv_results_).sort_values(by="rank_test_score", ascending=True)

In [9]:
results.head(25)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,param_gamma,param_kernel,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
183,3.250043,0.025728,4.332995,0.027071,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
180,3.396229,0.063735,4.03471,0.058293,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
181,3.159868,0.029137,3.49504,0.062871,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
147,3.211711,0.040855,3.784475,0.074041,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
143,3.413598,0.053275,3.52286,0.025178,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
142,3.235664,0.053482,3.727103,0.024926,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
168,4.344091,0.038427,4.13312,0.0471,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
20,3.332903,0.053059,3.930169,0.134735,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
175,3.973553,0.032567,3.963091,0.097617,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1
144,2.798308,0.032144,3.695875,0.065817,2.782559,0.01,1.668101,rbf,2000,"{'C': 2.782559402207126, 'epsilon': 0.01, 'gam...",0.98184,0.981411,0.98131,0.98152,0.00023,1


In [10]:
best_clf.best_score_

0.9815202764457135

In [11]:
model_ls = []
err = []
for i in range(10):
    err.append([])
    model = SVR(**results['params'].iloc[i])
    model.fit(X, y)
    y_pred = model.predict(X_val)
    err[i].append(mean_squared_error(y_val, y_pred))
    err[i].append(r2_score(y_val, y_pred))
    err[i].append(mean_absolute_error(y_val, y_pred))
    model_ls.append(model)



In [12]:
err

[[0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294],
 [0.01832972505821601, 0.9815441917568631, 0.10808191536239294]]

In [13]:
f = "svr_bayes_rbf"
pickle.dump(model_ls[0], open(f, "wb"))

In [14]:
# svr_rbf = SVR()

In [15]:
# opt_params=[{
#     'C': np.logspace(-4,4,10),
#     'gamma': np.logspace(-6,1,10),
#     'epsilon': [0.01, 0.1, 0.2, 0.5],
#     'tol': [1e-3, 1e-4, 1e-5],
#     'kernel': ['rbf'],
#     'max_iter': [100,1000]
# }]

In [16]:
# clf_rbf = BayesSearchCV(svr_rbf, search_spaces=opt_params, cv=3, verbose=False, n_jobs=-1, n_iter=200)
# best_clf_rbf = clf_rbf.fit(X,y)

In [17]:
# results_rbf = pd.DataFrame(best_clf_rbf.cv_results_).sort_values(by="rank_test_score", ascending=True, ignore_index=True)

In [18]:

# for i in range(100):
#     print(results_rbf['params'].iloc[i])
# results_rbf.tail(25)

In [19]:
# best_clf_rbf.best_score_