In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:
def svr_forecast_helper(model, last_window, horizon):
    """Produce a recursive multi-step forecast from a one-step-ahead model.

    Each prediction is appended to the history and becomes part of the input
    window for the next step, so the model is called ``horizon`` times.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``; it is called with an array of
        shape ``(1, len(last_window))`` and the first element of its result is
        taken as the next value.
    last_window : sequence
        The most recent observations, oldest first. Any sequence is accepted
        (list, tuple, NumPy array, ...); it is never modified.
    horizon : int
        Number of future steps to forecast.

    Returns
    -------
    list
        The ``horizon`` predicted values, in chronological order.

    Raises
    ------
    ValueError
        If ``last_window`` is empty.
    """
    window_size = len(last_window)
    if window_size == 0:
        # A slice of [-0:] would silently select the entire (growing) history.
        raise ValueError("last_window must contain at least one value")

    # list(...) both copies the caller's data and guarantees .append exists,
    # which a NumPy array would not provide.
    history = list(last_window)
    predictions = []

    for _ in range(horizon):
        features = np.asarray(history[-window_size:]).reshape(1, -1)
        next_prediction = model.predict(features)[0]
        predictions.append(next_prediction)
        history.append(next_prediction)

    return predictions

In [10]:
# copied from arima utils

def rolling_origin_eval_prep(df_train, df_test, horizon):
    """Build the train/test pairs for a rolling-origin evaluation.

    For every origin ``k`` from 0 up to ``len(df_test) - horizon`` the training
    frame is ``df_train`` extended by the first ``k`` rows of ``df_test``, and
    the matching test frame is the next ``horizon`` rows of ``df_test``.

    Returns
    -------
    tuple of (list, list)
        ``(train_split, test_split)``, two lists of equal length holding the
        frames for each origin. Both are empty when ``horizon`` exceeds the
        number of test rows.
    """
    origins = range(0, len(df_test) - horizon + 1)

    # Training data grows by one test row per origin.
    train_split = [
        pd.concat([df_train, df_test.iloc[:origin]], axis = 0)
        for origin in origins
    ]
    # Test window: the `horizon` rows that start at the origin.
    test_split = [df_test.iloc[origin:origin + horizon] for origin in origins]

    return train_split, test_split

def cal_smape(actual, forecast):
    """Return the symmetric mean absolute percentage error (sMAPE) in percent.

    Each term is ``|actual - forecast| / ((|actual| + |forecast|) / 2)``, so the
    result lies between 0 and 200. Points where both the actual and the
    forecast value are zero contribute 0 to the mean.

    Parameters
    ----------
    actual, forecast : array-like
        Observed and predicted values of matching (or broadcastable) shape.

    Returns
    -------
    float
        ``100 * mean(per-point ratio)``.
    """
    actual = np.asarray(actual, dtype = float)
    forecast = np.asarray(forecast, dtype = float)

    denominator = (np.abs(actual) + np.abs(forecast)) / 2
    abs_error = np.abs(actual - forecast)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0 they were initialised with. This avoids the divide-by-zero
    # RuntimeWarning that dividing first and masking afterwards would trigger.
    ratio = np.divide(
        abs_error,
        denominator,
        out = np.zeros_like(denominator),
        where = denominator != 0,
    )
    return 100 * np.mean(ratio)


def evaluate(actual, forecast):
    """Compute MAE, MSE, RMSE and sMAPE for one forecast.

    All four metrics are calculated at full precision and rounded to three
    decimals only when returned, so the RMSE is the square root of the exact
    MSE rather than of its rounded value.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, smape)``, each rounded to 3 decimal places.
    """
    mae = mean_absolute_error(actual, forecast)
    mse = mean_squared_error(actual, forecast)
    rmse = np.sqrt(mse)  # taken from the unrounded MSE to avoid double rounding
    smape = cal_smape(actual, forecast)
    return round(mae, 3), round(mse, 3), round(rmse, 3), round(smape, 3)


def data_prep(test, dataset):
    """Load the train and test CSV files for a dataset and standardise the target name.

    Parameters
    ----------
    test : str
        Suffix of the test file, used as ``{dataset}_test_{test}.csv``.
    dataset : str
        Dataset name. For "weather" the "temperature" column is renamed to "y";
        for "carbon" the "carbon_intensity" column is renamed to "y". Any other
        name leaves the columns untouched.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(df_train, df_test)``.
    """
    train_path = f"../data/train/{dataset}_train.csv"
    test_path = f"../data/test/{dataset}_test_{test}.csv"
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    # Name of the raw target column for each known dataset.
    target_columns = {"weather": "temperature", "carbon": "carbon_intensity"}

    if dataset in target_columns:
        column_map = {"date": "date", target_columns[dataset]: "y"}
        df_train = df_train.rename(columns = column_map)
        df_test = df_test.rename(columns = column_map)

    return df_train, df_test


In [None]:

# Rolling-origin evaluation of a default SVR on every dataset / horizon pair.
# For each pair, one CSV with the actual values and one forecast column per
# origin is written to ../out, and the error metrics of every origin are
# collected in ../out/svr_results.csv.

dataset_label = ["weather", "carbon"]
test_size_label = ["small", "large"]
horizons = [5, 50]
all_metrics = []

# Number of lagged observations used as features for the SVR.
window_size = 3


for dataset in dataset_label:
    # `horizons` and `test_size_label` are parallel lists: horizon 5 is run on
    # the "small" test file and horizon 50 on the "large" one.
    for horizon, test_size in zip(horizons, test_size_label):

        print("Testing!\n")
        print(f"{dataset} dataset: running horizon {horizon} (test size {test_size})\n")

        df_train, df_test = data_prep(test_size, dataset)
        train_split, test_split = rolling_origin_eval_prep(df_train, df_test, horizon)

        # Use the real date column when it exists; the positional index is only
        # a fallback for files without one.
        if "date" in df_test.columns:
            date_values = df_test["date"].values
        else:
            date_values = np.asarray(df_test.index)

        # Columns are gathered in a dict and turned into a frame once, instead
        # of inserting one column per origin into an existing DataFrame.
        forecast_columns = {"date": date_values, "y": df_test["y"].values}

        for i, (curr_train, curr_test) in enumerate(zip(train_split, test_split)):

            y_train = curr_train["y"].values

            # Sliding-window training data: each row holds `window_size`
            # consecutive values and the target is the value that follows them.
            X_train = [y_train[j:j + window_size] for j in range(len(y_train) - window_size)]
            y_target = y_train[window_size:]

            model = SVR()
            model.fit(X_train, y_target)

            last_window = list(y_train[-window_size:])
            forecast = svr_forecast_helper(model, last_window, horizon)

            actual = curr_test["y"].values

            # Pad with NaN so the forecast lines up with the test rows it covers.
            forecast_columns[f"{i}"] = np.concatenate([
                np.repeat(np.nan, i),
                forecast,
                np.repeat(np.nan, len(df_test) - i - horizon),
            ])

            mae, mse, rmse, smape = evaluate(actual, forecast)

            # evaluate() already rounds to 3 decimals; rounding again here would
            # truncate every metric to a whole number.
            all_metrics.append({"dataset": dataset,
                                "test_size": test_size,
                                "horizon": horizon,
                                "iter": i,
                                "mae": mae,
                                "mse": mse,
                                "rmse": rmse,
                                "smape": smape})

        # Written once per dataset / horizon pair, after all origins are done.
        forecast_df = pd.DataFrame(forecast_columns)
        forecast_df.to_csv(f"../out/svr_{horizon}_{dataset}.csv", index = False)

    # Refreshed after every dataset so partial results survive an interrupted run.
    metrics_df = pd.DataFrame(all_metrics)
    metrics_df.to_csv("../out/svr_results.csv")

In [None]:
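# TODO: verify that sMAPE is calculated correctly. Iteration 3 of the weather dataset at horizon 5 gave a value of 146, which looks suspiciously high (although this sMAPE definition can range from 0 to 200).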

In [84]:
# Grid search setup
#
# Parameters to tune: kernel, gamma, C, epsilon, and the sliding-window size.
# Example of a manually chosen configuration:
# model = SVR(kernel='rbf', gamma=0.5, C=10, epsilon=0.05)

# Candidate values for each SVR hyperparameter.
param_dict = dict(
    C = [0.1, 1, 10, 100],
    epsilon = [0.01, 0.1, 0.5],
    gamma = ["scale", "auto", 0.1, 0.5],
)

# Candidate sliding-window sizes (open question: are these the right values?).
windows = [5, 10, 24]

In [81]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid search over the SVR hyperparameters listed in `param_dict`, with verbose
# progress output (verbose = 2) and parallel execution (n_jobs = -1).
svr = SVR()
grid_search = GridSearchCV(estimator = svr, param_grid = param_dict, verbose = 2, n_jobs = -1)

# TODO: `...` is a placeholder, not real data; pass the training features and
# targets here before running this cell.
# NOTE(review): no `cv` argument is given, so the default cross-validation
# splitter is used; for time-series data a chronological splitter is presumably
# wanted. Confirm.
grid_search.fit(...)
# NOTE(review): despite the name, this holds the best estimator found by the
# search, not a CSV.
best_csv = grid_search.best_estimator_
