## SVM

### For every company, measure accuracy, F1 and ROC AUC at different lags, using different sets of feature columns

In [6]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler


import matplotlib.pyplot as plt

import json
from tools import create_vectors, create_x_y


In [7]:
def measure(x, y):
    """Tune an RBF-kernel SVM on the first 80% of the samples and score it on the rest.

    The split is positional (no shuffling) and the hyper-parameter search uses
    ``TimeSeriesSplit``, so the samples are presumably expected in
    chronological order -- TODO confirm against the code that builds ``x``/``y``.

    Parameters
    ----------
    x : sliceable collection of feature rows
        Must support ``len(x)`` and ``x[a:b]`` row slicing.
    y : sliceable collection of labels
        Same length as ``x``. ROC AUC is computed without a ``multi_class``
        setting, so the labels have to be binary.

    Returns
    -------
    tuple
        ``(f1, acc, roc)`` -- in exactly that order -- each rounded to three
        decimals: weighted F1, accuracy and ROC AUC on the held-out last 20%.
    """
    # The scaler is part of the estimator handed to GridSearchCV, so it is
    # re-fitted on the training portion of every CV fold. Scaling once up
    # front would let each validation fold leak into the scaling statistics.
    model = make_pipeline(StandardScaler(), SVC(random_state=42))

    # make_pipeline names each step after its lower-cased class name, hence
    # the "svc__" prefix. C is left at SVC's default; only gamma is searched.
    param_grid = {
        "svc__gamma": [1, 0.01, 0.001],
        "svc__kernel": ["rbf"],
    }

    search = GridSearchCV(
        model,
        param_grid=param_grid,
        cv=TimeSeriesSplit(n_splits=5),
        verbose=1,
        n_jobs=-1,
    )

    # Positional hold-out: first 80% for tuning/fitting, last 20% for scoring.
    split = int(0.8 * len(x))
    x_train, x_test = x[:split], x[split:]
    y_train, y_test = y[:split], y[split:]

    search.fit(x_train, y_train)
    y_pred = search.predict(x_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    # ROC AUC needs a continuous ranking score; feeding it hard class labels
    # reduces the curve to a single operating point.
    roc = roc_auc_score(y_test, search.decision_function(x_test))

    return round(f1, 3), round(acc, 3), round(roc, 3)

In [8]:
# Ticker symbol -> display name. The keys double as the CSV file stems under `path`.
full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}

# Result tables, filled as metric[company][lag] = score.
acc = {}
f1_metric = {}
roc = {}

path = "../datasets/v3/binned/"

# Baseline run: the features are restricted to these five columns.
OTHER_COLS = ["open", "high", "low", "close", "vol"]

for company in full_names:
    df = pd.read_csv(path + f"{company}.csv", parse_dates=["post_date"])

    acc[company] = {}
    f1_metric[company] = {}
    roc[company] = {}

    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpacking
        # it as (acc, f1, roc) would file each score under the wrong label.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(x, y)

with open("../results/v3/SVM_base_SS.txt", "w") as file:
    to_write = "Acc: " + json.dumps(acc) + " \nF1: " + json.dumps(f1_metric) + " \nROC: " + json.dumps(roc)
    # NOTE(review): to_write is already text, so this second json.dumps stores
    # it as one quoted JSON string with escaped quotes and newlines. Kept as-is
    # so anything that already reads these files keeps working -- confirm
    # whether plain text or a single JSON object was intended.
    file.write(json.dumps(to_write))

Starting amzn, 1.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting amzn, 2.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting amzn, 3.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting amzn, 6.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting amzn, 8.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 1.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 2.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 3.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 6.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 8.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting msft, 1.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting msft, 2.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting msft, 3.
Fitting 5 folds for each of 3 candidates, tota

In [9]:
# Ticker symbol -> display name. The keys double as the CSV file stems under `path`.
full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}

# Result tables, filled as metric[company][lag] = score.
acc = {}
f1_metric = {}
roc = {}

path = "../datasets/v3/binned/"

# Columns that are never used as features in this run.
VITAL_COLS = ["post_date", "ticker", "bin_2", "bin_3"]

for company in full_names:
    df = pd.read_csv(path + f"{company}.csv", parse_dates=["post_date"])
    # Every remaining column of this company's CSV becomes a feature.
    OTHER_COLS = [col for col in df.columns if col not in VITAL_COLS]

    acc[company] = {}
    f1_metric[company] = {}
    roc[company] = {}

    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpacking
        # it as (acc, f1, roc) would file each score under the wrong label.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(x, y)

with open("../results/v3/SVM_all_ss.txt", "w") as file:
    to_write = "Acc: " + json.dumps(acc) + " \nF1: " + json.dumps(f1_metric) + " \nROC: " + json.dumps(roc)
    # NOTE(review): to_write is already text, so this second json.dumps stores
    # it as one quoted JSON string with escaped quotes and newlines. Kept as-is
    # so anything that already reads these files keeps working -- confirm
    # whether plain text or a single JSON object was intended.
    file.write(json.dumps(to_write))

Starting amzn, 1.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting amzn, 2.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting amzn, 3.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting amzn, 6.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting amzn, 8.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 1.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 2.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 3.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 6.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting aapl, 8.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting msft, 1.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting msft, 2.
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Starting msft, 3.
Fitting 5 folds for each of 3 candidates, tota