In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
import json
from scipy.stats import loguniform

from tools import create_x_y

In [5]:
def save_results(to_write, filename):
    """Serialise ``to_write`` to JSON and store it in ``<filename>.txt``.

    Parameters
    ----------
    to_write
        Any object accepted by ``json.dumps``.
    filename
        Target path without extension; ``".txt"`` is appended. An existing
        file at that path is overwritten.

    Returns
    -------
    None
    """
    # Local import keeps the helper independent of module-level imports.
    import json

    target = f"{filename}.txt"
    with open(target, "w") as handle:
        payload = json.dumps(to_write)
        handle.write(payload)

In [10]:
def measure(x, y):
    """Tune a logistic regression on the first 80% of the samples and score it on the rest.

    The split is positional (no shuffling) and hyper-parameters are chosen
    with ``TimeSeriesSplit``, so the rows are presumably in chronological
    order -- TODO confirm against the code that builds ``x`` and ``y``.

    Parameters
    ----------
    x
        Feature container supporting ``len`` and positional slicing
        (``x[:k]`` / ``x[k:]``).
    y
        Labels aligned with ``x``. ``f1_score`` is called with its default
        binary averaging, so two-class labels are expected.

    Returns
    -------
    tuple of float
        ``(accuracy, f1, roc_auc)`` on the held-out 20%, each rounded to
        three decimals.
    """
    clf = LogisticRegression()

    # NOTE(review): not every solver/penalty pair sampled here is supported by
    # LogisticRegression (e.g. "elasticnet" needs the "saga" solver). Such
    # candidates fail to fit and, with the search's default error_score, are
    # scored as NaN instead of aborting the search. The space is left as-is so
    # results stay comparable with earlier runs.
    params = {
        "solver": ["newton-cg", "lbfgs", "liblinear"],
        "penalty": ["none", "l1", "l2", "elasticnet"],
        "C": loguniform(1e-5, 100),
    }
    time_split = TimeSeriesSplit(n_splits=5)

    clf_search = RandomizedSearchCV(
        clf,
        param_distributions=params,
        n_iter=500,
        n_jobs=-1,
        cv=time_split,
        random_state=42,
    )

    # Positional hold-out: the first 80% is used for the search, the tail for scoring.
    split = int(0.8 * len(x))
    x_train, x_test = x[:split], x[split:]
    y_train, y_test = y[:split], y[split:]

    clf_search.fit(x_train, y_train)
    y_pred = clf_search.predict(x_test)
    # ROC AUC is a ranking metric and needs continuous scores. Feeding it the
    # thresholded 0/1 predictions reduces the curve to a single operating
    # point, so use the probability of the positive class (classes_[1]).
    y_score = clf_search.predict_proba(x_test)[:, 1]

    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_score)

    return round(acc, 3), round(f1, 3), round(roc, 3)

In [None]:
# Baseline experiment: for every company and lag, fit/evaluate via `measure`
# using only the raw price/volume columns as features and "bin_2" as target.
full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}
weight = "baseline_2"  # not read in this cell; kept in case other cells rely on it

# Results are nested as metric[company][lag].
acc = {}
f1_metric = {}
roc = {}

path = "../datasets/v3/binned/"
# Feature columns are the same for every company, so define them once.
OTHER_COLS = ["open", "high", "low", "close", "vol"]

for company in full_names:
    df = pd.read_csv(path + f"{company}.csv")

    acc[company] = {}
    f1_metric[company] = {}
    roc[company] = {}

    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")

        acc[company][lag], f1_metric[company][lag], roc[company][lag] = measure(
            x, y
        )

with open("../results/v3/LogReg_BASE.txt", "w") as file:
    # Each metric dict is JSON-encoded individually (integer lag keys become
    # strings) and labelled. The assembled text is written as-is: encoding it
    # with json.dumps a second time would wrap the whole report in quotes and
    # escape every inner quote.
    to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric) + " ROC: " + json.dumps(roc)
    file.write(to_write)

In [None]:
# All-features experiment: for every company and lag, fit/evaluate via
# `measure` using every CSV column not listed in VITAL_COLS as a feature and
# "bin_2" as target.
full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}
weight = "baseline_2"  # not read in this cell; kept in case other cells rely on it

# Results are nested as metric[company][lag].
acc = {}
f1_metric = {}
roc = {}

path = "../datasets/v3/binned/"
# Columns that are never used as features; constant across companies.
VITAL_COLS = ["post_date", "ticker", "close", "high", "low", "bin_2", "bin_3"]

for company in full_names:
    df = pd.read_csv(path + f"{company}.csv")
    # Depends on the columns of the file just loaded, so it is rebuilt per company.
    OTHER_COLS = [i for i in df.columns if i not in VITAL_COLS]

    acc[company] = {}
    f1_metric[company] = {}
    roc[company] = {}

    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")

        acc[company][lag], f1_metric[company][lag], roc[company][lag] = measure(
            x, y
        )

with open("../results/v3/LogReg_all.txt", "w") as file:
    # Each metric dict is JSON-encoded individually (integer lag keys become
    # strings) and labelled. The assembled text is written as-is: encoding it
    # with json.dumps a second time would wrap the whole report in quotes and
    # escape every inner quote.
    to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric) + " ROC: " + json.dumps(roc)
    file.write(to_write)