In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from scipy.stats import uniform, randint

import shap

import json 
from tools import create_x_y

In [4]:
def shap_features(model, train, test):
    """Build a SHAP force plot for a fitted tree model.

    A ``shap.TreeExplainer`` is created for ``model``, SHAP values are computed
    for ``test``, and ``shap.force_plot`` is called with the first entry of the
    explainer's expected values and the first entry of the SHAP values.

    Args:
        model: Fitted estimator handed to ``shap.TreeExplainer``.
        train: Training features. Accepted for call compatibility; not used.
        test: Features the SHAP values are computed for and plotted against.

    Returns:
        The object produced by ``shap.force_plot``. It used to be thrown away,
        which made the function a no-op from the reader's point of view;
        returning it lets a caller display it (for example as the last
        expression of a notebook cell). Callers that ignore the result are
        unaffected.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(test)
    # NOTE(review): index 0 presumably selects the first class of a classifier;
    # the layout of `expected_value` / `shap_values` depends on the installed
    # shap version - confirm.
    return shap.force_plot(explainer.expected_value[0], shap_values[0], test)

In [7]:
def measure(x, y):
    """Grid-search a random forest on the first 80% of the rows and score it on the rest.

    The search covers ``n_estimators``, ``max_depth`` and ``criterion`` with a
    5-split ``TimeSeriesSplit`` as the cross-validation scheme. The refit best
    model is then evaluated on the held-out last 20% of the rows, and
    ``shap_features`` is called with the best estimator and both partitions.

    Args:
        x: Feature rows, sliceable by position and supporting ``len``.
            Presumably ordered in time, given the time-series split - confirm
            against ``create_x_y``.
        y: Targets aligned with ``x``. Assumed binary, since ``f1_score`` is
            used with its default settings - TODO confirm.

    Returns:
        tuple: ``(f1, accuracy, roc_auc)`` - in that order - each rounded to
        three decimals.
    """
    rf = RandomForestClassifier(random_state=42)

    params = {
        "n_estimators": [200, 500, 800],
        "max_depth": [4, 6, 8],
        "criterion": ["gini", "entropy"],
    }

    time_split = TimeSeriesSplit(n_splits=5)

    clf_search = GridSearchCV(
        rf,
        param_grid=params,
        cv=time_split,
        verbose=1,
        n_jobs=-1,
    )

    # Positional hold-out: no shuffling, the tail of the data is the test set.
    split = int(0.8 * len(x))
    x_train, x_test = x[:split], x[split:]
    y_train, y_test = y[:split], y[split:]

    clf_search.fit(x_train, y_train)
    y_pred = clf_search.predict(x_test)
    # ROC AUC is a ranking metric and needs continuous scores. Feeding it the
    # hard 0/1 predictions collapses the curve to a single threshold, so use
    # the predicted probability of the second (greater-label) class instead.
    y_score = clf_search.predict_proba(x_test)[:, 1]

    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_score)

    shap_features(clf_search.best_estimator_, x_train, x_test)

    return round(f1, 3), round(acc, 3), round(roc, 3)


In [8]:
# ===== TESTING =====
# Quick run on a reduced set of tickers and lags; metrics are collected per
# company and per lag, nothing is written to disk.

full_names = {
    "amzn": "Amazon",
    #"aapl": "Apple",
    #"msft": "Microsoft",
    #"tsla": "Tesla",
    #"goog": "Google (GOOG)",
    #"googl": "Google (GOOGL)",
}
acc = {}
roc = {}
f1_metric = {}


for company in full_names:
    df = pd.read_csv(f"../datasets/v3/binned/{company}.csv")
    OTHER_COLS = ["open", "high", "close", "vol"]
    print(OTHER_COLS)
    acc[company] = {}
    roc[company] = {}
    f1_metric[company] = {}


    for lag in [1]:#, 2, 3, 6, 8]:
        # NOTE(review): the feature list below uses a different column order
        # than the OTHER_COLS list printed above - confirm which is intended.
        x, y = create_x_y(df, x_cols=["open", "close", "vol", "high"], y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpacking it
        # as (acc, f1, roc) stored each score under the other metric's name.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(
            x, y
        )
        
#with open(f"../results/v3/ADA_base.txt", "w") as file:
#    to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric) + " ROC: " + json.dumps(roc)
#    file.write(json.dumps(to_write))

['open', 'high', 'close', 'vol']
Starting amzn, 1.
Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [None]:
# ===== BASELINE =====
# Evaluate the model on the four price/volume columns for every ticker and
# every lag, then dump the collected metrics to a results file.

full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}
acc = {}
roc = {}
f1_metric = {}


for company in full_names:
    df = pd.read_csv(f"../datasets/v3/binned/{company}.csv")
    OTHER_COLS = ["open", "high", "close", "vol"]
    print(OTHER_COLS)
    acc[company] = {}
    roc[company] = {}
    f1_metric[company] = {}


    for lag in [1, 2, 3, 6, 8]:
        # NOTE(review): the feature list below uses a different column order
        # than the OTHER_COLS list printed above - confirm which is intended.
        x, y = create_x_y(df, x_cols=["open", "close", "vol", "high"], y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpacking it
        # as (acc, f1, roc) wrote each score under the other metric's name.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(
            x, y
        )
        
# NOTE(review): measure() fits a random forest, yet the results go to a file
# named "ADA_base.txt" - confirm this does not clobber another model's output.
with open("../results/v3/ADA_base.txt", "w") as file:
    to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric) + " ROC: " + json.dumps(roc)
    # `to_write` is already text; the outer json.dumps wraps it in quotes and
    # escapes the inner ones. Kept as-is so the file format does not change.
    file.write(json.dumps(to_write))

In [None]:
# ===== all =====
# Same evaluation loop, but using every column of the dataset except the
# VITAL_COLS listed below as features.

full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}
# Not read in this cell; kept because later cells may rely on it.
weight = "no_weight"
acc = {}
roc = {}
f1_metric = {}


for company in full_names:
    df = pd.read_csv(f"../datasets/v3/binned/{company}.csv")
    VITAL_COLS = ["post_date", "ticker", "bin_2", "bin_3"]
    OTHER_COLS = [i for i in df.columns if i not in VITAL_COLS]
    print(OTHER_COLS)
    acc[company] = {}
    roc[company] = {}
    f1_metric[company] = {}


    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpacking it
        # as (acc, f1, roc) wrote each score under the other metric's name.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(
            x, y
        )
        
# NOTE(review): measure() fits a random forest, yet the results go to a file
# named "ADA_all.txt" - confirm this does not clobber another model's output.
with open("../results/v3/ADA_all.txt", "w") as file:
    to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric) + " ROC: " + json.dumps(roc)
    # `to_write` is already text; the outer json.dumps wraps it in quotes and
    # escapes the inner ones. Kept as-is so the file format does not change.
    file.write(json.dumps(to_write))