In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from scipy.stats import uniform, randint
import json 
from tools import create_x_y

In [16]:
def measure(x, y):
    """Grid-search an AdaBoost classifier on the first 80% of rows and score it on the rest.

    A decision-tree-based ``AdaBoostClassifier`` is tuned with ``GridSearchCV``
    using a 5-fold ``TimeSeriesSplit``. The search is fitted on ``x[:split]`` /
    ``y[:split]`` (``split = int(0.8 * len(x))``) and the refitted best model is
    evaluated on the remaining rows.

    Parameters
    ----------
    x : sliceable feature container
        Must support ``len()`` and ``[:k]`` / ``[k:]`` slicing.
        NOTE(review): presumably rows are in chronological order, given the
        use of ``TimeSeriesSplit`` -- confirm against ``create_x_y``.
    y : sliceable label container
        Binary labels aligned with ``x`` (``f1_score`` is called with its
        default binary averaging).

    Returns
    -------
    tuple
        ``(f1, accuracy, roc_auc)`` -- in exactly that order -- each rounded
        to 3 decimals. Callers must unpack in this order.
    """
    dtc = DecisionTreeClassifier(random_state=42)

    # scikit-learn 1.2 renamed ``base_estimator`` to ``estimator`` and 1.4
    # removed the old name. Try the new keyword first and fall back for older
    # installs; the nested-parameter prefix in the grid must match.
    try:
        ada = AdaBoostClassifier(estimator=dtc, random_state=42)
        tree_prefix = "estimator"
    except TypeError:
        ada = AdaBoostClassifier(base_estimator=dtc, random_state=42)
        tree_prefix = "base_estimator"

    params = {
        f"{tree_prefix}__criterion": ["gini", "entropy"],
        f"{tree_prefix}__splitter": ["best", "random"],
        "learning_rate": [0.01, 0.1, 1.0],
        "n_estimators": [200, 500, 800],
    }

    time_split = TimeSeriesSplit(n_splits=5)

    clf_search = GridSearchCV(
        ada,
        param_grid=params,
        cv=time_split,
        verbose=1,
        n_jobs=-1,
    )

    # Hold out the last 20% of rows; the grid search only ever sees the head.
    split = int(0.8 * len(x))
    x_train, x_test = x[:split], x[split:]
    y_train, y_test = y[:split], y[split:]

    clf_search.fit(x_train, y_train)
    y_pred = clf_search.predict(x_test)

    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    # ROC AUC needs a continuous score: feeding hard 0/1 predictions reduces
    # the curve to a single operating point. Column 1 of predict_proba is the
    # probability of the greater class label, which roc_auc_score treats as
    # the positive class in the binary case.
    y_score = clf_search.predict_proba(x_test)[:, 1]
    roc = roc_auc_score(y_test, y_score)

    return round(f1, 3), round(acc, 3), round(roc, 3)

In [17]:
# ===== BASELINE =====
# Price/volume-only baseline: for every ticker and lag, build features from the
# open/close/vol/high columns, evaluate with measure(), and dump the collected
# metrics to ../results/v3/ADA_base.txt.

full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}
# Each metric dict is keyed as metric[company][lag].
acc = {}
roc = {}
f1_metric = {}


for company in full_names.keys():
    df = pd.read_csv(f"../datasets/v3/binned/{company}.csv")
    # NOTE(review): this list is only printed; the x_cols passed to create_x_y
    # below name the same columns in a different order. Both are left as they
    # were so the feature order fed to the model does not change.
    OTHER_COLS = ["open", "high", "close", "vol"]
    print(OTHER_COLS)
    acc[company] = {}
    roc[company] = {}
    f1_metric[company] = {}


    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=["open", "close", "vol", "high"], y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpack to
        # match so accuracy and F1 are not stored under each other's name.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(
            x, y
        )

with open("../results/v3/ADA_base.txt", "w") as file:
    to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric) + " ROC: " + json.dumps(roc)
    # The assembled string is itself JSON-encoded, so the file holds a single
    # quoted JSON string; format kept unchanged for any existing readers.
    file.write(json.dumps(to_write))

['open', 'high', 'close', 'vol']
Starting amzn, 1.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting amzn, 2.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting amzn, 3.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting amzn, 6.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting amzn, 8.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
['open', 'high', 'close', 'vol']
Starting aapl, 1.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting aapl, 2.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting aapl, 3.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting aapl, 6.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting aapl, 8.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
['open', 'high', 'close', 'vol']
Starting msft, 1.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting msft, 2.
Fit

In [18]:
# ===== all =====
# All-features run: for every ticker and lag, build features from every column
# except the VITAL_COLS identifiers/targets, evaluate with measure(), and dump
# the collected metrics to ../results/v3/ADA_all.txt.

full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}
weight = "no_weight"  # not read in this cell; kept in case later cells use it
# Each metric dict is keyed as metric[company][lag].
acc = {}
roc = {}
f1_metric = {}


for company in full_names.keys():
    df = pd.read_csv(f"../datasets/v3/binned/{company}.csv")
    # Columns that are never used as features (date, ticker id, target bins).
    VITAL_COLS = ["post_date", "ticker", "bin_2", "bin_3"]
    OTHER_COLS = [i for i in df.columns if i not in VITAL_COLS]
    print(OTHER_COLS)
    acc[company] = {}
    roc[company] = {}
    f1_metric[company] = {}


    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpack to
        # match so accuracy and F1 are not stored under each other's name.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(
            x, y
        )

with open("../results/v3/ADA_all.txt", "w") as file:
    to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric) + " ROC: " + json.dumps(roc)
    # The assembled string is itself JSON-encoded, so the file holds a single
    # quoted JSON string; format kept unchanged for any existing readers.
    file.write(json.dumps(to_write))

['open', 'high', 'low', 'close', 'vol', 'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min', 'comp_std', 'is_negative_sum', 'is_neutral_sum', 'is_positive_sum', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median', 'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min', 'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min', 'pos_std', 'retweet_num_sum', 'relative_count', 'diffs']
Starting amzn, 1.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting amzn, 2.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting amzn, 3.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting amzn, 6.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Starting amzn, 8.
Fitting 5 folds for each of 36 candidates, totalling 180 fits
['open', 'high', 'low', 'close', 'vol', 'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min', 'comp_std', 'is_negative_sum', 'is_neutral_sum', 'is_positive_