In [19]:
import json
from collections import Counter

import numpy as np
import pandas as pd
from scipy.stats import uniform, randint
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree

from tools import create_x_y

In [3]:
# Load a single company's "no_weight" dataset into `df`, parsing `post_date` as datetimes.
# Commented-out alternative data source:
#df = pd.read_csv('../datasets/relevant/amzn.csv')

# Ticker symbols noted by the author (presumably the valid choices for `company` -- verify):
#names = ["AMZN", "AAPL", "MSFT", "TSLA", "GOOG", "GOOGL"]
company = "AMZN"
# NOTE(review): this path uses an upper-case file name ("AMZN.csv"), while the experiment
# loops further down build paths from lower-case keys ("amzn.csv"). On a case-sensitive
# filesystem these are different files -- confirm which spelling exists on disk.
df = pd.read_csv(f'../datasets/v3/more_cols_binned/no_weight/{company}.csv', parse_dates=["post_date"])

In [5]:
# Columns excluded from the generic feature list. `bin_3` is the column the experiment
# cells pass as the prediction target (`y_col`).
VITAL_COLS = ["post_date", "ticker", "close", "bin_3"]
# Every other column of `df`, in the frame's column order; the "w1" experiment cell
# passes this list as `x_cols`.
OTHER_COLS = [i for i in df.columns if i not in VITAL_COLS]
print(OTHER_COLS)

['open', 'high', 'low', 'vol', 'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min', 'comp_std', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median', 'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min', 'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min', 'pos_std', 'retweet_num_sum', 'relative_count', 'diffs']


In [6]:
def save_results(to_write, filename):
    """Serialise ``to_write`` as JSON and store it in ``<filename>.txt``.

    Parameters
    ----------
    to_write : object
        Any JSON-serialisable value (dict, list, str, number, ...).
    filename : str
        Target path *without* extension; ``.txt`` is appended.

    Raises
    ------
    TypeError
        If ``to_write`` is not JSON-serialisable. The target file is left
        untouched in that case.
    """
    # Serialise before opening: opening with "w" truncates the file, so doing it
    # first would wipe previously saved results whenever json.dumps fails.
    payload = json.dumps(to_write)
    with open(f"{filename}.txt", "w") as file:
        file.write(payload)

In [7]:
def weight_data(x):
    """Compute class weights inversely proportional to class frequency.

    The least represented class gets weight 1.0; every other class gets
    ``count_least / count_class``, so more frequent classes receive
    proportionally smaller weights.

    Parameters
    ----------
    x : iterable of hashable
        Class labels, one entry per sample.

    Returns
    -------
    dict
        Maps each distinct label to its weight in ``(0, 1]``.
        An empty input yields an empty dict.
    """
    # Single pass over the labels instead of rescanning the input once per class.
    counts = Counter(x)
    if not counts:
        # Nothing to weight; without this guard min() would raise on an empty sequence.
        return {}
    least_count = min(counts.values())
    # Ratio of raw counts equals the ratio of class percentages (the common
    # 1/len(x) factor cancels out).
    return {label: least_count / count for label, count in counts.items()}

In [24]:
def measure(x, y, train_fraction=0.8, random_state=42):
    """Grid-search a decision tree on the head of the data and score it on the tail.

    The first ``train_fraction`` of the samples (in the order given) is used to
    tune a ``DecisionTreeClassifier`` with ``GridSearchCV`` and a 5-split
    ``TimeSeriesSplit``; the remaining samples are held out and used only for
    the final scores.

    Parameters
    ----------
    x, y : sliceable sequences of equal length
        Features and labels. They must support ``len()`` and ``[:k]`` slicing
        (presumably the arrays returned by ``create_x_y`` -- verify).
    train_fraction : float, default 0.8
        Share of the samples, taken from the start, used for the grid search.
    random_state : int or None, default 42
        Seed passed to the decision tree so repeated runs give the same result.
        Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    tuple of (float, float)
        ``(f1, acc)`` -- weighted F1 score FIRST, accuracy SECOND, both computed
        on the held-out tail. Callers must unpack in this order.
    """
    clf = DecisionTreeClassifier(random_state=random_state)

    params = {
        'max_depth': [2, 3, 5, 10, 20, 50, 100, 200],
        'min_samples_leaf': [2, 3, 5, 10],
        'criterion': ["gini", "entropy"]
    }
    # Forward-chaining folds: each validation fold comes after its training fold.
    time_split = TimeSeriesSplit(n_splits=5)

    tree_search = GridSearchCV(
        clf,
        param_grid=params,
        cv=time_split,
        verbose=1,
        n_jobs=4,
    )

    # Chronological hold-out: no shuffling, the tail is the test set.
    split = int(train_fraction * len(x))
    tree_search.fit(x[:split], y[:split])
    y_true = y[split:]
    y_pred = tree_search.predict(x[split:])

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")

    return f1, acc

In [29]:
# ===== BASELINE =====
# For every company and lag, fit/evaluate a decision tree using only the
# "open", "close", "vol" and "high" columns as features, then save the scores.

full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}
weight = "no_weight"
acc = {}
cm = {}  # initialised but not populated in this cell
f1_metric = {}


for company in full_names:
    df = pd.read_csv(f"../datasets/v3/more_cols_binned/{weight}/{company}.csv")
    VITAL_COLS = ["post_date", "ticker", "close", "bin_3"]
    OTHER_COLS = [i for i in df.columns if i not in VITAL_COLS]
    print(OTHER_COLS)
    acc[company] = {}
    f1_metric[company] = {}

    for lag in [3, 7, 14]:
        x, y = create_x_y(df, x_cols=["open", "close", "vol", "high"], y_col="bin_3", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, acc) in that order. The previous unpacking
        # (acc first, f1 second) stored each metric under the other's name.
        f1_metric[company][lag], acc[company][lag] = measure(x, y)

# Build the payload before opening so a failure cannot leave a truncated file.
# NOTE(review): the payload is a string that already contains JSON and is then
# JSON-encoded again; the on-disk format is kept unchanged for existing readers.
to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric)
with open("DT_v3__acc_f1_baseline.txt", "w") as file:
    file.write(json.dumps(to_write))

['open', 'high', 'low', 'vol', 'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min', 'comp_std', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median', 'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min', 'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min', 'pos_std', 'retweet_num_sum', 'relative_count', 'diffs']
Starting amzn, 3.
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Starting amzn, 7.
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Starting amzn, 14.
Fitting 5 folds for each of 64 candidates, totalling 320 fits
['open', 'high', 'low', 'vol', 'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min', 'comp_std', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median', 'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min', 'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min', 'pos_std', 'retweet_num_sum', 'relative_count', 'diffs']
Starting aapl, 3.
Fitting 5 folds f

In [30]:
# ===== WEIGHTED ("w1") DATASET =====
# For every company and lag, fit/evaluate a decision tree using every column
# outside VITAL_COLS as features, then save the scores.

full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}
weight = "w1"

acc = {}
cm = {}  # initialised but not populated in this cell
f1_metric = {}


for company in full_names:
    df = pd.read_csv(f"../datasets/v3/more_cols_binned/{weight}/{company}.csv")
    VITAL_COLS = ["post_date", "ticker", "close", "bin_3"]
    OTHER_COLS = [i for i in df.columns if i not in VITAL_COLS]
    print(OTHER_COLS)
    acc[company] = {}
    f1_metric[company] = {}

    for lag in [3, 7, 14]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_3", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, acc) in that order. The previous unpacking
        # (acc first, f1 second) stored each metric under the other's name.
        f1_metric[company][lag], acc[company][lag] = measure(x, y)

# Build the payload before opening so a failure cannot leave a truncated file.
# NOTE(review): the payload is a string that already contains JSON and is then
# JSON-encoded again; the on-disk format is kept unchanged for existing readers.
to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric)
with open(f"DT_v3__acc_f1_{weight}.txt", "w") as file:
    file.write(json.dumps(to_write))

['open', 'high', 'low', 'vol', 'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min', 'comp_std', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median', 'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min', 'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min', 'pos_std', 'retweet_num_sum', 'relative_count', 'diffs']
Starting amzn, 3.
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Starting amzn, 7.
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Starting amzn, 14.
Fitting 5 folds for each of 64 candidates, totalling 320 fits
['open', 'high', 'low', 'vol', 'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min', 'comp_std', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median', 'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min', 'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min', 'pos_std', 'retweet_num_sum', 'relative_count', 'diffs']
Starting aapl, 3.
Fitting 5 folds f