In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from scipy.stats import uniform, randint
import json

from tools import create_x_y

Model drzewa decyzyjnego

In [3]:
def save_results(to_write, filename):
    """Serialize ``to_write`` as JSON and store it in ``<filename>.txt``.

    Parameters
    ----------
    to_write : any JSON-serializable object
        Passed straight to ``json.dumps``.
    filename : str
        Target path *without* extension; ``.txt`` is appended.

    Raises
    ------
    TypeError
        If ``to_write`` is not JSON-serializable. In that case the target
        file is left untouched.
    """
    # Serialize before opening: opening with "w" truncates the file, so doing
    # it first would wipe previous results whenever serialization fails.
    payload = json.dumps(to_write)
    with open(f"{filename}.txt", "w", encoding="utf-8") as file:
        file.write(payload)

In [4]:
def measure(x, y, random_state=None):
    """Grid-search a decision tree and score it on a positional hold-out.

    The first 80% of the rows (by position, no shuffling) are used to tune a
    ``DecisionTreeClassifier`` with ``GridSearchCV`` over a 5-fold
    ``TimeSeriesSplit``; the remaining 20% are used for evaluation.
    NOTE(review): this presumes rows are already in chronological order --
    confirm against ``create_x_y``.

    Parameters
    ----------
    x : features, sliceable by row with ``x[:k]`` / ``x[k:]``
    y : labels aligned with ``x``, sliceable the same way
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``. The default ``None`` keeps
        the previous (unseeded) behaviour; pass an int for repeatable runs.

    Returns
    -------
    tuple
        ``(f1, accuracy, roc_auc)`` -- in exactly this order -- each rounded
        to three decimals. F1 is the weighted average over classes.
    """
    clf = DecisionTreeClassifier(random_state=random_state)

    params = {
        'max_depth': [5, 10, 50, 100, 200],
        'min_samples_leaf': [2, 3, 5, 10],
        'criterion': ["gini", "entropy"]
    }
    time_split = TimeSeriesSplit(n_splits=5)

    tree_search = GridSearchCV(
        clf,
        param_grid=params,
        cv=time_split,
        verbose=1,
        n_jobs=4,
    )

    split = int(0.8 * len(x))
    x_train, x_test = x[:split], x[split:]
    y_train, y_test = y[:split], y[split:]

    tree_search.fit(x_train, y_train)
    y_pred = tree_search.predict(x_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    # ROC AUC is a ranking metric: it needs scores, not thresholded labels.
    # Feeding hard predictions collapses the curve to a single operating point.
    # Column 1 of predict_proba is the probability of the greater class label.
    proba = tree_search.predict_proba(x_test)
    if proba.shape[1] == 2:
        y_score = proba[:, 1]
    else:
        # Training slice contained a single class, so there is no positive
        # column; fall back to the hard labels as the original code did.
        y_score = y_pred
    roc = roc_auc_score(y_test, y_score)

    return round(f1, 3), round(acc, 3), round(roc, 3)

In [5]:
# Baseline run: decision tree on the open/high/low/close/vol columns only,
# for every ticker and every lag; results go to ../results/v3/DT_BASE.txt.
full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}

# Per-metric results, keyed as metric[company][lag].
acc = {}
f1_metric = {}
roc = {}

path = "../datasets/v3/binned/"
# Feature columns for the baseline; identical for every ticker, so defined once.
OTHER_COLS = ["open", "high", "low", "close", "vol"]

for company in full_names.keys():
    df = pd.read_csv(path + f"{company}.csv", parse_dates=["post_date"])

    acc[company] = {}
    f1_metric[company] = {}
    roc[company] = {}
    # NOTE(review): never populated in this cell; kept in case later cells
    # expect the name to exist.
    fi = {}

    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpacking it
        # as (acc, f1, roc) stored each score under the other metric's label.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(
            x, y
        )

with open("../results/v3/DT_BASE.txt", "w") as file:
    to_write = "Acc: " + json.dumps(acc) + " \nF1: " + json.dumps(f1_metric) + " \nROC: " + json.dumps(roc)
    # NOTE(review): to_write is already a string, so this second json.dumps
    # wraps it in quotes and escapes the inner quotes/newline. Left unchanged
    # to keep the existing file format for any downstream reader.
    file.write(json.dumps(to_write))

Starting amzn, 1.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting amzn, 2.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting amzn, 3.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting amzn, 6.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting amzn, 8.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 1.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 2.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 3.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 6.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 8.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting msft, 1.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting msft, 2.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting msft, 3.
Fitting 5 folds for ea

In [6]:
# Full-feature run: decision tree on every column except the bookkeeping and
# target columns, for every ticker and every lag; results go to
# ../results/v3/DT_no_w.txt.
full_names = {
    "amzn": "Amazon",
    "aapl": "Apple",
    "msft": "Microsoft",
    "tsla": "Tesla",
    "goog": "Google (GOOG)",
    "googl": "Google (GOOGL)",
}

# Per-metric results, keyed as metric[company][lag].
acc = {}
f1_metric = {}
roc = {}

path = "../datasets/v3/binned/"
# Columns that must never be used as features (identifiers and targets).
VITAL_COLS = ["post_date", "ticker", "bin_2", "bin_3"]

for company in full_names.keys():
    df = pd.read_csv(path + f"{company}.csv", parse_dates=["post_date"])
    # Depends on the columns of this ticker's file, so computed per company.
    OTHER_COLS = [i for i in df.columns if i not in VITAL_COLS]

    acc[company] = {}
    f1_metric[company] = {}
    roc[company] = {}
    # NOTE(review): never populated in this cell; kept in case later cells
    # expect the name to exist.
    fi = {}

    for lag in [1, 2, 3, 6, 8]:
        x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
        print(f"Starting {company}, {lag}.")
        # measure() returns (f1, accuracy, roc_auc) in that order; unpacking it
        # as (acc, f1, roc) stored each score under the other metric's label.
        f1_metric[company][lag], acc[company][lag], roc[company][lag] = measure(
            x, y
        )

with open("../results/v3/DT_no_w.txt", "w") as file:
    to_write = "Acc: " + json.dumps(acc) + " F1: " + json.dumps(f1_metric) + " ROC: " + json.dumps(roc)
    # NOTE(review): to_write is already a string, so this second json.dumps
    # wraps it in quotes and escapes the inner quotes. Left unchanged to keep
    # the existing file format for any downstream reader.
    file.write(json.dumps(to_write))

Starting amzn, 1.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting amzn, 2.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting amzn, 3.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting amzn, 6.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting amzn, 8.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 1.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 2.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 3.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 6.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting aapl, 8.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting msft, 1.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting msft, 2.
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Starting msft, 3.
Fitting 5 folds for ea