In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from tools import create_x_y

In [2]:
# Ticker to analyse. Candidate tickers noted for this notebook:
# "AMZN", "AAPL", "MSFT", "TSLA", "GOOG", "GOOGL".
company = "AMZN"

# Load the per-company CSV; post_date is parsed into datetimes on read.
df = pd.read_csv(
    f'../datasets/v2.1/more_cols_binned/no_weight/{company}.csv',
    parse_dates=["post_date"],
)

In [3]:
# Display the column index of the loaded frame.
df.columns

Index(['post_date', 'ticker', 'open', 'high', 'low', 'close', 'vol',
       'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min',
       'comp_std', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median',
       'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min',
       'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min',
       'pos_std', 'retweet_num_sum', 'relative_count', 'diffs', 'bin_2',
       'bin_3'],
      dtype='object')

In [7]:
# Columns that are never fed to the model as features in the
# "other columns" experiment: identifiers, the close price and the targets.
VITAL_COLS = ["post_date", "ticker", "close", "bin_2", "bin_3"]

# Every remaining column of df, in its original order, is a candidate feature.
# NOTE(review): "diffs" stays in this list; if bin_2/bin_3 are derived from it,
# confirm that create_x_y's lagging keeps it from leaking the target.
OTHER_COLS = [col for col in df.columns if col not in VITAL_COLS]


In [18]:
def save_results(to_write, filename):
    """Write ``to_write`` as a JSON document to ``<filename>.txt``.

    Args:
        to_write: Any JSON-serialisable object.
        filename: Destination path without extension; ".txt" is appended.

    The file is opened in "w" mode, so existing content at that path is
    replaced.
    """
    import json

    target_path = f"{filename}.txt"
    with open(target_path, "w") as handle:
        payload = json.dumps(to_write)
        handle.write(payload)

In [10]:
def measure(x, y):
    """Tune a logistic regression on the first 80% of the rows and score it on the rest.

    Hyper-parameters (penalty type and regularisation strength ``C``) are
    chosen with a grid search that uses ``TimeSeriesSplit`` folds on the
    training portion. The split between training and hold-out data is made
    by row position, without shuffling.

    Args:
        x: Feature rows; must support ``len()`` and positional slicing.
        y: Labels aligned with ``x``. ``f1_score`` is called with its default
            averaging, so the labels are expected to be binary.

    Returns:
        A ``(f1, accuracy)`` tuple computed on the last 20% of the rows.
    """
    # The default "lbfgs" solver does not support penalty="l1", so every L1
    # candidate in the grid would fail to fit and be scored as NaN.
    # "liblinear" handles both "l1" and "l2", so the whole grid is evaluated.
    clf = LogisticRegression(solver="liblinear")

    params = {
        "penalty": ["l1", "l2"],
        "C": np.logspace(-4, 4, 20),
    }
    # Forward-chaining folds: each validation fold comes after its training rows.
    time_split = TimeSeriesSplit(n_splits=5)

    clf_search = GridSearchCV(
        clf,
        param_grid=params,
        cv=time_split,
        verbose=1,
        n_jobs=-1,
    )

    # Hold out the final 20% of rows (by position) for evaluation.
    split = int(0.8 * len(x))
    clf_search.fit(x[:split], y[:split])
    y_pred = clf_search.predict(x[split:])
    f1 = f1_score(y[split:], y_pred)
    acc = accuracy_score(y[split:], y_pred)

    return f1, acc

def baseline_measure(lag):
    """Score the model using only the price/volume columns as features.

    Builds the feature/label arrays from the notebook-level ``df`` with
    ``create_x_y`` (target column ``bin_2``) and passes them to ``measure``.

    Args:
        lag: Forwarded unchanged to ``create_x_y`` as its ``lag`` argument.

    Returns:
        Whatever ``measure`` returns for these features: ``(f1, accuracy)``.
    """
    price_volume_cols = ["open", "high", "low", "close", "vol"]
    features, target = create_x_y(
        df,
        x_cols=price_volume_cols,
        y_col="bin_2",
        lag=lag,
    )
    return measure(features, target)


In [None]:
# Baseline experiment: price/volume features only.
# measure() returns (f1, accuracy), so `cm` holds F1 scores keyed by lag.
acc = {}
cm = {}
for lag in [3]:  # other candidate lags: 6, 10, 16
    cm[lag], acc[lag] = baseline_measure(lag)

# Persist the computed metrics rather than a bare label string.
# NOTE: save_results opens the file in "w" mode, so any later save to this
# same path replaces what is written here.
save_results(
    {"model": "LR_base", "f1": cm, "acc": acc},
    filename="../results/log_reg/log_reg_more_cols_unweighted",
)

In [None]:
# Experiment using every column of df that is not in VITAL_COLS as a feature.
# measure() returns (f1, accuracy), so `cm` holds F1 scores keyed by lag.
acc = {}
cm = {}
for lag in [3]:  # other candidate lags: 6, 10, 16
    x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=lag)
    cm[lag], acc[lag] = measure(x, y)

# Persist the computed metrics rather than an empty string.
# NOTE: save_results opens the file in "w" mode, so this replaces anything
# already stored at the same path.
save_results(
    {"model": "LR_other_cols", "f1": cm, "acc": acc},
    filename="../results/log_reg/log_reg_more_cols_unweighted",
)