In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from tools import create_x_y

In [15]:
#names = ["AMZN", "AAPL", "MSFT", "TSLA", "GOOG", "GOOGL"]
company = "AMZN"
df = pd.read_csv(f'../datasets/v3/more_cols_binned/no_weight/{company}.csv', parse_dates=["post_date"])

In [16]:
df.columns

Index(['post_date', 'ticker', 'open', 'high', 'low', 'close', 'vol',
       'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min',
       'comp_std', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median',
       'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min',
       'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min',
       'pos_std', 'retweet_num_sum', 'relative_count', 'diffs', 'bin_3'],
      dtype='object')

In [17]:
VITAL_COLS = ["post_date", "ticker", "close"]
OTHER_COLS = [i for i in df.columns if i not in VITAL_COLS]
print(OTHER_COLS)

['open', 'high', 'low', 'vol', 'comment_num_sum', 'comp_max', 'comp_mean', 'comp_median', 'comp_min', 'comp_std', 'like_num_sum', 'neg_max', 'neg_mean', 'neg_median', 'neg_min', 'neg_std', 'neu_max', 'neu_mean', 'neu_median', 'neu_min', 'neu_std', 'count', 'pos_max', 'pos_mean', 'pos_median', 'pos_min', 'pos_std', 'retweet_num_sum', 'relative_count', 'diffs', 'bin_3']


In [18]:
def save_results(to_write, filename):
    import json
    with open(f"{filename}.txt", "w") as file:
        file.write(json.dumps(to_write))

In [22]:
def measure(x, y):
    clf = LogisticRegression()

    params = {
        "penalty": ["l1", "l2"],
        "C": np.logspace(-4, 4, 20),
        #"solver": ["lbfgs", "liblinear", "sag", "saga"],
    }
    time_split = TimeSeriesSplit(n_splits=5)

    clf_search = GridSearchCV(
        clf,
        param_grid=params,
        cv=time_split,
        verbose=1,
        n_jobs=4,
    )

    split = int(0.8 * len(x))
    clf_search.fit(x[:split], y[:split])
    y_pred = clf_search.predict(x[split:])
    cm = confusion_matrix(y[split:], y_pred)
    acc = sum(y_pred == y[split:]) / len(y_pred)

    return cm, acc


In [31]:
acc = {}
cm = {}
for lag in [3]: #6, 10, 16]:
    x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_3", lag=lag)
    cm[lag], acc[lag] = measure(x, y)

#save_results(f"", filename="../results/log_reg/log_reg_more_cols_unweighted")

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [30]:
acc

{3: 0.37122969837587005}

In [32]:
acc

{3: 0.37122969837587005}

In [25]:
cm

{3: array([[153,  90, 298],
        [233, 171, 211],
        [161,  91, 316]], dtype=int64)}