In [113]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from scipy.stats import uniform, randint
import matplotlib.pyplot as plt
from tools import create_x_y

In [52]:
# Scratch check of the frozen distribution object that measure_acc samples from.
# scipy's uniform(loc, scale) covers [loc, loc + scale], so this one spans [0.7, 1.0];
# printing it only shows the rv_frozen repr, not the bounds.
print(uniform(0.7, 0.3))

<scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BB987474C0>


In [53]:
# Load amzn.csv into a DataFrame; the path is relative to the notebook's
# working directory, so the cell must be run from the notebook's own folder.
df = pd.read_csv('../datasets/relevant/amzn.csv')

In [107]:
# Columns kept out of the feature set. "bin_3" is used as the target further
# down; the remaining bin_* columns are presumably alternative targets.
VITAL_COLS = ["date_", "ticker", "close", "bin_2", "bin_3", "bin_5"]

# Every remaining column of df, in its original order, is treated as a feature.
OTHER_COLS = [column for column in df.columns if column not in VITAL_COLS]
print(OTHER_COLS)


def encode_y(y):
    """Shift signed class labels to consecutive 0-based class ids.

    The scheme is chosen from the number of distinct labels found in ``y``:

    * 3 distinct labels: ``-1, 0, 1``        -> ``0, 1, 2``
    * 5 distinct labels: ``-2, -1, 0, 1, 2`` -> ``0, 1, 2, 3, 4``

    Args:
        y: A mutable sequence of labels that provides ``.copy()`` and slice
            assignment (e.g. ``list``, ``numpy.ndarray``, ``pandas.Series``).

    Returns:
        A new container of the same type as ``y`` holding the encoded labels.
        The input is left untouched, so re-running a cell cannot re-encode
        data that was already encoded.

    Raises:
        ValueError: If ``y`` does not contain exactly 3 or 5 distinct labels,
            or contains a label outside the selected scheme.
    """
    schemes = {
        3: {-1: 0, 0: 1, 1: 2},
        5: {-2: 0, -1: 1, 0: 2, 1: 3, 2: 4},
    }

    classes = set(y)
    remap = schemes.get(len(classes))
    if remap is None:
        raise ValueError(
            f"encode_y expects 3 or 5 distinct labels, got {len(classes)}: {classes!r}"
        )

    # Validate before encoding so a bad label can never leave a half-encoded result.
    unknown = [label for label in classes if label not in remap]
    if unknown:
        raise ValueError(
            f"encode_y found labels outside {list(remap)}: {unknown!r}"
        )

    encoded = [remap[label] for label in y]

    # Copy first, then overwrite positionally: keeps the container type (and
    # dtype/index where applicable) without mutating the caller's object.
    result = y.copy()
    result[:] = encoded
    return result

['diffs', 'low', 'vol', 'max_pos', 'std_neg', 'std_pos', 'mean_comp', 'mean_neg', 'mean_pos', 'median_comp', 'count']


In [120]:
def measure_acc(
    x,
    y,
    binary=True,
    n_iter=20,
    n_splits=5,
    train_frac=0.8,
    n_jobs=4,
    random_state=42,
):
    """Tune an XGBoost classifier on the leading rows and score it on the rest.

    The first ``train_frac`` of the rows is used for a randomized
    hyperparameter search cross-validated with ``TimeSeriesSplit``; the
    remaining rows form the hold-out set the best model is evaluated on.
    No shuffling is done, so rows are assumed to already be in chronological
    order (TODO confirm against ``create_x_y``).

    Args:
        x: Feature matrix supporting ``len()`` and row slicing.
        y: Labels aligned with ``x``; must support slicing. Class ids are
            expected to be 0-based.
        binary: If True use the ``binary:logistic`` objective, otherwise
            ``multi:softprob``.
        n_iter: Number of parameter settings sampled by the search.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        train_frac: Fraction of rows (from the start) used for the search;
            must lie strictly between 0 and 1.
        n_jobs: Parallel workers used by the search.
        random_state: Seed shared by the classifier and the search.

    Returns:
        Tuple ``(cm, acc)``: the confusion matrix and the accuracy of the
        best estimator on the hold-out rows.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length, ``train_frac`` is
            out of range, or the split leaves the train or test part empty.
    """
    if len(x) != len(y):
        raise ValueError(f"x and y differ in length: {len(x)} != {len(y)}")
    if not 0 < train_frac < 1:
        raise ValueError(f"train_frac must be in (0, 1), got {train_frac}")

    split = int(train_frac * len(x))
    if split == 0 or split == len(x):
        raise ValueError(
            f"cannot split {len(x)} rows with train_frac={train_frac}: "
            "train or test part would be empty"
        )

    objective = "binary:logistic" if binary else "multi:softprob"
    clf = xgb.XGBClassifier(objective=objective, random_state=random_state)

    # scipy's uniform(loc, scale) samples from [loc, loc + scale];
    # randint(low, high) samples integers from [low, high).
    params = {
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 0.5),
        "learning_rate": uniform(0.003, 0.3),  # default 0.1
        "max_depth": randint(2, 6),  # default 3
        "n_estimators": randint(100, 400),  # default 100
        "subsample": uniform(0.6, 0.4),
    }

    xgb_search = RandomizedSearchCV(
        clf,
        param_distributions=params,
        random_state=random_state,
        n_iter=n_iter,
        cv=TimeSeriesSplit(n_splits=n_splits),
        verbose=1,
        n_jobs=n_jobs,
    )
    xgb_search.fit(x[:split], y[:split])

    # Evaluate on the untouched trailing rows. Converting the labels to an
    # array makes the element-wise comparison independent of y's container.
    y_test = np.asarray(y[split:])
    y_pred = xgb_search.predict(x[split:])
    cm = confusion_matrix(y_test, y_pred)
    acc = np.mean(y_pred == y_test)

    return cm, acc

In [122]:
# Confusion matrix and hold-out accuracy for each lag, keyed by the lag value.
cm, acc = {}, {}
for lag in (3, 6, 10, 16):
    # Rebuild the dataset for this lag with the 3-class "bin_3" target.
    x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_3", lag=lag)
    # Shift the labels to 0-based class ids before fitting.
    y = encode_y(y)
    cm[lag], acc[lag] = measure_acc(x, y, binary=False)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[[ 33 270  83]
 [ 52 815  68]
 [ 44 268  89]] 0.5441347270615563


In [124]:
# Single multi-class run on the "bin_3" target.
# NOTE(review): the result names suggest lag=16, but no `lag` is passed here, so
# create_x_y falls back to its own default — confirm that default is 16.
x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_3")
# Shift the labels to 0-based class ids before fitting.
y = encode_y(y)
# The captured output for this cell is a KeyboardInterrupt, i.e. the run was
# stopped manually, so cm16/acc16 may not hold results from this call.
cm16, acc16 = measure_acc(x, y, binary=False)

KeyboardInterrupt: 