In [57]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.utils.class_weight import compute_sample_weight
import matplotlib.pyplot as plt
from tools import create_x_y

In [36]:
# Load the AMZN CSV into `df`; the path is relative to the notebook's working directory.
AMZN_CSV = '../datasets/relevant/amzn.csv'
df = pd.read_csv(AMZN_CSV)

In [42]:
# Columns excluded from the feature list built below.
# (An earlier variant also listed "bin_2", "bin_3" and "bin_5" here.)
VITAL_COLS = [
    "date_",
    "ticker",
]
# Every other column of `df`, in its original order.
# NOTE(review): the bin_* label columns therefore stay in OTHER_COLS and are later
# passed as x_cols — confirm create_x_y does not leak the target into the features.
OTHER_COLS = [col for col in df.columns if col not in VITAL_COLS]
print(OTHER_COLS)


def encode_y(y):
    """Remap signed class labels to consecutive non-negative codes, in place.

    The mapping is chosen from the number of distinct labels found in ``y``:

    * 2 classes: ``-1 -> 0``, ``1 -> 1``
    * 3 classes: ``-1 -> 0``, ``0 -> 1``, ``1 -> 2``
    * 5 classes: ``-2 -> 0`` ... ``2 -> 4``

    Args:
        y: Mutable, integer-indexable sequence of labels (e.g. a list or a
            NumPy array). It is modified in place.
            NOTE(review): a pandas Series is indexed by label in ``y[i] = ...``,
            so this presumably only works for a default RangeIndex — confirm.

    Returns:
        The same object ``y`` that was passed in, now holding the encoded labels.

    Raises:
        ValueError: If the number of distinct labels is not 2, 3 or 5, or if
            ``y`` contains a label that the selected mapping does not cover
            (for example labels that were already encoded). Validation happens
            before any element is written, so ``y`` is left untouched on error.
    """
    remap_by_class_count = {
        2: {-1: 0, 1: 1},
        3: {-1: 0, 0: 1, 1: 2},
        5: {-2: 0, -1: 1, 0: 2, 1: 3, 2: 4},
    }

    labels = set(y)
    remap_dict = remap_by_class_count.get(len(labels))
    if remap_dict is None:
        raise ValueError(
            f"encode_y supports 2, 3 or 5 distinct labels, got {len(labels)}"
        )

    # Check every label first so a bad value cannot leave y half-encoded.
    unexpected = labels.difference(remap_dict)
    if unexpected:
        raise ValueError(
            f"unexpected labels {sorted(unexpected)}; "
            f"expected a subset of {sorted(remap_dict)}"
        )

    for i, val in enumerate(y):
        y[i] = remap_dict[val]

    return y

def weight_data(x):
    """Compute relative class weights from the label frequencies in ``x``.

    The least represented class gets weight 1.0; every other class gets
    ``share_of_least_class / share_of_that_class``, so more frequent classes
    receive proportionally smaller weights.

    Args:
        x: Sized iterable of hashable class labels.

    Returns:
        Dict mapping each distinct label to its weight. An empty input yields
        an empty dict.
    """
    # Count every label in a single pass instead of rescanning x once per class.
    counts = {}
    for label in x:
        counts[label] = counts.get(label, 0) + 1
    if not counts:
        return {}

    len_x = len(x)
    percentages = {label: count / len_x for label, count in counts.items()}
    least = min(percentages, key=percentages.get)
    weights = {label: percentages[least] / share for label, share in percentages.items()}
    return weights


['close', 'diffs', 'bin_2', 'bin_3', 'bin_5', 'low', 'vol', 'max_pos', 'std_neg', 'std_pos', 'mean_comp', 'mean_neg', 'mean_pos', 'median_comp', 'count']


In [65]:
def measure_acc(x, y, binary=True, sample_weight=None, train_frac=0.8,
                n_iter=20, n_splits=5, random_state=42, n_jobs=4):
    """Tune an XGBoost classifier on the head of the data and score it on the tail.

    The first ``train_frac`` of the samples (in their given order) is used for a
    randomized hyper-parameter search with time-series cross-validation; the
    remaining samples are used once to compute the confusion matrix and accuracy.

    Args:
        x: Feature rows, sliceable with ``x[:k]`` / ``x[k:]``.
        y: Labels aligned with ``x``, sliceable the same way.
        binary: If True use the ``binary:logistic`` objective, otherwise
            ``multi:softprob``.
        sample_weight: Optional per-sample weights aligned with ``x`` (e.g. the
            result of ``compute_sample_weight(weight_data(y), y)``). Only the
            training part is used; it is forwarded to ``fit``, which is where
            sample weights belong (not the classifier constructor).
            NOTE(review): relies on RandomizedSearchCV forwarding fit parameters
            to the estimator and slicing them per fold — confirm for the
            installed scikit-learn version / metadata-routing setting.
        train_frac: Fraction of samples used for training and the search.
        n_iter: Number of sampled hyper-parameter candidates.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        random_state: Seed for both the classifier and the search.
        n_jobs: Number of parallel jobs for the search.

    Returns:
        Tuple ``(cm, acc)``: confusion matrix and accuracy on the held-out tail.
    """
    objective = "binary:logistic" if binary else "multi:softprob"
    clf = xgb.XGBClassifier(objective=objective, random_state=random_state)

    # scipy's uniform(loc, scale) samples from [loc, loc + scale];
    # randint(low, high) samples integers from [low, high).
    params = {
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 0.5),
        "learning_rate": uniform(0.003, 0.3),
        "max_depth": randint(2, 6),
        "n_estimators": randint(100, 400),
        "subsample": uniform(0.6, 0.4),
    }
    xgb_search = RandomizedSearchCV(
        clf,
        param_distributions=params,
        random_state=random_state,
        n_iter=n_iter,
        cv=TimeSeriesSplit(n_splits=n_splits),
        verbose=1,
        n_jobs=n_jobs,
    )

    split = int(train_frac * len(x))

    fit_params = {}
    if sample_weight is not None:
        # Must have exactly one weight per training row so it can be split per CV fold.
        fit_params["sample_weight"] = np.asarray(sample_weight)[:split]
    xgb_search.fit(x[:split], y[:split], **fit_params)

    y_test = np.asarray(y[split:])
    y_pred = xgb_search.predict(x[split:])
    cm = confusion_matrix(y_test, y_pred)
    acc = np.mean(y_pred == y_test)

    return cm, acc

In [59]:
# Build x/y for the "bin_2" target with lag=16, then show the per-sample weights
# derived from the class frequencies (the last expression is the cell output).
x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_2", lag=16)
class_weights = weight_data(y)
compute_sample_weight(class_weights, y)

array([1.       , 1.       , 0.9332134, ..., 0.9332134, 1.       ,
       0.9332134])

In [122]:
# Disabled experiment, kept for reference: for each lag value, build the "bin_3"
# data set, encode the labels and store the confusion matrix / accuracy per lag.
# NOTE(review): the "Fitting 5 folds ..." output saved under this cell presumably
# comes from a run made before the code was commented out — re-run or clear it.
# cm = {}
# acc = {}
# for lag in [3, 6, 10, 16]:
#     x, y = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_3", lag=lag)
#     y = encode_y(y)
#     cm[lag], acc[lag] = measure_acc(x, y, binary=False)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [11]:

# Baseline: accuracy obtained by always predicting class 0 on the held-out tail
# (everything after the first 80% of the samples).
# NOTE(review): uses whatever x / y were last assigned by an earlier cell.
split = int(0.8 * len(x))
y_test = y[split:]
zeroes = pd.Series([0] * len(y_test))
acc = sum(zeroes == y_test) / len(y_test)
print(acc)

0.5436046511627907


In [68]:
# Multi-class run: build x/y for the "bin_3" target with lag=16, encode the labels
# to consecutive class codes, then tune and score the classifier.
x, y_raw = create_x_y(df, x_cols=OTHER_COLS, y_col="bin_3", lag=16)
y = encode_y(y_raw)
cm16, acc16 = measure_acc(x, y, binary=False)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Display the accuracy returned by measure_acc for the lag-16 run.
# NOTE(review): this cell has no execution count and repeats the cell that follows it.
acc16

In [67]:
# Accuracy on the held-out tail (second value returned by measure_acc) for the lag-16 run.
acc16

0.5622093023255814

In [51]:
# Confusion matrix on the held-out tail (first value returned by measure_acc) for the lag-16 run.
# NOTE(review): the saved output is 2x2 and this cell's execution count (51) is lower than
# that of the "bin_3" run (68) that assigns cm16, so the output is presumably stale — re-run.
cm16

array([[270, 549],
       [207, 694]], dtype=int64)