In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the "Monthly" sheet, discard the row labelled 0 and renumber the
# remaining rows from 0.
data = (
    pd.read_excel("data/Train.xlsx", sheet_name="Monthly")
    .loc[1:]
    .reset_index(drop=True)
)

# The first column is renamed to "timestamp" and then removed, leaving
# only the remaining columns of the sheet.
data.columns = ["timestamp", *data.columns[1:]]
data = data.drop(columns="timestamp")

In [3]:
# Window span used when slicing each series: every window covers
# BATCH_SIZE + 1 consecutive rows.
BATCH_SIZE = 20

# Only referenced by the commented-out CatBoost configuration.
LEARNING_RATE = 0.002

# Seed handed to train_test_split and to the classifier.
RANDOM_SEED = 23

In [4]:
def _sliding_windows(values, window_len):
    """Return every contiguous run of `window_len` items of `values`, one per row.

    Consecutive rows start one position apart, so neighbouring rows overlap
    in `window_len - 1` items.

    Parameters
    ----------
    values : 1-D numpy array
    window_len : int, number of items per window

    Returns
    -------
    2-D numpy array of shape (len(values) - window_len + 1, window_len);
    shape (0, window_len) when `values` is shorter than `window_len`.
    """
    n_windows = len(values) - window_len + 1
    if n_windows <= 0:
        return np.empty((0, window_len), dtype=values.dtype)
    # Collect the rows first and convert once: growing an array with
    # np.append inside the loop re-copies everything on every step.
    return np.array([values[start:start + window_len] for start in range(n_windows)])


# Each window covers BATCH_SIZE + 1 rows (the span of the label slice
# `.loc[i:i + BATCH_SIZE]`, which includes both ends). Slicing is positional
# here, so it does not depend on the frame's index labels.
window_len = BATCH_SIZE + 1

# Maps each stripped column name to the 2-D array of that column's windows.
datasets = {
    column.strip(): _sliding_windows(data[column].values, window_len)
    for column in data.columns
}

In [5]:
# Tag every window with the integer position of the series it came from
# (the class label) and stack the per-series blocks into one array of shape
# (n_series, n_windows, window_len + 1); the label is the last column.
#
# Nothing here rebinds `data`, so the DataFrame loaded earlier stays intact
# and the window-building cell can be re-run after this one.
labelled_blocks = []
for idx, name in enumerate(datasets):
    windows = np.asarray(datasets[name])
    labels = np.full(len(windows), idx)
    # column_stack appends the label as one extra trailing column.
    labelled_blocks.append(np.column_stack((windows, labels)))

# Always an ndarray (even for a single series) so the later `.reshape` works;
# an empty `datasets` yields an empty array.
df = np.stack(labelled_blocks) if labelled_blocks else np.array([])


In [6]:
# Flatten to one row per window: BATCH_SIZE + 1 values followed by the label.
n_columns = BATCH_SIZE + 2
df = df.reshape(-1, n_columns)

# Features are every column but the last; the last column is the label.
X = df[:, :-1]
y = df[:, -1]

# NOTE(review): the split is random over individual windows. If windows of the
# same series overlap, near-identical rows can land in both train and
# validation and inflate the validation score — confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=RANDOM_SEED,
)

# Labels are cast to integer class ids for the classifier; `y` itself is left as is.
y_train, y_val = y_train.astype(int), y_val.astype(int)

In [14]:
# Disabled experiment: CatBoost classifier configuration and fit call, kept commented out so the cell does nothing when run.
# NOTE(review): n_estimators is written as the float 1e4 — confirm whether an int is required before re-enabling.
# clf = CatBoostClassifier(random_seed=RANDOM_SEED, eval_metric="Accuracy", learning_rate=LEARNING_RATE, n_estimators=1e4, task_type="GPU", depth=10, l2_leaf_reg=6, model_size_reg=6, logging_level="Verbose")
# clf.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=1000, plot=True, use_best_model=True, early_stopping_rounds=250)

In [26]:
# Missing values are replaced with 0 and features cast to float32 before fitting.
train_features = pd.DataFrame(X_train).fillna(0).astype(np.float32)

# Random forest of 3000 trees, depth capped at 16, trained on all CPU cores.
clf = RandomForestClassifier(
    n_estimators=3000,
    max_depth=16,
    n_jobs=-1,
    random_state=RANDOM_SEED,
    verbose=1,
)
clf.fit(train_features, y_train.astype(int))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  1.1min finished


In [27]:
# Validation features get the same preparation as the training features:
# NaN -> 0, cast to float32.
val_features = pd.DataFrame(X_val).fillna(0).astype(np.float32)
y_pred = clf.predict(val_features)

n_val = len(y_pred)

# Share of windows whose predicted class index equals the true one.
exact_hits = y_pred == y_val
precise_accuracy = np.sum(exact_hits) / n_val

# Share of windows whose predicted class index is at most 1 away from the true one.
near_hits = np.abs(y_pred - y_val) <= 1
adjacent_accuracy = np.sum(near_hits) / n_val

print(f"{precise_accuracy=:.3f} {adjacent_accuracy=:.3f}")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    4.5s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    6.2s


precise_accuracy=0.715 adjacent_accuracy=0.741


[Parallel(n_jobs=8)]: Done 3000 out of 3000 | elapsed:    7.6s finished


In [30]:
# Write the full feature matrix and label vector (before the train/validation
# split) to CSV, without an index column.
for array, path in (
    (X, "data/classifier_train.csv"),
    (y, "data/classifier_target.csv"),
):
    pd.DataFrame(array).to_csv(path, index=False)