In [1]:
import numpy as np
import os
import sys
from catboost import CatBoostClassifier
from collections import Counter
sys.path.append("../utils")
from pool_iterator import pool_iterator
from metric import calculate_metric
from json_tools import get_from_pool
from itertools import chain

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
train_days_filenames = [
    "../../../data/new_days_data_best_features/day_1.json",
    "../../../data/new_days_data_best_features/day_2.json",
    "../../../data/new_days_data_best_features/day_3.json"
][:1]
test_day_filename = "../../../data/new_days_data_best_features/day_6.json"
position_variants = list(range(10)) + [100]

In [3]:
def make_features(filenames, position):
    return np.array([
        json["factors"]
        for json in chain.from_iterable([pool_iterator(filename) for filename in filenames])
        if json["pos"] == position
    ])

def make_labels(filenames, position):
    return np.array([
        1 if json["target"] > 0 else 0
        for json in chain.from_iterable([pool_iterator(filename) for filename in filenames])
        if json["pos"] == position
    ])

In [4]:
%%time
train_features = [
    make_features(train_days_filenames, position)
    for position in position_variants
]
train_labels = [
    make_labels(train_days_filenames, position)
    for position in position_variants
]

CPU times: user 2min 3s, sys: 2.09 s, total: 2min 5s
Wall time: 2min 6s


In [5]:
def get_positions(features, models):
    predictions = np.array([
        model.predict_proba(features)[:, 1]
        for model in models
    ])
    return np.array(position_variants)[np.argmax(predictions, axis=0)]

In [6]:
%%time
test_features = get_from_pool(pool_iterator(test_day_filename), "factors", np.array)
test_targets = get_from_pool(pool_iterator(test_day_filename), "target", float)
test_probas = get_from_pool(pool_iterator(test_day_filename), "p", float)
test_positions = get_from_pool(pool_iterator(test_day_filename), "pos", int)

CPU times: user 24.5 s, sys: 270 ms, total: 24.8 s
Wall time: 25 s


In [7]:
def train_models():
    models = [CatBoostClassifier(verbose=False) for position in position_variants]
    for features, labels, model in zip(train_features, train_labels, models):
        model.fit(features, labels)
    return models

In [8]:
def test_models(models):
    answers = get_positions(test_features, models)
    return calculate_metric(answers, test_positions, test_targets, test_probas)

In [9]:
metrics = []

In [10]:
for i in range(1000):
    metric = test_models(train_models())
    print('{}) {}'.format(i, metric))
    metrics.append(metric)

0) 0.614189293846602
1) 0.6107913630523424
2) 0.6037967830750739
3) 0.6076002901004758
4) 0.6095618555931865
5) 0.6100069845243922
6) 0.6173867969481607
7) 0.6171057851182431
8) 0.6053822867981178
9) 0.6104277814064417
10) 0.6074443349091523
11) 0.614088492287951
12) 0.612556083253592
13) 0.6117519592404753
14) 0.6132826369393172
15) 0.6196646399335254
16) 0.6104560119389763
17) 0.604598182167581
18) 0.6070042056020428
19) 0.6175041653025443
20) 0.6035297191867701
21) 0.6074832786827826
22) 0.6177254855283659
23) 0.6061901535218691
24) 0.613648872279528
25) 0.6118181443778227
26) 0.6110033489246717
27) 0.6048440482906862
28) 0.5966325934582506
29) 0.6096928148199997
30) 0.6144832087130592
31) 0.6197909049697459
32) 0.6121752466995963
33) 0.6147492057180458
34) 0.6143209090996617
35) 0.6125103567825619
36) 0.6093072649741937
37) 0.6049345885124117
38) 0.6240437857279852
39) 0.6144364431996389


KeyboardInterrupt: 

In [11]:
len(metrics), np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(40, 0.611198007637646, 0.0008367063817216818)

In [None]:
predictions = np.array([
    model.predict_proba(features)[:, 1]
    for model in models
])

In [None]:
i = 0
counter = 0
while counter < 10:
    if np.argmax(predictions[:, i]) == test_positions[i]:
        axis = plt.gca()
        plt.plot(predictions[:,i], color='g' if test_targets[i] > 0 else 'r')
        plt.plot([test_positions[i]] * 2 ,[0, 1], color='b')
        axis.set_ylim(0, 1)
        axis.set_xlim(-0.1, 10.1)
        plt.show()
        counter += 1
    i += 1