In [1]:
import numpy as np
import os
import sys
from catboost import CatBoostClassifier
from collections import Counter
sys.path.append("../utils")
from pool_iterator import pool_iterator
from metric import calculate_metric
from json_tools import get_from_pool
from itertools import chain

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
train_days_filenames = [
    "../../../data/pool_with_queries/train_test_split/day_0.json",
    "../../../data/pool_with_queries/train_test_split/day_1.json",
    "../../../data/pool_with_queries/train_test_split/day_2.json",
]
test_day_filename = "../../../data/new_days_data_best_features/day_3.json"
position_variants = list(range(10)) + [100]

In [3]:
def make_features(filenames, position):
    return np.array([
        json["factors"]
        for json in chain.from_iterable([pool_iterator(filename) for filename in filenames])
        if json["pos"] == position
    ])

def make_labels(filenames, position):
    return np.array([
        1 if json["target"] > 0 else 0
        for json in chain.from_iterable([pool_iterator(filename) for filename in filenames])
        if json["pos"] == position
    ])

In [4]:
%%time
train_features = [
    make_features(train_days_filenames, position)
    for position in position_variants
]
train_labels = [
    make_labels(train_days_filenames, position)
    for position in position_variants
]

CPU times: user 2min 1s, sys: 1.07 s, total: 2min 2s
Wall time: 2min 3s


In [5]:
def get_positions(features, models):
    predictions = np.array([
        model.predict_proba(features)[:, 1]
        for model in models
    ])
    return np.array(position_variants)[np.argmax(predictions, axis=0)]

In [6]:
%%time
test_features = get_from_pool(pool_iterator(test_day_filename), "factors", np.array)
test_targets = get_from_pool(pool_iterator(test_day_filename), "target", float)
test_probas = get_from_pool(pool_iterator(test_day_filename), "p", float)
test_positions = get_from_pool(pool_iterator(test_day_filename), "pos", int)

CPU times: user 18.4 s, sys: 470 ms, total: 18.8 s
Wall time: 19.1 s


In [7]:
def train_models():
    models = [CatBoostClassifier(verbose=False) for position in position_variants]
    for features, labels, model in zip(train_features, train_labels, models):
        model.fit(features, labels)
    return models

In [8]:
def test_models(models):
    answers = get_positions(test_features, models)
    return calculate_metric(answers, test_positions, test_targets, test_probas)

In [9]:
metrics = []

In [10]:
for i in range(1000):
    metric = test_models(train_models())
    print('{}) {}'.format(i, metric))
    metrics.append(metric)

0) 0.6262806958670627
1) 0.618296227707532
2) 0.6105406436087526
3) 0.6172150476189924
4) 0.6157261085679079
5) 0.6115489200431778
6) 0.6176054622412227
7) 0.6223177921501097
8) 0.6187563894788184
9) 0.6208772712234393
10) 0.6250440305990581
11) 0.6240969430759744
12) 0.6200362750591157
13) 0.6198001927364684
14) 0.6247541094854008
15) 0.6197390589588724
16) 0.6183208289590205
17) 0.6117635055101466
18) 0.6299449790810546
19) 0.6130307547387543
20) 0.6184809280444624
21) 0.6129788461235315
22) 0.6204988789696351
23) 0.6192906784961507
24) 0.6257941962592725
25) 0.6137552826206499
26) 0.6273296094797842
27) 0.6164068615266577
28) 0.6163066489490218
29) 0.6209488607429834
30) 0.6223504519537457
31) 0.6231414068325036


KeyboardInterrupt: 

In [11]:
len(metrics), np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(32, 0.6194680589596651, 0.00084905946857444)

In [None]:
predictions = np.array([
    model.predict_proba(features)[:, 1]
    for model in models
])

In [None]:
i = 0
counter = 0
while counter < 10:
    if np.argmax(predictions[:, i]) == test_positions[i]:
        axis = plt.gca()
        plt.plot(predictions[:,i], color='g' if test_targets[i] > 0 else 'r')
        plt.plot([test_positions[i]] * 2 ,[0, 1], color='b')
        axis.set_ylim(0, 1)
        axis.set_xlim(-0.1, 10.1)
        plt.show()
        counter += 1
    i += 1