In [1]:
import numpy as np
import os
import sys
from catboost import CatBoostClassifier
from collections import Counter
sys.path.append("../utils")
from pool_iterator import pool_iterator
from metric import calculate_metric
from json_tools import get_from_pool
from itertools import chain

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Pool files (day_1 .. day_3) whose records are used to fit the per-position models.
train_days_filenames = [
    "../../../data/new_days_data_best_features/day_1.json",
    "../../../data/new_days_data_best_features/day_2.json",
    "../../../data/new_days_data_best_features/day_3.json"
]
# Pool file (day_6) used only for evaluation.
test_day_filename = "../../../data/new_days_data_best_features/day_6.json"
# Candidate positions: 0..9 plus 100; one classifier is trained per entry.
# NOTE(review): 100 looks like a sentinel position value — confirm its meaning against the data.
position_variants = list(range(10)) + [100]

In [3]:
def make_features(filenames, position):
    """Gather the "factors" entry of every pool record whose "pos" equals ``position``.

    Args:
        filenames: iterable of pool file paths, read in the given order.
        position: value compared (with ``==``) against each record's "pos".

    Returns:
        ``np.array`` built from the selected "factors" values, in file order.
    """
    # Create one iterator per file up front, then walk them back to back.
    pools = [pool_iterator(filename) for filename in filenames]
    selected = []
    for record in chain.from_iterable(pools):
        if record["pos"] == position:
            selected.append(record["factors"])
    return np.array(selected)

def make_labels(filenames, position):
    """Build binary labels for every pool record whose "pos" equals ``position``.

    Args:
        filenames: iterable of pool file paths, read in the given order.
        position: value compared (with ``==``) against each record's "pos".

    Returns:
        ``np.array`` holding 1 where the record's "target" is greater than
        zero and 0 otherwise, in file order.
    """
    # Create one iterator per file up front, then walk them back to back.
    pools = [pool_iterator(filename) for filename in filenames]
    labels = []
    for record in chain.from_iterable(pools):
        if record["pos"] != position:
            continue
        if record["target"] > 0:
            labels.append(1)
        else:
            labels.append(0)
    return np.array(labels)

In [4]:
%%time
# Read the training pools ONCE and bucket records by position.
# Calling make_features / make_labels per position re-reads every file twice
# for each of the len(position_variants) positions; a single pass yields the
# same per-position arrays (same record order) at a fraction of the cost.
_features_by_position = {position: [] for position in position_variants}
_labels_by_position = {position: [] for position in position_variants}
for _record in chain.from_iterable(
    pool_iterator(filename) for filename in train_days_filenames
):
    _position = _record["pos"]
    # Records whose position is not a candidate are skipped, as before.
    if _position in _features_by_position:
        _features_by_position[_position].append(_record["factors"])
        _labels_by_position[_position].append(1 if _record["target"] > 0 else 0)

# train_features[k] / train_labels[k] belong to position_variants[k].
train_features = [
    np.array(_features_by_position[position])
    for position in position_variants
]
train_labels = [
    np.array(_labels_by_position[position])
    for position in position_variants
]
# Drop the intermediate Python lists; only the arrays are needed from here on.
del _features_by_position, _labels_by_position

CPU times: user 3min 19s, sys: 4.5 s, total: 3min 23s
Wall time: 3min 23s


In [5]:
def get_positions(features, models):
    """Pick, for every sample, the position whose model gives the highest score.

    Args:
        features: feature matrix passed unchanged to each model's
            ``predict_proba``.
        models: sequence of fitted classifiers, ordered like the module-level
            ``position_variants``.

    Returns:
        ``np.array`` with one entry of ``position_variants`` per sample.
    """
    # Column 1 of predict_proba is used as the score of each model.
    per_model_scores = [model.predict_proba(features)[:, 1] for model in models]
    score_matrix = np.array(per_model_scores)
    # argmax over axis 0 selects the winning model (first one on ties) per sample.
    winners = score_matrix.argmax(axis=0)
    candidates = np.array(position_variants)
    return candidates[winners]

In [6]:
%%time
# Each field is read from its own fresh iterator over the test-day pool,
# in the order: factors, target, p, pos.
test_features, test_targets, test_probas, test_positions = (
    get_from_pool(pool_iterator(test_day_filename), field, converter)
    for field, converter in (
        ("factors", np.array),
        ("target", float),
        ("p", float),
        ("pos", int),
    )
)

CPU times: user 12.8 s, sys: 250 ms, total: 13 s
Wall time: 13 s


In [9]:
def train_models():
    """Fit one fresh ``CatBoostClassifier`` per entry of ``position_variants``.

    Uses the module-level ``train_features`` and ``train_labels``; the k-th
    model is fitted on the k-th feature/label pair.

    Returns:
        List of classifiers, ordered like ``position_variants``.
    """
    classifiers = []
    for _ in position_variants:
        classifiers.append(CatBoostClassifier(verbose=False))
    # zip stops at the shortest input, pairing model k with dataset k.
    for classifier, features, labels in zip(classifiers, train_features, train_labels):
        classifier.fit(features, labels)
    return classifiers

In [10]:
def test_models(models):
    """Score ``models`` on the test day.

    Chooses a position for every test sample with ``get_positions`` and hands
    the choices, together with the module-level ``test_positions``,
    ``test_targets`` and ``test_probas``, to ``calculate_metric``.

    Returns:
        Whatever ``calculate_metric`` returns for those inputs.
    """
    chosen_positions = get_positions(test_features, models)
    score = calculate_metric(
        chosen_positions, test_positions, test_targets, test_probas
    )
    return score

In [11]:
# Accumulates one metric value per train/evaluate run; re-running this cell resets it.
metrics = []

In [12]:
# Repeat train + evaluate many times to measure the run-to-run spread of the
# metric. Results accumulate in `metrics`, so the loop can be interrupted early.
for i in range(1000):
    metric = test_models(train_models())
    # The template must be formatted explicitly: passing it as a separate
    # print() argument emitted the literal "{}) {}" in front of the values.
    print('{}) {}'.format(i, metric))
    metrics.append(metric)

{}) {} 0 0.6235086667848789
{}) {} 1 0.6165921386116425
{}) {} 2 0.5990968617811241
{}) {} 3 0.6108564108983168
{}) {} 4 0.5991517266815154
{}) {} 5 0.6104282768527077
{}) {} 6 0.609809959226088
{}) {} 7 0.6101799393745366
{}) {} 8 0.6081028605790718
{}) {} 9 0.6151072753898912
{}) {} 10 0.6105590698884621
{}) {} 11 0.6107738557056039
{}) {} 12 0.611756148604108
{}) {} 13 0.6126167363056856
{}) {} 14 0.6241679861076052
{}) {} 15 0.6052778349762308
{}) {} 16 0.6055342965244509
{}) {} 17 0.6059249551280406
{}) {} 18 0.6106494241264555
{}) {} 19 0.618169106849541
{}) {} 20 0.6094326910511175
{}) {} 21 0.6192695930647792
{}) {} 22 0.6227922320072775
{}) {} 23 0.6112703702516264
{}) {} 24 0.6112053952867875
{}) {} 25 0.6140251883839338
{}) {} 26 0.6097759934466819
{}) {} 27 0.6127715434883715
{}) {} 28 0.6160225896873328


KeyboardInterrupt: 

In [13]:
# (number of completed runs, mean metric, standard error of the mean).
# np.std is called with its default ddof=0, i.e. the population standard deviation.
len(metrics), np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(29, 0.6118906595539264, 0.001112168105547396)

In [None]:
# `models` and `features` were never defined at notebook level, so this cell
# failed on a fresh kernel. Train one set of models explicitly and score the
# test-day features, which is what the plotting cell below compares against.
models = train_models()
# predictions[k, j] is the column-1 predict_proba score of the model for
# position_variants[k] on test sample j.
predictions = np.array([
    model.predict_proba(test_features)[:, 1]
    for model in models
])

In [None]:
# Plot the per-position score curve of the first 10 test samples whose
# best-scoring position equals the position recorded for that sample.
counter = 0
# Bounded by the number of scored samples, so running out of matches ends the
# loop instead of indexing past the end of the data.
for i in range(predictions.shape[1]):
    if counter >= 10:
        break
    best_index = np.argmax(predictions[:, i])
    # Rows of `predictions` are ordered like position_variants, so translate the
    # row index into a position before comparing (row 10 stands for position 100),
    # matching what get_positions does.
    if position_variants[best_index] != test_positions[i]:
        continue
    axis = plt.gca()
    # Green curve: sample with positive target; red curve: otherwise.
    plt.plot(predictions[:, i], color='g' if test_targets[i] > 0 else 'r')
    # Vertical marker at the matched row index, so position 100 stays inside the x-range.
    plt.plot([best_index] * 2, [0, 1], color='b')
    axis.set_ylim(0, 1)
    axis.set_xlim(-0.1, 10.1)
    # Label the x ticks with the actual positions rather than row indices.
    axis.set_xticks(range(len(position_variants)))
    axis.set_xticklabels(position_variants)
    plt.show()
    counter += 1