In [1]:
import numpy as np
import os
import sys
from catboost import CatBoostClassifier
from collections import Counter
sys.path.append("../utils")
from pool_iterator import pool_iterator
from metric import calculate_metric
from json_tools import get_from_pool
from itertools import chain

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Day used only for evaluation.
test_day_filename = "../../../data/new_days_data_best_features/day_6.json"

# Candidate training days; the trailing slice keeps only the first two,
# so day_3 is listed but not used.
train_days_filenames = [
    "../../../data/new_days_data_best_features/day_1.json",
    "../../../data/new_days_data_best_features/day_2.json",
    "../../../data/new_days_data_best_features/day_3.json",
][:2]

# Positions for which a separate classifier is trained: 0..9 plus 100.
position_variants = [*range(10), 100]

In [3]:
def make_features(filenames, position):
    """Collect the "factors" of every record whose "pos" equals `position`.

    Records are read from `pool_iterator(filename)` for each filename, in the
    order the filenames are given.

    Returns:
        np.ndarray built from the selected "factors" values (empty array when
        nothing matches).
    """
    # Create all iterators up front, exactly as the original list did.
    record_streams = [pool_iterator(filename) for filename in filenames]
    selected = []
    for stream in record_streams:
        for record in stream:
            if record["pos"] == position:
                selected.append(record["factors"])
    return np.array(selected)

def make_labels(filenames, position):
    """Build binary labels for every record whose "pos" equals `position`.

    A record is labelled 1 when its "target" is strictly positive and 0
    otherwise. Records are read from `pool_iterator(filename)` for each
    filename, in the order the filenames are given.

    Returns:
        np.ndarray of 0/1 labels (empty array when nothing matches).
    """
    # Create all iterators up front, exactly as the original list did.
    record_streams = [pool_iterator(filename) for filename in filenames]
    labels = []
    for stream in record_streams:
        for record in stream:
            if record["pos"] != position:
                continue
            if record["target"] > 0:
                labels.append(1)
            else:
                labels.append(0)
    return np.array(labels)

In [4]:
%%time
train_features = [
    make_features(train_days_filenames, position)
    for position in position_variants
]
train_labels = [
    make_labels(train_days_filenames, position)
    for position in position_variants
]

CPU times: user 4min 10s, sys: 3.95 s, total: 4min 14s
Wall time: 4min 14s


In [5]:
def get_positions(features, models):
    """Pick, for every row of `features`, the position of the best-scoring model.

    Each model contributes column 1 of its `predict_proba(features)` output
    (presumably the positive-class probability - confirm against the model).
    The index of the model with the highest score is translated into the
    corresponding entry of `position_variants`, so `models` is expected to be
    ordered like `position_variants`.

    Returns:
        np.ndarray with one chosen position per row of `features`.
    """
    per_model_scores = []
    for model in models:
        class_probabilities = model.predict_proba(features)
        per_model_scores.append(class_probabilities[:, 1])
    score_matrix = np.array(per_model_scores)
    best_model_index = np.argmax(score_matrix, axis=0)
    return np.array(position_variants)[best_model_index]

In [6]:
%%time
test_features = get_from_pool(pool_iterator(test_day_filename), "factors", np.array)
test_targets = get_from_pool(pool_iterator(test_day_filename), "target", float)
test_probas = get_from_pool(pool_iterator(test_day_filename), "p", float)
test_positions = get_from_pool(pool_iterator(test_day_filename), "pos", int)

CPU times: user 24 s, sys: 460 ms, total: 24.5 s
Wall time: 24.6 s


In [7]:
def train_models():
    """Fit one fresh CatBoostClassifier per entry of `position_variants`.

    The k-th classifier is fitted on `train_features[k]` / `train_labels[k]`.

    Returns:
        list of classifiers, ordered like `position_variants`.
    """
    # Instantiate every classifier first, then fit them in order.
    models = []
    for _ in position_variants:
        models.append(CatBoostClassifier(verbose=False))
    for model, features, labels in zip(models, train_features, train_labels):
        model.fit(features, labels)
    return models

In [8]:
def test_models(models):
    """Score `models` on the held-out test day.

    The positions chosen by `get_positions` for `test_features` are passed to
    `calculate_metric` together with `test_positions`, `test_targets` and
    `test_probas`; its result is returned unchanged.
    """
    chosen_positions = get_positions(test_features, models)
    score = calculate_metric(
        chosen_positions,
        test_positions,
        test_targets,
        test_probas,
    )
    return score

In [9]:
# Accumulates one metric value per train/evaluate run of the loop below.
metrics = list()

In [10]:
# Retrain all per-position models from scratch and evaluate them, many times
# over, to estimate how much the metric varies between training runs.
# The loop can be interrupted at any point; the results gathered so far stay
# in `metrics`.
for i in range(1000):
    metric = test_models(train_models())
    # Apply the format string: it used to be passed to print() as a separate
    # argument, so the literal "{}) {}" was printed in front of the values.
    print('{}) {}'.format(i, metric))
    metrics.append(metric)

{}) {} 0 0.6193188724510231
{}) {} 1 0.6031349071598254
{}) {} 2 0.6129050482681605
{}) {} 3 0.6041849670153413
{}) {} 4 0.620363289754022
{}) {} 5 0.614091893699567
{}) {} 6 0.6089443687802595
{}) {} 7 0.6200394816133161
{}) {} 8 0.6198258462112811
{}) {} 9 0.6084825601720126
{}) {} 10 0.6060898978156456
{}) {} 11 0.6153274846641478
{}) {} 12 0.6178839593845692
{}) {} 13 0.620991360539514
{}) {} 14 0.6114502487921437
{}) {} 15 0.6207831249929894
{}) {} 16 0.6189418090154217
{}) {} 17 0.6144203408743376
{}) {} 18 0.6181527964722745
{}) {} 19 0.6142914981426795
{}) {} 20 0.6220327141294522
{}) {} 21 0.6149348278906017
{}) {} 22 0.6148339997384918
{}) {} 23 0.6104850787020872
{}) {} 24 0.6097779636974664
{}) {} 25 0.610361733347695
{}) {} 26 0.6247176099119595
{}) {} 27 0.606706666973592
{}) {} 28 0.6122302082792046


KeyboardInterrupt: 

In [11]:
# Summary of the completed runs: (number of runs, mean metric,
# standard error of the mean = std / sqrt(n)).
len(metrics), np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(29, 0.6143346399478994, 0.0010407765292470188)

In [None]:
# `models` and `features` are not defined at notebook level (they only exist
# as locals/parameters of the functions above), so this cell failed on a
# fresh kernel. Train one set of models here and score the test day, which is
# what the plotting cell below indexes alongside test_positions/test_targets.
models = train_models()
# Row k holds column 1 of model k's predict_proba for every test example.
predictions = np.array([
    model.predict_proba(test_features)[:, 1]
    for model in models
])

In [None]:
# Plot the per-model scores for the first 10 test examples where the
# best-scoring model corresponds to the example's actual position.
# Green curve: target > 0; red curve: otherwise. The blue vertical line marks
# the model index of the actual position.
i = 0
counter = 0
# Stop at the end of the data instead of running `i` past the last example
# when fewer than 10 matches exist.
while counter < 10 and i < predictions.shape[1]:
    best_index = int(np.argmax(predictions[:, i]))
    # Translate the model index into a position before comparing, as
    # get_positions does: index 10 stands for position 100, so comparing the
    # raw index with the position could never match those examples.
    if position_variants[best_index] == test_positions[i]:
        axis = plt.gca()
        plt.plot(predictions[:, i], color='g' if test_targets[i] > 0 else 'r')
        # The x axis is the model index, so draw the marker at the index
        # (equal to the position for 0..9, and 10 for position 100).
        plt.plot([best_index] * 2, [0, 1], color='b')
        axis.set_ylim(0, 1)
        axis.set_xlim(-0.1, 10.1)
        plt.show()
        counter += 1
    i += 1