In [1]:
import numpy as np
import shutil
import os
from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier
import sys

sys.path.append("../utils/")
from constants import POSITIONS_VARIANTS
sys.path.append("../calculating_predictions")
from calculate_predictions import calculate_predictions
sys.path.append("../evaluation/")
from evaluate import evaluate
from run_evaluation import *

%reload_ext autoreload
%autoreload 2

In [2]:
def greedy_positions(predictions):
    predictions = np.argmax(predictions, axis=2)
    def get_first_one(prediction):
        for i, item in enumerate(prediction):
            if item == 1:
                return i if i != 10 else 100
        return 100
    return np.array([
        get_first_one(prediction)
        for prediction in predictions
    ])


def smart_positions(predictions):
    predictions = predictions[:, :, 1]
    return np.array(POSITIONS_VARIANTS)[np.argmax(predictions, axis=1)]


def smart_positions_with_threshold(predictions, threshold):
    predictions = predictions[:, :, 1]
    positions = np.array(POSITIONS_VARIANTS)[np.argmax(predictions, axis=1)]
    positions[np.argmax(predictions, axis=1) < threshold] = 100
    return positions


def get_greedy_metric(threshold, positions):
    args = Namespace()
    args.verbose = True
    args.first_feature = 0
    args.last_feature = -1
    args.data_folder = "../../../data/best_features_days_data/"
    args.out_folder = "res"
    args.type = "binary_classification"
    args.need_position_feature = True
    args.labels_to_substruct = None
    args.metric_folder = "metrics"
    args.model_constructor = lambda verbose: CatBoostClassifier(verbose=False)
    args.additional_features = {
        day: [
            os.path.join("features_models_with_pos", str(i), filename)
            for filename in os.listdir(os.path.join("features_models_with_pos", str(i)))
        ]
        for day in [3, 6]
    }
    args.train_days = [3]
    args.validation_day = None
    args.test_days = [6]
    args.threshold = threshold
    
    def clear(folder):
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)
    
    clear(args.out_folder)
    clear(args.metric_folder)
    
    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, positions)
    
    with open("metrics/metrics.txt") as handler:
        metric = float(next(handler).strip().split()[0])
        return metric

In [3]:
%%time
from pool_iterator import pool_iterator
targets_sum = 0
targets_count = 0
for item in pool_iterator("../../../data/pool.json"):
    targets_count += 1
    targets_sum += item["target"]
average_target = targets_sum / targets_count

CPU times: user 46.2 s, sys: 380 ms, total: 46.6 s
Wall time: 46.7 s


In [4]:
average_target

0.6187247619522409

In [5]:
metrics = [get_greedy_metric(average_target, greedy_positions) for i in range(15)]

"2018-06-21 00:08:11.525194": preprocesing started
"2018-06-21 00:08:12.299079":     base features shape: (20203, 80)
"2018-06-21 00:08:12.301092":     0 features added
"2018-06-21 00:08:12.301144":     result shape: (20203, 80)
"2018-06-21 00:08:12.303824": train features shape: (20203, 80)
"2018-06-21 00:08:12.888818": preprocesing finished
"2018-06-21 00:08:12.888904": start training on days [3]
"2018-06-21 00:08:12.889055": using fit without validation
"2018-06-21 00:08:35.566704": built 1000 trees
"2018-06-21 00:08:37.769618":     base features shape: (247423, 80)
"2018-06-21 00:08:37.782301":     0 features added
"2018-06-21 00:08:37.782368":     result shape: (247423, 80)
"2018-06-21 00:08:37.782779": days to predict: [6]
"2018-06-21 00:08:37.782808": features shape: (247423, 80)
"2018-06-21 00:08:37.782908": start predicting on day 6
"2018-06-21 00:08:39.238766": predictions shape: (22493, 11, 2)
"2018-06-21 00:08:39.238834": saveing results to res/train_3_test_6
"2018-06-21 00

"2018-06-21 00:12:16.803332": preprocesing started
"2018-06-21 00:12:17.686788":     base features shape: (20203, 80)
"2018-06-21 00:12:17.689442":     0 features added
"2018-06-21 00:12:17.689499":     result shape: (20203, 80)
"2018-06-21 00:12:17.690732": train features shape: (20203, 80)
"2018-06-21 00:12:18.374646": preprocesing finished
"2018-06-21 00:12:18.374719": start training on days [3]
"2018-06-21 00:12:18.375103": using fit without validation
"2018-06-21 00:12:44.003484": built 1000 trees
"2018-06-21 00:12:46.259913":     base features shape: (247423, 80)
"2018-06-21 00:12:46.272022":     0 features added
"2018-06-21 00:12:46.272061":     result shape: (247423, 80)
"2018-06-21 00:12:46.272415": days to predict: [6]
"2018-06-21 00:12:46.272441": features shape: (247423, 80)
"2018-06-21 00:12:46.272456": start predicting on day 6
"2018-06-21 00:12:47.763404": predictions shape: (22493, 11, 2)
"2018-06-21 00:12:47.763485": saveing results to res/train_3_test_6
"2018-06-21 00

In [6]:
np.mean(metrics), np.std(metrics) / np.sqrt(15)

(0.6101406990646888, 0.0012785364607462897)

In [7]:
metrics = [get_greedy_metric(average_target, smart_positions) for i in range(15)]

"2018-06-21 00:16:09.768760": preprocesing started
"2018-06-21 00:16:10.614897":     base features shape: (20203, 80)
"2018-06-21 00:16:10.617639":     0 features added
"2018-06-21 00:16:10.617682":     result shape: (20203, 80)
"2018-06-21 00:16:10.618671": train features shape: (20203, 80)
"2018-06-21 00:16:11.241981": preprocesing finished
"2018-06-21 00:16:11.242078": start training on days [3]
"2018-06-21 00:16:11.242094": using fit without validation
"2018-06-21 00:16:36.619643": built 1000 trees
"2018-06-21 00:16:38.781625":     base features shape: (247423, 80)
"2018-06-21 00:16:38.793908":     0 features added
"2018-06-21 00:16:38.793971":     result shape: (247423, 80)
"2018-06-21 00:16:38.794483": days to predict: [6]
"2018-06-21 00:16:38.794530": features shape: (247423, 80)
"2018-06-21 00:16:38.794561": start predicting on day 6
"2018-06-21 00:16:40.338038": predictions shape: (22493, 11, 2)
"2018-06-21 00:16:40.338117": saveing results to res/train_3_test_6
"2018-06-21 00

"2018-06-21 00:20:34.194704": preprocesing started
"2018-06-21 00:20:34.937660":     base features shape: (20203, 80)
"2018-06-21 00:20:34.940248":     0 features added
"2018-06-21 00:20:34.940302":     result shape: (20203, 80)
"2018-06-21 00:20:34.941395": train features shape: (20203, 80)
"2018-06-21 00:20:35.553914": preprocesing finished
"2018-06-21 00:20:35.553982": start training on days [3]
"2018-06-21 00:20:35.553998": using fit without validation
"2018-06-21 00:20:59.293041": built 1000 trees
"2018-06-21 00:21:01.567727":     base features shape: (247423, 80)
"2018-06-21 00:21:01.579986":     0 features added
"2018-06-21 00:21:01.580026":     result shape: (247423, 80)
"2018-06-21 00:21:01.580388": days to predict: [6]
"2018-06-21 00:21:01.580415": features shape: (247423, 80)
"2018-06-21 00:21:01.580430": start predicting on day 6
"2018-06-21 00:21:03.095793": predictions shape: (22493, 11, 2)
"2018-06-21 00:21:03.095890": saveing results to res/train_3_test_6
"2018-06-21 00

In [8]:
np.mean(metrics), np.std(metrics) / np.sqrt(15)

(0.6206963085727446, 0.0036245474316436315)

In [3]:
metrics = [
    get_greedy_metric(
        average_target,
        lambda predictions: smart_positions_with_threshold(predictions, 0.5))
    for i in range(15)
]

NameError: name 'average_target' is not defined

In [11]:
np.mean(metrics), np.std(metrics) / np.sqrt(15)

(0.6283399258104762, 0.005144701815914147)

In [15]:
res = {}
for th in np.linspace(0.1, 0.9, 10):
    metrics = [
        get_greedy_metric(
            average_target,
            lambda predictions: smart_positions_with_threshold(predictions, th))
        for i in range(5)
    ]
    res[th] = (np.mean(metrics), np.std(metrics) / np.sqrt(10))

"2018-06-21 00:59:10.256961": preprocesing started
"2018-06-21 00:59:11.036086":     base features shape: (20203, 80)
"2018-06-21 00:59:11.038292":     0 features added
"2018-06-21 00:59:11.038367":     result shape: (20203, 80)
"2018-06-21 00:59:11.040218": train features shape: (20203, 80)
"2018-06-21 00:59:11.664569": preprocesing finished
"2018-06-21 00:59:11.664673": start training on days [3]
"2018-06-21 00:59:11.664778": using fit without validation
"2018-06-21 00:59:34.342034": built 1000 trees
"2018-06-21 00:59:36.382658":     base features shape: (247423, 80)
"2018-06-21 00:59:36.395071":     0 features added
"2018-06-21 00:59:36.395111":     result shape: (247423, 80)
"2018-06-21 00:59:36.395507": days to predict: [6]
"2018-06-21 00:59:36.395534": features shape: (247423, 80)
"2018-06-21 00:59:36.395550": start predicting on day 6
"2018-06-21 00:59:37.864116": predictions shape: (22493, 11, 2)
"2018-06-21 00:59:37.864182": saveing results to res/train_3_test_6
"2018-06-21 00

"2018-06-21 01:03:08.688886": preprocesing started
"2018-06-21 01:03:09.413852":     base features shape: (20203, 80)
"2018-06-21 01:03:09.415009":     0 features added
"2018-06-21 01:03:09.415051":     result shape: (20203, 80)
"2018-06-21 01:03:09.416110": train features shape: (20203, 80)
"2018-06-21 01:03:09.977529": preprocesing finished
"2018-06-21 01:03:09.977623": start training on days [3]
"2018-06-21 01:03:09.977727": using fit without validation
"2018-06-21 01:03:32.402841": built 1000 trees
"2018-06-21 01:03:34.404673":     base features shape: (247423, 80)
"2018-06-21 01:03:34.416731":     0 features added
"2018-06-21 01:03:34.416762":     result shape: (247423, 80)
"2018-06-21 01:03:34.417141": days to predict: [6]
"2018-06-21 01:03:34.417170": features shape: (247423, 80)
"2018-06-21 01:03:34.417185": start predicting on day 6
"2018-06-21 01:03:35.865639": predictions shape: (22493, 11, 2)
"2018-06-21 01:03:35.865733": saveing results to res/train_3_test_6
"2018-06-21 01

"2018-06-21 01:07:09.911732": preprocesing started
"2018-06-21 01:07:10.732189":     base features shape: (20203, 80)
"2018-06-21 01:07:10.733469":     0 features added
"2018-06-21 01:07:10.733515":     result shape: (20203, 80)
"2018-06-21 01:07:10.734618": train features shape: (20203, 80)
"2018-06-21 01:07:11.369246": preprocesing finished
"2018-06-21 01:07:11.369321": start training on days [3]
"2018-06-21 01:07:11.369337": using fit without validation
"2018-06-21 01:07:34.549687": built 1000 trees
"2018-06-21 01:07:36.622317":     base features shape: (247423, 80)
"2018-06-21 01:07:36.634530":     0 features added
"2018-06-21 01:07:36.634563":     result shape: (247423, 80)
"2018-06-21 01:07:36.634944": days to predict: [6]
"2018-06-21 01:07:36.634968": features shape: (247423, 80)
"2018-06-21 01:07:36.634982": start predicting on day 6
"2018-06-21 01:07:38.104702": predictions shape: (22493, 11, 2)
"2018-06-21 01:07:38.104782": saveing results to res/train_3_test_6
"2018-06-21 01

"2018-06-21 01:11:17.654259": preprocesing started
"2018-06-21 01:11:18.378355":     base features shape: (20203, 80)
"2018-06-21 01:11:18.379487":     0 features added
"2018-06-21 01:11:18.379516":     result shape: (20203, 80)
"2018-06-21 01:11:18.380588": train features shape: (20203, 80)
"2018-06-21 01:11:18.973012": preprocesing finished
"2018-06-21 01:11:18.973096": start training on days [3]
"2018-06-21 01:11:18.973249": using fit without validation
"2018-06-21 01:11:42.934940": built 1000 trees
"2018-06-21 01:11:45.115087":     base features shape: (247423, 80)
"2018-06-21 01:11:45.128100":     0 features added
"2018-06-21 01:11:45.128164":     result shape: (247423, 80)
"2018-06-21 01:11:45.128496": days to predict: [6]
"2018-06-21 01:11:45.128525": features shape: (247423, 80)
"2018-06-21 01:11:45.128541": start predicting on day 6
"2018-06-21 01:11:46.639260": predictions shape: (22493, 11, 2)
"2018-06-21 01:11:46.639354": saveing results to res/train_3_test_6
"2018-06-21 01

"2018-06-21 01:15:26.715004": preprocesing started
"2018-06-21 01:15:27.545386":     base features shape: (20203, 80)
"2018-06-21 01:15:27.546555":     0 features added
"2018-06-21 01:15:27.546605":     result shape: (20203, 80)
"2018-06-21 01:15:27.547773": train features shape: (20203, 80)
"2018-06-21 01:15:28.187408": preprocesing finished
"2018-06-21 01:15:28.187493": start training on days [3]
"2018-06-21 01:15:28.187511": using fit without validation
"2018-06-21 01:15:53.049240": built 1000 trees
"2018-06-21 01:15:55.188626":     base features shape: (247423, 80)
"2018-06-21 01:15:55.201773":     0 features added
"2018-06-21 01:15:55.201883":     result shape: (247423, 80)
"2018-06-21 01:15:55.202292": days to predict: [6]
"2018-06-21 01:15:55.202349": features shape: (247423, 80)
"2018-06-21 01:15:55.202383": start predicting on day 6
"2018-06-21 01:15:56.772203": predictions shape: (22493, 11, 2)
"2018-06-21 01:15:56.772286": saveing results to res/train_3_test_6
"2018-06-21 01

"2018-06-21 01:19:44.412433": preprocesing started
"2018-06-21 01:19:45.163560":     base features shape: (20203, 80)
"2018-06-21 01:19:45.164949":     0 features added
"2018-06-21 01:19:45.164997":     result shape: (20203, 80)
"2018-06-21 01:19:45.166083": train features shape: (20203, 80)
"2018-06-21 01:19:45.781170": preprocesing finished
"2018-06-21 01:19:45.781273": start training on days [3]
"2018-06-21 01:19:45.781294": using fit without validation
"2018-06-21 01:20:08.685775": built 1000 trees
"2018-06-21 01:20:10.796678":     base features shape: (247423, 80)
"2018-06-21 01:20:10.809095":     0 features added
"2018-06-21 01:20:10.809133":     result shape: (247423, 80)
"2018-06-21 01:20:10.809537": days to predict: [6]
"2018-06-21 01:20:10.809564": features shape: (247423, 80)
"2018-06-21 01:20:10.809579": start predicting on day 6
"2018-06-21 01:20:12.196629": predictions shape: (22493, 11, 2)
"2018-06-21 01:20:12.196709": saveing results to res/train_3_test_6
"2018-06-21 01

"2018-06-21 01:23:58.072348": preprocesing started
"2018-06-21 01:23:58.796073":     base features shape: (20203, 80)
"2018-06-21 01:23:58.797278":     0 features added
"2018-06-21 01:23:58.797324":     result shape: (20203, 80)
"2018-06-21 01:23:58.798496": train features shape: (20203, 80)
"2018-06-21 01:23:59.417697": preprocesing finished
"2018-06-21 01:23:59.417768": start training on days [3]
"2018-06-21 01:23:59.417785": using fit without validation
"2018-06-21 01:24:24.291491": built 1000 trees
"2018-06-21 01:24:26.510836":     base features shape: (247423, 80)
"2018-06-21 01:24:26.523088":     0 features added
"2018-06-21 01:24:26.523122":     result shape: (247423, 80)
"2018-06-21 01:24:26.523410": days to predict: [6]
"2018-06-21 01:24:26.523434": features shape: (247423, 80)
"2018-06-21 01:24:26.523449": start predicting on day 6
"2018-06-21 01:24:28.025197": predictions shape: (22493, 11, 2)
"2018-06-21 01:24:28.025260": saveing results to res/train_3_test_6
"2018-06-21 01

In [16]:
res

{0.1: (0.6355749236135495, 0.003162723271489616),
 0.18888888888888888: (0.6359131047916202, 0.004209740060774447),
 0.2777777777777778: (0.6290267875927288, 0.004640652431359883),
 0.3666666666666667: (0.622029501396152, 0.005567964267894078),
 0.4555555555555556: (0.6295784621436431, 0.004142436307717166),
 0.5444444444444445: (0.6378853378229242, 0.002472572099384837),
 0.6333333333333333: (0.6346278003512318, 0.0048027658295530665),
 0.7222222222222222: (0.6226123616463015, 0.003784136582314592),
 0.8111111111111111: (0.6308403814607049, 0.0034276639715340013),
 0.9: (0.6259451185491536, 0.004869667873788324)}

In [6]:
for key in res:
    res[key] = (res[key][0], res[key][1] * np.sqrt(10) / np.sqrt(5))

In [2]:
res = {0.1: (0.6355749236135495, 0.003162723271489616),
 0.18888888888888888: (0.6359131047916202, 0.004209740060774447),
 0.2777777777777778: (0.6290267875927288, 0.004640652431359883),
 0.3666666666666667: (0.622029501396152, 0.005567964267894078),
 0.4555555555555556: (0.6295784621436431, 0.004142436307717166),
 0.5444444444444445: (0.6378853378229242, 0.002472572099384837),
 0.6333333333333333: (0.6346278003512318, 0.0048027658295530665),
 0.7222222222222222: (0.6226123616463015, 0.003784136582314592),
 0.8111111111111111: (0.6308403814607049, 0.0034276639715340013),
 0.9: (0.6259451185491536, 0.004869667873788324)}

In [7]:
res

{0.1: (0.6355749236135495, 0.004472766144573619),
 0.18888888888888888: (0.6359131047916202, 0.00595347148801256),
 0.2777777777777778: (0.6290267875927288, 0.0065628736066888245),
 0.3666666666666667: (0.622029501396152, 0.007874290582464586),
 0.4555555555555556: (0.6295784621436431, 0.005858289607640344),
 0.5444444444444445: (0.6378853378229242, 0.0034967449968953527),
 0.6333333333333333: (0.6346278003512318, 0.006792136573056016),
 0.7222222222222222: (0.6226123616463015, 0.0053515772765814675),
 0.8111111111111111: (0.6308403814607049, 0.0048474488758010114),
 0.9: (0.6259451185491536, 0.006886750351364002)}