In [1]:
import numpy as np
import shutil
import os
from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier
import sys

sys.path.append("../utils/")
from constants import POSITIONS_VARIANTS
sys.path.append("../calculating_predictions")
from calculate_predictions import calculate_predictions
sys.path.append("../evaluation/")
from evaluate import evaluate
from run_evaluation import *

FEATURES_NUMBER = 16

%reload_ext autoreload
%autoreload 2

In [2]:
def greedy_positions(predictions):
    predictions = np.argmax(predictions, axis=2)
    def get_first_one(prediction):
        for i, item in enumerate(prediction):
            if item == 1:
                return i if i != 10 else 100
        return 100
    return np.array([
        get_first_one(prediction)
        for prediction in predictions
    ])


def smart_positions(predictions):
    predictions = predictions[:, :, 1]
    return np.array(POSITIONS_VARIANTS)[np.argmax(predictions, axis=1)]


def smart_positions_with_threshold(predictions, threshold):
    predictions = predictions[:, :, 1]
    positions = np.array(POSITIONS_VARIANTS)[np.argmax(predictions, axis=1)]
    positions[np.argmax(predictions, axis=1) < threshold] = 8
    return positions

def get_greedy_metric(i, threshold, positions, stacking=False, add_base_features=True):
    args = Namespace()
    args.verbose = True
    args.first_feature = 0
    args.last_feature = FEATURES_NUMBER
    args.data_folder = "../../../data/pool_with_queries/train_test_split/"
    args.out_folder = "res"
    args.type = "binary_classification"
    args.labels_to_substruct = None
    args.position_features_num = 1
    args.metric_folder = "metrics1"
    args.model_constructor = lambda verbose: CatBoostClassifier(verbose=False)
    args.additional_features = {
        day: [
            os.path.join("features_models_single_pos", str(i), filename)
            for filename in os.listdir(os.path.join("features_models_single_pos", str(i)))
        ]
        for day in [2, 3]
    }
    if not stacking:
        args.additional_features = None
    args.add_base_features = add_base_features
    args.train_days = [2]
    args.validation_day = None
    args.test_days = [3]
    args.threshold = threshold
    
    def clear(folder):
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)
    
    clear(args.out_folder)
    clear(args.metric_folder)
    
    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, positions)
    
    with open("metrics1/metrics.txt") as handler:
        metric = float(next(handler).strip().split()[0])
        return metric

In [3]:
average_target = 0.6118888430078808

In [5]:
metrics = [get_greedy_metric(i, average_target, greedy_positions) for i in range(10)]

 "2018-06-26 02:44:32.037557": preprocesing started
 "2018-06-26 02:44:33.810955":     base features shape: (142276, 17)
 "2018-06-26 02:44:33.814602":     0 features added
 "2018-06-26 02:44:33.814648":     base features included
 "2018-06-26 02:44:33.814669":     result shape: (142276, 17)
 "2018-06-26 02:44:33.818203": train features shape: (142276, 17)
 "2018-06-26 02:44:35.266532": preprocesing finished
 "2018-06-26 02:44:35.266625": start training on days [2]
 "2018-06-26 02:44:35.266660": using fit without validation
 "2018-06-26 02:45:04.646797": built 1000 trees
 "2018-06-26 02:45:04.646917": saving model in path "tmp_815901/tmp_model_trained_on_2"
 "2018-06-26 02:45:09.171341":     base features shape: (1565036, 17)
 "2018-06-26 02:45:09.202142":     0 features added
 "2018-06-26 02:45:09.202209":     base features included
 "2018-06-26 02:45:09.202232":     result shape: (1565036, 17)
 "2018-06-26 02:45:09.204260": days to predict: [3]
 "2018-06-26 02:45:09.204315": loaded m

 "2018-06-26 02:49:08.145068": preprocesing finished
 "2018-06-26 02:49:08.145169": start training on days [2]
 "2018-06-26 02:49:08.145190": using fit without validation
 "2018-06-26 02:49:35.994919": built 1000 trees
 "2018-06-26 02:49:35.995028": saving model in path "tmp_538627/tmp_model_trained_on_2"
 "2018-06-26 02:49:40.532979":     base features shape: (1565036, 17)
 "2018-06-26 02:49:40.604992":     0 features added
 "2018-06-26 02:49:40.605061":     base features included
 "2018-06-26 02:49:40.605099":     result shape: (1565036, 17)
 "2018-06-26 02:49:40.607278": days to predict: [3]
 "2018-06-26 02:49:40.607373": loaded model from: tmp_538627/tmp_model_trained_on_2
 "2018-06-26 02:49:40.607529": features shape: (1565036, 17)
 "2018-06-26 02:49:40.607558": start predicting on day 3
 "2018-06-26 02:49:44.296621": predictions shape: (142276, 11, 2)
 "2018-06-26 02:49:44.296702": saveing results to res/train_2_test_3
 "2018-06-26 02:49:44.428716": results saved
 "2018-06-26 02:

In [6]:
np.mean(metrics), np.std(metrics) / np.sqrt(10)

(0.5847823932243758, 0.0004937295397864847)

In [7]:
# stacking
metrics = [get_greedy_metric(i, average_target, smart_positions, True, True) for i in range(10)]

 "2018-06-26 02:52:04.353470": preprocesing started
 "2018-06-26 02:52:06.002040":     base features shape: (142276, 17)
 "2018-06-26 02:52:11.061065":     28 features added
 "2018-06-26 02:52:11.061156":     base features included
 "2018-06-26 02:52:11.061182":     result shape: (142276, 45)
 "2018-06-26 02:52:11.071936": train features shape: (142276, 45)
 "2018-06-26 02:52:12.525042": preprocesing finished
 "2018-06-26 02:52:12.525138": start training on days [2]
 "2018-06-26 02:52:12.525252": using fit without validation
 "2018-06-26 02:52:53.098403": built 1000 trees
 "2018-06-26 02:52:53.098513": saving model in path "tmp_585377/tmp_model_trained_on_2"
 "2018-06-26 02:52:57.838615":     base features shape: (1565036, 17)
 "2018-06-26 02:53:19.152400":     28 features added
 "2018-06-26 02:53:19.152499":     base features included
 "2018-06-26 02:53:19.152527":     result shape: (1565036, 45)
 "2018-06-26 02:53:19.225295": days to predict: [3]
 "2018-06-26 02:53:19.225390": loaded

 "2018-06-26 03:00:31.977658": preprocesing finished
 "2018-06-26 03:00:31.977743": start training on days [2]
 "2018-06-26 03:00:31.977766": using fit without validation
 "2018-06-26 03:01:09.304955": built 1000 trees
 "2018-06-26 03:01:09.305035": saving model in path "tmp_940047/tmp_model_trained_on_2"
 "2018-06-26 03:01:13.734839":     base features shape: (1565036, 17)
 "2018-06-26 03:01:35.031283":     28 features added
 "2018-06-26 03:01:35.031375":     base features included
 "2018-06-26 03:01:35.031403":     result shape: (1565036, 45)
 "2018-06-26 03:01:35.059516": days to predict: [3]
 "2018-06-26 03:01:35.059599": loaded model from: tmp_940047/tmp_model_trained_on_2
 "2018-06-26 03:01:35.059641": features shape: (1565036, 45)
 "2018-06-26 03:01:35.059750": start predicting on day 3
 "2018-06-26 03:01:41.376871": predictions shape: (142276, 11, 2)
 "2018-06-26 03:01:41.376949": saveing results to res/train_2_test_3
 "2018-06-26 03:01:41.480569": results saved
 "2018-06-26 03

In [8]:
np.mean(metrics), np.std(metrics) / np.sqrt(10)

(0.6198781838638694, 0.0020780924744599905)

In [9]:
# stacking no base features
metrics = [get_greedy_metric(i, average_target, smart_positions, True, False) for i in range(10)]

 "2018-06-26 03:05:50.155438": preprocesing started
 "2018-06-26 03:05:51.907926":     base features shape: (142276, 17)
 "2018-06-26 03:05:55.802982":     28 features added
 "2018-06-26 03:05:55.803090":     base features not included
 "2018-06-26 03:05:55.803117":     result shape: (142276, 28)
 "2018-06-26 03:05:55.805736": train features shape: (142276, 28)
 "2018-06-26 03:05:57.123716": preprocesing finished
 "2018-06-26 03:05:57.123806": start training on days [2]
 "2018-06-26 03:05:57.123900": using fit without validation
 "2018-06-26 03:06:28.375398": built 1000 trees
 "2018-06-26 03:06:28.375474": saving model in path "tmp_190953/tmp_model_trained_on_2"
 "2018-06-26 03:06:33.026320":     base features shape: (1565036, 17)
 "2018-06-26 03:06:54.721091":     28 features added
 "2018-06-26 03:06:54.721200":     base features not included
 "2018-06-26 03:06:54.721419":     result shape: (1565036, 28)
 "2018-06-26 03:06:54.749580": days to predict: [3]
 "2018-06-26 03:06:54.749650"

 "2018-06-26 03:13:21.053515":     28 features added
 "2018-06-26 03:13:21.053614":     base features not included
 "2018-06-26 03:13:21.053644":     result shape: (142276, 28)
 "2018-06-26 03:13:21.068598": train features shape: (142276, 28)
 "2018-06-26 03:13:22.450476": preprocesing finished
 "2018-06-26 03:13:22.450600": start training on days [2]
 "2018-06-26 03:13:22.450697": using fit without validation
 "2018-06-26 03:13:55.668395": built 1000 trees
 "2018-06-26 03:13:55.668500": saving model in path "tmp_797809/tmp_model_trained_on_2"
 "2018-06-26 03:14:00.738413":     base features shape: (1565036, 17)
 "2018-06-26 03:14:22.026636":     28 features added
 "2018-06-26 03:14:22.026732":     base features not included
 "2018-06-26 03:14:22.026760":     result shape: (1565036, 28)
 "2018-06-26 03:14:22.061525": days to predict: [3]
 "2018-06-26 03:14:22.061712": loaded model from: tmp_797809/tmp_model_trained_on_2
 "2018-06-26 03:14:22.061815": features shape: (1565036, 28)
 "201

In [11]:
np.mean(metrics), np.std(metrics) / np.sqrt(10)

(0.6140630566710301, 0.0011831280032183562)

In [12]:
metrics = [get_greedy_metric(i, average_target, smart_positions) for i in range(10)]

 "2018-06-26 03:18:10.575876": preprocesing started
 "2018-06-26 03:18:12.245415":     base features shape: (142276, 17)
 "2018-06-26 03:18:12.247164":     0 features added
 "2018-06-26 03:18:12.247208":     base features included
 "2018-06-26 03:18:12.247295":     result shape: (142276, 17)
 "2018-06-26 03:18:12.248825": train features shape: (142276, 17)
 "2018-06-26 03:18:13.704663": preprocesing finished
 "2018-06-26 03:18:13.704759": start training on days [2]
 "2018-06-26 03:18:13.704775": using fit without validation
 "2018-06-26 03:18:42.651887": built 1000 trees
 "2018-06-26 03:18:42.651960": saving model in path "tmp_845797/tmp_model_trained_on_2"
 "2018-06-26 03:18:47.517582":     base features shape: (1565036, 17)
 "2018-06-26 03:18:47.553653":     0 features added
 "2018-06-26 03:18:47.553750":     base features included
 "2018-06-26 03:18:47.553888":     result shape: (1565036, 17)
 "2018-06-26 03:18:47.556929": days to predict: [3]
 "2018-06-26 03:18:47.557015": loaded m

 "2018-06-26 03:22:22.260546": preprocesing finished
 "2018-06-26 03:22:22.260632": start training on days [2]
 "2018-06-26 03:22:22.260739": using fit without validation
 "2018-06-26 03:22:48.854373": built 1000 trees
 "2018-06-26 03:22:48.854478": saving model in path "tmp_44011/tmp_model_trained_on_2"
 "2018-06-26 03:22:53.144048":     base features shape: (1565036, 17)
 "2018-06-26 03:22:53.175056":     0 features added
 "2018-06-26 03:22:53.175141":     base features included
 "2018-06-26 03:22:53.175263":     result shape: (1565036, 17)
 "2018-06-26 03:22:53.177337": days to predict: [3]
 "2018-06-26 03:22:53.177412": loaded model from: tmp_44011/tmp_model_trained_on_2
 "2018-06-26 03:22:53.177433": features shape: (1565036, 17)
 "2018-06-26 03:22:53.177484": start predicting on day 3
 "2018-06-26 03:22:56.496890": predictions shape: (142276, 11, 2)
 "2018-06-26 03:22:56.496967": saveing results to res/train_2_test_3
 "2018-06-26 03:22:56.596892": results saved
 "2018-06-26 03:23

In [13]:
np.mean(metrics), np.std(metrics) / np.sqrt(10)

(0.6171324315137259, 0.0006667454943543727)

In [15]:
res = {}
for th in np.linspace(0.1, 0.9, 10):
    metrics = [
        get_greedy_metric(
            1,
            average_target,
            lambda predictions: smart_positions_with_threshold(predictions, th))
        for i in range(5)
    ]
    res[th] = (np.mean(metrics), np.std(metrics) / np.sqrt(5))

 "2018-06-26 03:26:11.388689": preprocesing started
 "2018-06-26 03:26:13.075211":     base features shape: (142276, 17)
 "2018-06-26 03:26:13.077030":     0 features added
 "2018-06-26 03:26:13.077063":     base features included
 "2018-06-26 03:26:13.077245":     result shape: (142276, 17)
 "2018-06-26 03:26:13.079098": train features shape: (142276, 17)
 "2018-06-26 03:26:14.415425": preprocesing finished
 "2018-06-26 03:26:14.415516": start training on days [2]
 "2018-06-26 03:26:14.415550": using fit without validation
 "2018-06-26 03:26:41.633341": built 1000 trees
 "2018-06-26 03:26:41.633414": saving model in path "tmp_814758/tmp_model_trained_on_2"
 "2018-06-26 03:26:46.141330":     base features shape: (1565036, 17)
 "2018-06-26 03:26:46.172873":     0 features added
 "2018-06-26 03:26:46.172939":     base features included
 "2018-06-26 03:26:46.172961":     result shape: (1565036, 17)
 "2018-06-26 03:26:46.174755": days to predict: [3]
 "2018-06-26 03:26:46.174811": loaded m

 "2018-06-26 03:30:29.192792": preprocesing finished
 "2018-06-26 03:30:29.192886": start training on days [2]
 "2018-06-26 03:30:29.192919": using fit without validation
 "2018-06-26 03:30:56.359148": built 1000 trees
 "2018-06-26 03:30:56.359223": saving model in path "tmp_836759/tmp_model_trained_on_2"
 "2018-06-26 03:31:00.966973":     base features shape: (1565036, 17)
 "2018-06-26 03:31:00.998261":     0 features added
 "2018-06-26 03:31:00.998344":     base features included
 "2018-06-26 03:31:00.998366":     result shape: (1565036, 17)
 "2018-06-26 03:31:01.000466": days to predict: [3]
 "2018-06-26 03:31:01.000541": loaded model from: tmp_836759/tmp_model_trained_on_2
 "2018-06-26 03:31:01.000671": features shape: (1565036, 17)
 "2018-06-26 03:31:01.000717": start predicting on day 3
 "2018-06-26 03:31:04.505844": predictions shape: (142276, 11, 2)
 "2018-06-26 03:31:04.505936": saveing results to res/train_2_test_3
 "2018-06-26 03:31:04.609654": results saved
 "2018-06-26 03:

 "2018-06-26 03:35:10.582365": built 1000 trees
 "2018-06-26 03:35:10.582456": saving model in path "tmp_696059/tmp_model_trained_on_2"
 "2018-06-26 03:35:15.087801":     base features shape: (1565036, 17)
 "2018-06-26 03:35:15.119349":     0 features added
 "2018-06-26 03:35:15.119491":     base features included
 "2018-06-26 03:35:15.119617":     result shape: (1565036, 17)
 "2018-06-26 03:35:15.121578": days to predict: [3]
 "2018-06-26 03:35:15.121636": loaded model from: tmp_696059/tmp_model_trained_on_2
 "2018-06-26 03:35:15.121657": features shape: (1565036, 17)
 "2018-06-26 03:35:15.121673": start predicting on day 3
 "2018-06-26 03:35:18.540952": predictions shape: (142276, 11, 2)
 "2018-06-26 03:35:18.541029": saveing results to res/train_2_test_3
 "2018-06-26 03:35:18.645326": results saved
 "2018-06-26 03:35:22.620875": preprocesing started
 "2018-06-26 03:35:24.318576":     base features shape: (142276, 17)
 "2018-06-26 03:35:24.320540":     0 features added
 "2018-06-26 0

 "2018-06-26 03:39:28.949098":     result shape: (1565036, 17)
 "2018-06-26 03:39:28.950929": days to predict: [3]
 "2018-06-26 03:39:28.950985": loaded model from: tmp_361426/tmp_model_trained_on_2
 "2018-06-26 03:39:28.951006": features shape: (1565036, 17)
 "2018-06-26 03:39:28.951101": start predicting on day 3
 "2018-06-26 03:39:32.343533": predictions shape: (142276, 11, 2)
 "2018-06-26 03:39:32.343609": saveing results to res/train_2_test_3
 "2018-06-26 03:39:32.449523": results saved
 "2018-06-26 03:39:36.461051": preprocesing started
 "2018-06-26 03:39:38.180439":     base features shape: (142276, 17)
 "2018-06-26 03:39:38.182075":     0 features added
 "2018-06-26 03:39:38.182100":     base features included
 "2018-06-26 03:39:38.182117":     result shape: (142276, 17)
 "2018-06-26 03:39:38.183601": train features shape: (142276, 17)
 "2018-06-26 03:39:39.527021": preprocesing finished
 "2018-06-26 03:39:39.527107": start training on days [2]
 "2018-06-26 03:39:39.527140": us

 "2018-06-26 03:43:50.762094": preprocesing started
 "2018-06-26 03:43:52.434586":     base features shape: (142276, 17)
 "2018-06-26 03:43:52.436440":     0 features added
 "2018-06-26 03:43:52.436523":     base features included
 "2018-06-26 03:43:52.436619":     result shape: (142276, 17)
 "2018-06-26 03:43:52.438335": train features shape: (142276, 17)
 "2018-06-26 03:43:53.766170": preprocesing finished
 "2018-06-26 03:43:53.766259": start training on days [2]
 "2018-06-26 03:43:53.766277": using fit without validation
 "2018-06-26 03:44:20.782890": built 1000 trees
 "2018-06-26 03:44:20.782983": saving model in path "tmp_883559/tmp_model_trained_on_2"
 "2018-06-26 03:44:25.528453":     base features shape: (1565036, 17)
 "2018-06-26 03:44:25.559175":     0 features added
 "2018-06-26 03:44:25.559222":     base features included
 "2018-06-26 03:44:25.559242":     result shape: (1565036, 17)
 "2018-06-26 03:44:25.561010": days to predict: [3]
 "2018-06-26 03:44:25.561065": loaded m

 "2018-06-26 03:48:07.991691": preprocesing finished
 "2018-06-26 03:48:07.991781": start training on days [2]
 "2018-06-26 03:48:07.991800": using fit without validation
 "2018-06-26 03:48:35.171236": built 1000 trees
 "2018-06-26 03:48:35.171307": saving model in path "tmp_303797/tmp_model_trained_on_2"
 "2018-06-26 03:48:39.698541":     base features shape: (1565036, 17)
 "2018-06-26 03:48:39.730185":     0 features added
 "2018-06-26 03:48:39.730269":     base features included
 "2018-06-26 03:48:39.730291":     result shape: (1565036, 17)
 "2018-06-26 03:48:39.732352": days to predict: [3]
 "2018-06-26 03:48:39.732428": loaded model from: tmp_303797/tmp_model_trained_on_2
 "2018-06-26 03:48:39.732449": features shape: (1565036, 17)
 "2018-06-26 03:48:39.732465": start predicting on day 3
 "2018-06-26 03:48:43.129193": predictions shape: (142276, 11, 2)
 "2018-06-26 03:48:43.129267": saveing results to res/train_2_test_3
 "2018-06-26 03:48:43.233738": results saved
 "2018-06-26 03:

 "2018-06-26 03:52:49.708790": built 1000 trees
 "2018-06-26 03:52:49.708862": saving model in path "tmp_282433/tmp_model_trained_on_2"
 "2018-06-26 03:52:54.264388":     base features shape: (1565036, 17)
 "2018-06-26 03:52:54.296048":     0 features added
 "2018-06-26 03:52:54.296117":     base features included
 "2018-06-26 03:52:54.296147":     result shape: (1565036, 17)
 "2018-06-26 03:52:54.298031": days to predict: [3]
 "2018-06-26 03:52:54.298091": loaded model from: tmp_282433/tmp_model_trained_on_2
 "2018-06-26 03:52:54.298118": features shape: (1565036, 17)
 "2018-06-26 03:52:54.298135": start predicting on day 3
 "2018-06-26 03:52:57.683234": predictions shape: (142276, 11, 2)
 "2018-06-26 03:52:57.683309": saveing results to res/train_2_test_3
 "2018-06-26 03:52:57.787204": results saved
 "2018-06-26 03:53:01.868099": preprocesing started
 "2018-06-26 03:53:03.600212":     base features shape: (142276, 17)
 "2018-06-26 03:53:03.602028":     0 features added
 "2018-06-26 0

 "2018-06-26 03:57:08.326905":     base features shape: (1565036, 17)
 "2018-06-26 03:57:08.359354":     0 features added
 "2018-06-26 03:57:08.359424":     base features included
 "2018-06-26 03:57:08.359541":     result shape: (1565036, 17)
 "2018-06-26 03:57:08.361489": days to predict: [3]
 "2018-06-26 03:57:08.361546": loaded model from: tmp_220765/tmp_model_trained_on_2
 "2018-06-26 03:57:08.361594": features shape: (1565036, 17)
 "2018-06-26 03:57:08.361626": start predicting on day 3
 "2018-06-26 03:57:11.754588": predictions shape: (142276, 11, 2)
 "2018-06-26 03:57:11.754667": saveing results to res/train_2_test_3
 "2018-06-26 03:57:11.860057": results saved
 "2018-06-26 03:57:15.911167": preprocesing started
 "2018-06-26 03:57:17.564989":     base features shape: (142276, 17)
 "2018-06-26 03:57:17.566619":     0 features added
 "2018-06-26 03:57:17.566662":     base features included
 "2018-06-26 03:57:17.566747":     result shape: (142276, 17)
 "2018-06-26 03:57:17.568237":

 "2018-06-26 04:01:21.833027": days to predict: [3]
 "2018-06-26 04:01:21.833080": loaded model from: tmp_398423/tmp_model_trained_on_2
 "2018-06-26 04:01:21.833119": features shape: (1565036, 17)
 "2018-06-26 04:01:21.833149": start predicting on day 3
 "2018-06-26 04:01:25.263232": predictions shape: (142276, 11, 2)
 "2018-06-26 04:01:25.263308": saveing results to res/train_2_test_3
 "2018-06-26 04:01:25.365270": results saved


In [16]:
res

{0.1: (0.6169637698783023, 0.0016096873410240398),
 0.18888888888888888: (0.6157680892822627, 0.003071613690000961),
 0.2777777777777778: (0.6133279115775038, 0.0024073689462938085),
 0.3666666666666667: (0.6138452520576748, 0.0021336222982461872),
 0.4555555555555556: (0.6151715497373778, 0.0029970185879886033),
 0.5444444444444445: (0.6119849556958805, 0.0015555245902290058),
 0.6333333333333333: (0.6118932242323092, 0.0026777342111846823),
 0.7222222222222222: (0.6100954166703613, 0.0019636606242704408),
 0.8111111111111111: (0.6160171631774499, 0.001377470147915926),
 0.9: (0.6132512392187337, 0.00046216270137510417)}

In [None]:
get_greedy_metric(0, average_target, smart_positions)