In [1]:
import numpy as np
import shutil
import os
from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier
import sys

sys.path.append("../utils/")
from constants import POSITIONS_VARIANTS
sys.path.append("../calculating_predictions")
from calculate_predictions import calculate_predictions
sys.path.append("../evaluation/")
from evaluate import evaluate
from run_evaluation import *

# Upper bound of the feature range: get_greedy_metric passes this as
# args.last_feature (with args.first_feature = 0).
FEATURES_NUMBER = 16

%reload_ext autoreload
%autoreload 2

In [2]:
def greedy_positions(predictions):
    """Pick, for every sample, the first position whose predicted class is 1.

    The class of each position is the argmax over the last axis of
    ``predictions``. If no position is predicted as class 1, or the first such
    position has index 10, the sentinel value 100 is returned for that sample.

    Returns a numpy array with one chosen position per sample.
    """
    labels = np.argmax(predictions, axis=2)
    chosen = []
    for row in labels:
        hits = np.flatnonzero(row == 1)
        # Index 10 is treated the same way as "nothing found".
        if hits.size == 0 or hits[0] == 10:
            chosen.append(100)
        else:
            chosen.append(int(hits[0]))
    return np.array(chosen)


def smart_positions(predictions):
    """Map each sample to the POSITIONS_VARIANTS entry with the highest class-1 score.

    ``predictions[:, :, 1]`` is taken as the per-variant score; the variant
    with the largest score is selected for every sample.
    """
    positive_scores = predictions[:, :, 1]
    best_variant = np.argmax(positive_scores, axis=1)
    variants = np.array(POSITIONS_VARIANTS)
    return variants[best_variant]


def smart_positions_with_threshold(predictions, threshold, fallback_position=8):
    """Choose the best-scoring position variant, falling back when the score is low.

    For every sample the variant with the highest class-1 score
    (``predictions[:, :, 1]``) is selected from POSITIONS_VARIANTS. If that
    best score is below ``threshold``, ``fallback_position`` is used instead.

    Args:
        predictions: array indexed as [sample, variant, class].
        threshold: minimal class-1 score required to keep the selected variant.
        fallback_position: position assigned to samples below the threshold.
            NOTE(review): the default 8 is kept from the original code; its
            meaning is not visible here — confirm.

    Returns:
        numpy array with one position per sample.
    """
    scores = predictions[:, :, 1]
    best_variant = np.argmax(scores, axis=1)
    # Fancy indexing returns a copy, so the assignment below does not touch
    # the shared variants array.
    positions = np.array(POSITIONS_VARIANTS)[best_variant]
    # Compare the best *score* with the threshold. The previous code compared
    # the argmax *index*, which only ever matched samples whose best variant
    # was index 0.
    positions[np.max(scores, axis=1) < threshold] = fallback_position
    return positions

def get_greedy_metric(i, threshold, positions, stacking=False, add_base_features=True):
    """Train on day 2, predict day 3, evaluate and return the resulting metric.

    Builds the argument namespace consumed by ``calculate_predictions``,
    recreates the output folders, runs prediction and evaluation, and reads
    the metric back from ``metrics.txt`` in the metric folder.

    Args:
        i: index of the sub-folder of ``features_models_single_pos`` whose
            files are used as additional features. Only read when
            ``stacking`` is True.
        threshold: value stored as ``args.threshold``.
        positions: callable forwarded to ``evaluate``.
        stacking: when True, the files of ``features_models_single_pos/<i>``
            are passed as additional features for days 2 and 3.
        add_base_features: value stored as ``args.add_base_features``.

    Returns:
        float parsed from the first whitespace-separated token of the first
        line of the metrics file.
    """
    args = Namespace()
    args.verbose = True
    args.first_feature = 0
    args.last_feature = FEATURES_NUMBER
    args.data_folder = "../../../data/pool_with_queries/train_test_split/"
    args.out_folder = "res"
    args.type = "binary_classification"
    args.labels_to_substruct = None
    args.position_features_num = 1
    args.metric_folder = "metrics1"
    args.model_constructor = lambda verbose: CatBoostClassifier(verbose=False)

    if stacking:
        # Only touch the stacking folder when it is actually needed, so
        # non-stacking runs do not fail if it is absent. The listing is done
        # once and sorted: os.listdir order is arbitrary, and sorting keeps
        # the feature order identical across days and runs.
        features_dir = os.path.join("features_models_single_pos", str(i))
        feature_files = [
            os.path.join(features_dir, filename)
            for filename in sorted(os.listdir(features_dir))
        ]
        args.additional_features = {day: list(feature_files) for day in [2, 3]}
    else:
        args.additional_features = None

    args.add_base_features = add_base_features
    args.train_days = [2]
    args.validation_day = None
    args.test_days = [3]
    args.threshold = threshold

    def clear(folder):
        # Start from an empty folder so results of a previous run cannot leak in.
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)

    clear(args.out_folder)
    clear(args.metric_folder)

    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, positions)

    # Derive the path from args.metric_folder instead of repeating the literal.
    with open(os.path.join(args.metric_folder, "metrics.txt")) as handler:
        return float(next(handler).strip().split()[0])

In [3]:
# Value handed to get_greedy_metric as `threshold` (stored there as args.threshold).
# NOTE(review): judging by the name this is presumably the mean target value
# of the data — confirm where the number comes from.
average_target = 0.6118888430078808

In [5]:
# Baseline: base features only, greedy position selection, 10 runs.
metrics = [
    get_greedy_metric(i, threshold=average_target, positions=greedy_positions)
    for i in range(10)
]

 "2018-06-26 02:44:32.037557": preprocesing started
 "2018-06-26 02:44:33.810955":     base features shape: (142276, 17)
 "2018-06-26 02:44:33.814602":     0 features added
 "2018-06-26 02:44:33.814648":     base features included
 "2018-06-26 02:44:33.814669":     result shape: (142276, 17)
 "2018-06-26 02:44:33.818203": train features shape: (142276, 17)
 "2018-06-26 02:44:35.266532": preprocesing finished
 "2018-06-26 02:44:35.266625": start training on days [2]
 "2018-06-26 02:44:35.266660": using fit without validation
 "2018-06-26 02:45:04.646797": built 1000 trees
 "2018-06-26 02:45:04.646917": saving model in path "tmp_815901/tmp_model_trained_on_2"
 "2018-06-26 02:45:09.171341":     base features shape: (1565036, 17)
 "2018-06-26 02:45:09.202142":     0 features added
 "2018-06-26 02:45:09.202209":     base features included
 "2018-06-26 02:45:09.202232":     result shape: (1565036, 17)
 "2018-06-26 02:45:09.204260": days to predict: [3]
 "2018-06-26 02:45:09.204315": loaded m

 "2018-06-26 02:49:08.145068": preprocesing finished
 "2018-06-26 02:49:08.145169": start training on days [2]
 "2018-06-26 02:49:08.145190": using fit without validation
 "2018-06-26 02:49:35.994919": built 1000 trees
 "2018-06-26 02:49:35.995028": saving model in path "tmp_538627/tmp_model_trained_on_2"
 "2018-06-26 02:49:40.532979":     base features shape: (1565036, 17)
 "2018-06-26 02:49:40.604992":     0 features added
 "2018-06-26 02:49:40.605061":     base features included
 "2018-06-26 02:49:40.605099":     result shape: (1565036, 17)
 "2018-06-26 02:49:40.607278": days to predict: [3]
 "2018-06-26 02:49:40.607373": loaded model from: tmp_538627/tmp_model_trained_on_2
 "2018-06-26 02:49:40.607529": features shape: (1565036, 17)
 "2018-06-26 02:49:40.607558": start predicting on day 3
 "2018-06-26 02:49:44.296621": predictions shape: (142276, 11, 2)
 "2018-06-26 02:49:44.296702": saveing results to res/train_2_test_3
 "2018-06-26 02:49:44.428716": results saved
 "2018-06-26 02:

In [6]:
# Mean metric and its standard error over the runs collected in `metrics`;
# the sample size is taken from the data instead of being hard-coded.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.5847823932243758, 0.0004937295397864847)

In [7]:
# Stacking: additional features from features_models_single_pos/<i> on top of
# the base features, smart position selection, 10 runs.
metrics = [
    get_greedy_metric(
        i,
        threshold=average_target,
        positions=smart_positions,
        stacking=True,
        add_base_features=True,
    )
    for i in range(10)
]

 "2018-06-26 02:52:04.353470": preprocesing started
 "2018-06-26 02:52:06.002040":     base features shape: (142276, 17)
 "2018-06-26 02:52:11.061065":     28 features added
 "2018-06-26 02:52:11.061156":     base features included
 "2018-06-26 02:52:11.061182":     result shape: (142276, 45)
 "2018-06-26 02:52:11.071936": train features shape: (142276, 45)
 "2018-06-26 02:52:12.525042": preprocesing finished
 "2018-06-26 02:52:12.525138": start training on days [2]
 "2018-06-26 02:52:12.525252": using fit without validation
 "2018-06-26 02:52:53.098403": built 1000 trees
 "2018-06-26 02:52:53.098513": saving model in path "tmp_585377/tmp_model_trained_on_2"
 "2018-06-26 02:52:57.838615":     base features shape: (1565036, 17)
 "2018-06-26 02:53:19.152400":     28 features added
 "2018-06-26 02:53:19.152499":     base features included
 "2018-06-26 02:53:19.152527":     result shape: (1565036, 45)
 "2018-06-26 02:53:19.225295": days to predict: [3]
 "2018-06-26 02:53:19.225390": loaded

 "2018-06-26 03:00:31.977658": preprocesing finished
 "2018-06-26 03:00:31.977743": start training on days [2]
 "2018-06-26 03:00:31.977766": using fit without validation
 "2018-06-26 03:01:09.304955": built 1000 trees
 "2018-06-26 03:01:09.305035": saving model in path "tmp_940047/tmp_model_trained_on_2"
 "2018-06-26 03:01:13.734839":     base features shape: (1565036, 17)
 "2018-06-26 03:01:35.031283":     28 features added
 "2018-06-26 03:01:35.031375":     base features included
 "2018-06-26 03:01:35.031403":     result shape: (1565036, 45)
 "2018-06-26 03:01:35.059516": days to predict: [3]
 "2018-06-26 03:01:35.059599": loaded model from: tmp_940047/tmp_model_trained_on_2
 "2018-06-26 03:01:35.059641": features shape: (1565036, 45)
 "2018-06-26 03:01:35.059750": start predicting on day 3
 "2018-06-26 03:01:41.376871": predictions shape: (142276, 11, 2)
 "2018-06-26 03:01:41.376949": saveing results to res/train_2_test_3
 "2018-06-26 03:01:41.480569": results saved
 "2018-06-26 03

In [8]:
# Mean metric and its standard error over the runs collected in `metrics`;
# the sample size is taken from the data instead of being hard-coded.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6198781838638694, 0.0020780924744599905)

In [9]:
# Stacking without base features: only the additional features from
# features_models_single_pos/<i> are used, smart position selection, 10 runs.
metrics = [
    get_greedy_metric(
        i,
        threshold=average_target,
        positions=smart_positions,
        stacking=True,
        add_base_features=False,
    )
    for i in range(10)
]

 "2018-06-26 03:05:50.155438": preprocesing started
 "2018-06-26 03:05:51.907926":     base features shape: (142276, 17)
 "2018-06-26 03:05:55.802982":     28 features added
 "2018-06-26 03:05:55.803090":     base features not included
 "2018-06-26 03:05:55.803117":     result shape: (142276, 28)
 "2018-06-26 03:05:55.805736": train features shape: (142276, 28)
 "2018-06-26 03:05:57.123716": preprocesing finished
 "2018-06-26 03:05:57.123806": start training on days [2]
 "2018-06-26 03:05:57.123900": using fit without validation
 "2018-06-26 03:06:28.375398": built 1000 trees
 "2018-06-26 03:06:28.375474": saving model in path "tmp_190953/tmp_model_trained_on_2"
 "2018-06-26 03:06:33.026320":     base features shape: (1565036, 17)
 "2018-06-26 03:06:54.721091":     28 features added
 "2018-06-26 03:06:54.721200":     base features not included
 "2018-06-26 03:06:54.721419":     result shape: (1565036, 28)
 "2018-06-26 03:06:54.749580": days to predict: [3]
 "2018-06-26 03:06:54.749650"

 "2018-06-26 03:13:21.053515":     28 features added
 "2018-06-26 03:13:21.053614":     base features not included
 "2018-06-26 03:13:21.053644":     result shape: (142276, 28)
 "2018-06-26 03:13:21.068598": train features shape: (142276, 28)
 "2018-06-26 03:13:22.450476": preprocesing finished
 "2018-06-26 03:13:22.450600": start training on days [2]
 "2018-06-26 03:13:22.450697": using fit without validation
 "2018-06-26 03:13:55.668395": built 1000 trees
 "2018-06-26 03:13:55.668500": saving model in path "tmp_797809/tmp_model_trained_on_2"
 "2018-06-26 03:14:00.738413":     base features shape: (1565036, 17)
 "2018-06-26 03:14:22.026636":     28 features added
 "2018-06-26 03:14:22.026732":     base features not included
 "2018-06-26 03:14:22.026760":     result shape: (1565036, 28)
 "2018-06-26 03:14:22.061525": days to predict: [3]
 "2018-06-26 03:14:22.061712": loaded model from: tmp_797809/tmp_model_trained_on_2
 "2018-06-26 03:14:22.061815": features shape: (1565036, 28)
 "201

In [None]:
# Mean metric and its standard error over the runs collected in `metrics`;
# the sample size is taken from the data instead of being hard-coded.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6140630566710301, 0.0011831280032183562)

In [None]:
# Base features only, smart position selection, 10 runs.
metrics = [
    get_greedy_metric(i, threshold=average_target, positions=smart_positions)
    for i in range(10)
]

 "2018-06-26 03:18:10.575876": preprocesing started
 "2018-06-26 03:18:12.245415":     base features shape: (142276, 17)
 "2018-06-26 03:18:12.247164":     0 features added
 "2018-06-26 03:18:12.247208":     base features included
 "2018-06-26 03:18:12.247295":     result shape: (142276, 17)
 "2018-06-26 03:18:12.248825": train features shape: (142276, 17)
 "2018-06-26 03:18:13.704663": preprocesing finished
 "2018-06-26 03:18:13.704759": start training on days [2]
 "2018-06-26 03:18:13.704775": using fit without validation
 "2018-06-26 03:18:42.651887": built 1000 trees
 "2018-06-26 03:18:42.651960": saving model in path "tmp_845797/tmp_model_trained_on_2"
 "2018-06-26 03:18:47.517582":     base features shape: (1565036, 17)
 "2018-06-26 03:18:47.553653":     0 features added
 "2018-06-26 03:18:47.553750":     base features included
 "2018-06-26 03:18:47.553888":     result shape: (1565036, 17)
 "2018-06-26 03:18:47.556929": days to predict: [3]
 "2018-06-26 03:18:47.557015": loaded m

 "2018-06-26 03:22:22.260546": preprocesing finished
 "2018-06-26 03:22:22.260632": start training on days [2]
 "2018-06-26 03:22:22.260739": using fit without validation


In [None]:
# Mean metric and its standard error over the runs collected in `metrics`;
# the sample size is taken from the data instead of being hard-coded.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

In [None]:
# Sweep the threshold passed to smart_positions_with_threshold and record, for
# each value, the mean metric and its standard error over 5 runs.
res = {}
for th in np.linspace(0.1, 0.9, 10):
    metrics = [
        get_greedy_metric(
            i,  # was missing: without it the remaining arguments shift and the call raises TypeError
            average_target,
            lambda predictions: smart_positions_with_threshold(predictions, th))
        for i in range(5)
    ]
    res[th] = (np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics)))

In [None]:
res

In [None]:
# Single run with base features only and smart position selection.
get_greedy_metric(0, threshold=average_target, positions=smart_positions)