In [1]:
import numpy as np
import shutil
import os
from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier
import sys

sys.path.append("../utils/")
from constants import POSITIONS_VARIANTS
sys.path.append("../calculating_predictions")
from calculate_predictions import calculate_predictions
sys.path.append("../evaluation/")
from evaluate import evaluate
from run_evaluation import *

%reload_ext autoreload
%autoreload 2

In [6]:
def greedy_positions(predictions):
    predictions = np.argmax(predictions, axis=2)
    def get_first_one(prediction):
        for i, item in enumerate(prediction):
            if item == 1:
                return i if i != 10 else 100
        return 100
    return np.array([
        get_first_one(prediction)
        for prediction in predictions
    ])


def smart_positions(predictions):
    predictions = predictions[:, :, 1]
    return np.array(POSITIONS_VARIANTS)[np.argmax(predictions, axis=1)]


def smart_positions_with_threshold(predictions, threshold):
    predictions = predictions[:, :, 1]
    positions = np.array(POSITIONS_VARIANTS)[np.argmax(predictions, axis=1)]
    positions[np.argmax(predictions, axis=1) < threshold] = 100
    return positions


def get_greedy_metric(i, threshold, positions):
    args = Namespace()
    args.verbose = True
    args.first_feature = 0
    args.last_feature = -1
    args.data_folder = "../../../data/best_features_days_data/"
    args.out_folder = "res"
    args.type = "binary_classification"
    args.need_position_feature = True
    args.labels_to_substruct = None
    args.metric_folder = "metrics"
    args.model_constructor = lambda verbose: CatBoostClassifier(verbose=False)
    args.additional_features = {
        day: [
            os.path.join("features_models_with_pos", str(i), filename)
            for filename in os.listdir(os.path.join("features_models_with_pos", str(i)))
        ]
        for day in [3, 6]
    }
    args.train_days = [3]
    args.validation_day = None
    args.test_days = [6]
    args.threshold = threshold
    
    def clear(folder):
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)
    
    clear(args.out_folder)
    clear(args.metric_folder)
    
    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, positions)
    
    with open("metrics/metrics.txt") as handler:
        metric = float(next(handler).strip().split()[0])
        return metric

In [3]:
%%time
from pool_iterator import pool_iterator
targets_sum = 0
targets_count = 0
for item in pool_iterator("../../../data/pool.json"):
    targets_count += 1
    targets_sum += item["target"]
average_target = targets_sum / targets_count

CPU times: user 44.8 s, sys: 719 ms, total: 45.6 s
Wall time: 45.6 s


In [4]:
average_target

0.6187247619522409

In [7]:
metrics = [
    get_greedy_metric(
        i,
        average_target,
        lambda predictions: smart_positions(predictions))
    for i in range(15)
]

"2018-06-21 10:53:25.383599": preprocesing started
"2018-06-21 10:53:26.146802":     base features shape: (20203, 80)
"2018-06-21 10:53:27.275424":     28 features added
"2018-06-21 10:53:27.275526":     result shape: (20203, 108)
"2018-06-21 10:53:27.279232": train features shape: (20203, 108)
"2018-06-21 10:53:27.850549": preprocesing finished
"2018-06-21 10:53:27.850648": start training on days [3]
"2018-06-21 10:53:27.850810": using fit without validation
"2018-06-21 10:53:54.650918": built 1000 trees
"2018-06-21 10:53:56.735877":     base features shape: (247423, 80)
"2018-06-21 10:54:01.259355":     28 features added
"2018-06-21 10:54:01.259487":     result shape: (247423, 108)
"2018-06-21 10:54:01.260051": days to predict: [6]
"2018-06-21 10:54:01.260140": features shape: (247423, 108)
"2018-06-21 10:54:01.260339": start predicting on day 6
"2018-06-21 10:54:03.034795": predictions shape: (22493, 11, 2)
"2018-06-21 10:54:03.034940": saveing results to res/train_3_test_6
"2018-06

"2018-06-21 10:58:46.550025": preprocesing started
"2018-06-21 10:58:47.276820":     base features shape: (20203, 80)
"2018-06-21 10:58:47.733820":     28 features added
"2018-06-21 10:58:47.733929":     result shape: (20203, 108)
"2018-06-21 10:58:47.735441": train features shape: (20203, 108)
"2018-06-21 10:58:48.368401": preprocesing finished
"2018-06-21 10:58:48.368475": start training on days [3]
"2018-06-21 10:58:48.368574": using fit without validation
"2018-06-21 10:59:17.695847": built 1000 trees
"2018-06-21 10:59:19.766443":     base features shape: (247423, 80)
"2018-06-21 10:59:24.450973":     28 features added
"2018-06-21 10:59:24.451091":     result shape: (247423, 108)
"2018-06-21 10:59:24.453744": days to predict: [6]
"2018-06-21 10:59:24.453901": features shape: (247423, 108)
"2018-06-21 10:59:24.453985": start predicting on day 6
"2018-06-21 10:59:26.509985": predictions shape: (22493, 11, 2)
"2018-06-21 10:59:26.510053": saveing results to res/train_3_test_6
"2018-06

In [8]:
np.mean(metrics), np.std(metrics) / np.sqrt(15)

(0.6290778984929912, 0.00376453024270512)

In [9]:
res = {}
for th in np.linspace(0.1, 0.9, 10):
    metrics = [
        get_greedy_metric(
            i,
            average_target,
            lambda predictions: smart_positions_with_threshold(predictions, th))
        for i in range(5)
    ]
    res[th] = (np.mean(metrics), np.std(metrics) / np.sqrt(5))

"2018-06-21 11:03:28.845038": preprocesing started
"2018-06-21 11:03:29.597109":     base features shape: (20203, 80)
"2018-06-21 11:03:30.068041":     28 features added
"2018-06-21 11:03:30.068132":     result shape: (20203, 108)
"2018-06-21 11:03:30.069660": train features shape: (20203, 108)
"2018-06-21 11:03:30.643728": preprocesing finished
"2018-06-21 11:03:30.643827": start training on days [3]
"2018-06-21 11:03:30.643979": using fit without validation
"2018-06-21 11:03:56.864927": built 1000 trees
"2018-06-21 11:03:58.978888":     base features shape: (247423, 80)
"2018-06-21 11:04:03.570900":     28 features added
"2018-06-21 11:04:03.571030":     result shape: (247423, 108)
"2018-06-21 11:04:03.573859": days to predict: [6]
"2018-06-21 11:04:03.574014": features shape: (247423, 108)
"2018-06-21 11:04:03.574032": start predicting on day 6
"2018-06-21 11:04:05.362599": predictions shape: (22493, 11, 2)
"2018-06-21 11:04:05.362678": saveing results to res/train_3_test_6
"2018-06

"2018-06-21 11:08:46.886105": preprocesing started
"2018-06-21 11:08:47.607207":     base features shape: (20203, 80)
"2018-06-21 11:08:48.083674":     28 features added
"2018-06-21 11:08:48.083769":     result shape: (20203, 108)
"2018-06-21 11:08:48.085243": train features shape: (20203, 108)
"2018-06-21 11:08:48.692865": preprocesing finished
"2018-06-21 11:08:48.692933": start training on days [3]
"2018-06-21 11:08:48.692950": using fit without validation
"2018-06-21 11:09:15.133859": built 1000 trees
"2018-06-21 11:09:17.118259":     base features shape: (247423, 80)
"2018-06-21 11:09:21.500043":     28 features added
"2018-06-21 11:09:21.500141":     result shape: (247423, 108)
"2018-06-21 11:09:21.502847": days to predict: [6]
"2018-06-21 11:09:21.502887": features shape: (247423, 108)
"2018-06-21 11:09:21.502902": start predicting on day 6
"2018-06-21 11:09:23.227038": predictions shape: (22493, 11, 2)
"2018-06-21 11:09:23.227114": saveing results to res/train_3_test_6
"2018-06

"2018-06-21 11:14:09.625378": preprocesing started
"2018-06-21 11:14:10.423558":     base features shape: (20203, 80)
"2018-06-21 11:14:11.065518":     28 features added
"2018-06-21 11:14:11.065624":     result shape: (20203, 108)
"2018-06-21 11:14:11.067487": train features shape: (20203, 108)
"2018-06-21 11:14:11.697518": preprocesing finished
"2018-06-21 11:14:11.697616": start training on days [3]
"2018-06-21 11:14:11.697665": using fit without validation
"2018-06-21 11:14:41.569311": built 1000 trees
"2018-06-21 11:14:43.699739":     base features shape: (247423, 80)
"2018-06-21 11:14:49.055691":     28 features added
"2018-06-21 11:14:49.055782":     result shape: (247423, 108)
"2018-06-21 11:14:49.056295": days to predict: [6]
"2018-06-21 11:14:49.056396": features shape: (247423, 108)
"2018-06-21 11:14:49.056415": start predicting on day 6
"2018-06-21 11:14:50.936180": predictions shape: (22493, 11, 2)
"2018-06-21 11:14:50.936314": saveing results to res/train_3_test_6
"2018-06

"2018-06-21 11:19:37.709466": preprocesing started
"2018-06-21 11:19:38.431218":     base features shape: (20203, 80)
"2018-06-21 11:19:38.923862":     28 features added
"2018-06-21 11:19:38.923981":     result shape: (20203, 108)
"2018-06-21 11:19:38.925339": train features shape: (20203, 108)
"2018-06-21 11:19:39.512226": preprocesing finished
"2018-06-21 11:19:39.512294": start training on days [3]
"2018-06-21 11:19:39.512309": using fit without validation
"2018-06-21 11:20:06.626396": built 1000 trees
"2018-06-21 11:20:08.623026":     base features shape: (247423, 80)
"2018-06-21 11:20:13.764147":     28 features added
"2018-06-21 11:20:13.764242":     result shape: (247423, 108)
"2018-06-21 11:20:13.767123": days to predict: [6]
"2018-06-21 11:20:13.767175": features shape: (247423, 108)
"2018-06-21 11:20:13.767391": start predicting on day 6
"2018-06-21 11:20:15.660315": predictions shape: (22493, 11, 2)
"2018-06-21 11:20:15.660395": saveing results to res/train_3_test_6
"2018-06

"2018-06-21 11:25:11.999725": preprocesing started
"2018-06-21 11:25:12.727878":     base features shape: (20203, 80)
"2018-06-21 11:25:13.234945":     28 features added
"2018-06-21 11:25:13.235036":     result shape: (20203, 108)
"2018-06-21 11:25:13.236505": train features shape: (20203, 108)
"2018-06-21 11:25:13.810320": preprocesing finished
"2018-06-21 11:25:13.810419": start training on days [3]
"2018-06-21 11:25:13.810572": using fit without validation
"2018-06-21 11:25:43.154625": built 1000 trees
"2018-06-21 11:25:45.329240":     base features shape: (247423, 80)
"2018-06-21 11:25:50.517993":     28 features added
"2018-06-21 11:25:50.518109":     result shape: (247423, 108)
"2018-06-21 11:25:50.521152": days to predict: [6]
"2018-06-21 11:25:50.521389": features shape: (247423, 108)
"2018-06-21 11:25:50.521442": start predicting on day 6
"2018-06-21 11:25:52.393765": predictions shape: (22493, 11, 2)
"2018-06-21 11:25:52.393848": saveing results to res/train_3_test_6
"2018-06

"2018-06-21 11:30:50.853454": preprocesing started
"2018-06-21 11:30:51.624421":     base features shape: (20203, 80)
"2018-06-21 11:30:52.129427":     28 features added
"2018-06-21 11:30:52.129569":     result shape: (20203, 108)
"2018-06-21 11:30:52.131542": train features shape: (20203, 108)
"2018-06-21 11:30:52.739263": preprocesing finished
"2018-06-21 11:30:52.739364": start training on days [3]
"2018-06-21 11:30:52.739463": using fit without validation
"2018-06-21 11:31:22.680899": built 1000 trees
"2018-06-21 11:31:24.976991":     base features shape: (247423, 80)
"2018-06-21 11:31:29.737999":     28 features added
"2018-06-21 11:31:29.738093":     result shape: (247423, 108)
"2018-06-21 11:31:29.739965": days to predict: [6]
"2018-06-21 11:31:29.739999": features shape: (247423, 108)
"2018-06-21 11:31:29.740109": start predicting on day 6
"2018-06-21 11:31:31.769247": predictions shape: (22493, 11, 2)
"2018-06-21 11:31:31.769315": saveing results to res/train_3_test_6
"2018-06

"2018-06-21 11:36:31.263700": preprocesing started
"2018-06-21 11:36:32.022117":     base features shape: (20203, 80)
"2018-06-21 11:36:32.532119":     28 features added
"2018-06-21 11:36:32.532246":     result shape: (20203, 108)
"2018-06-21 11:36:32.533680": train features shape: (20203, 108)
"2018-06-21 11:36:33.112943": preprocesing finished
"2018-06-21 11:36:33.113039": start training on days [3]
"2018-06-21 11:36:33.113057": using fit without validation
"2018-06-21 11:37:00.671005": built 1000 trees
"2018-06-21 11:37:02.818853":     base features shape: (247423, 80)
"2018-06-21 11:37:07.692629":     28 features added
"2018-06-21 11:37:07.692729":     result shape: (247423, 108)
"2018-06-21 11:37:07.698664": days to predict: [6]
"2018-06-21 11:37:07.698822": features shape: (247423, 108)
"2018-06-21 11:37:07.698881": start predicting on day 6
"2018-06-21 11:37:09.663789": predictions shape: (22493, 11, 2)
"2018-06-21 11:37:09.663886": saveing results to res/train_3_test_6
"2018-06

In [10]:
res

{0.1: (0.6394855169420053, 0.006355355560028233),
 0.18888888888888888: (0.6324336954197758, 0.0038223396954941793),
 0.2777777777777778: (0.624825596954374, 0.006744816715426257),
 0.3666666666666667: (0.6329471055489752, 0.0036222230359973144),
 0.4555555555555556: (0.6280418908548082, 0.0031158803000135376),
 0.5444444444444445: (0.620930375185071, 0.0064114734396999995),
 0.6333333333333333: (0.6265237116676214, 0.006023386620660801),
 0.7222222222222222: (0.6290677620762715, 0.0035559762985195),
 0.8111111111111111: (0.6271564049088594, 0.004010699051181721),
 0.9: (0.6426601288422609, 0.00451650280938653)}