In [1]:
import numpy as np
import shutil
import os

import sys
sys.path.append("../utils/")
sys.path.append("../calculating_predictions")
sys.path.append("../evaluation/")
from calculate_predictions import calculate_predictions
from evaluate import evaluate
from run_evaluation import *
from argparse import Namespace
from catboost import CatBoostRegressor

%reload_ext autoreload
%autoreload 2

In [2]:
def get_baseline_metric():
    """Run one baseline train/predict/evaluate cycle and return its metric.

    Builds the ``Namespace`` passed to ``calculate_predictions`` (regression,
    base features plus the position feature, train on day 3, predict day 6,
    no validation day), empties the output and metric folders, runs
    prediction and evaluation, and reads the resulting metric back from disk.

    Returns:
        float: the first whitespace-separated token on the first line of
        ``<args.metric_folder>/metrics.txt``.

    Raises:
        StopIteration: if the metrics file is empty.
        ValueError: if the first token is not parseable as a float.
    """
    args = Namespace()
    args.verbose = True
    args.first_feature = 0
    args.last_feature = -1
    args.data_folder = "../../../data/new_days_data_best_features"
    args.out_folder = "res"
    args.type = "regression"
    args.need_position_feature = True
    args.labels_to_substruct = None
    args.metric_folder = "metrics"
    # The `verbose` argument handed to the constructor is ignored: the model
    # is always created with verbose=False.
    args.model_constructor = lambda verbose: CatBoostRegressor(verbose=False)
    args.additional_features = None
    args.add_base_features = True
    args.train_days = [3]
    args.validation_day = None
    args.test_days = [6]

    def clear(folder):
        """Delete `folder` (if present) and recreate it empty."""
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)

    # Start from empty folders so results of a previous run cannot leak in.
    clear(args.out_folder)
    clear(args.metric_folder)

    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, argmax_positions)

    # Read the metric from the same folder that was passed to `evaluate`,
    # rather than a separately hard-coded path that could drift out of sync.
    metrics_path = os.path.join(args.metric_folder, "metrics.txt")
    with open(metrics_path) as handler:
        first_line = next(handler)
    return float(first_line.strip().split()[0])

In [4]:
%%time
# Repeat the full baseline run 10 times and collect the metric of each run.
metrics = []
for _ in range(10):
    metrics.append(get_baseline_metric())

"2018-06-21 18:59:17.713411": preprocesing started
"2018-06-21 18:59:23.092224":     base features shape: (142298, 129)
"2018-06-21 18:59:23.123102":     0 features added
"2018-06-21 18:59:23.123202":     base features not included
"2018-06-21 18:59:23.123352":     result shape: (142298, 129)
"2018-06-21 18:59:23.151133": train features shape: (142298, 129)
"2018-06-21 18:59:26.493187": preprocesing finished
"2018-06-21 18:59:26.493289": start training on days [3]
"2018-06-21 18:59:26.493317": using fit without validation
"2018-06-21 19:00:49.373101": built 1000 trees
"2018-06-21 19:01:08.955206":     base features shape: (1565278, 129)
"2018-06-21 19:01:09.599343":     0 features added
"2018-06-21 19:01:09.599424":     base features not included
"2018-06-21 19:01:09.599659":     result shape: (1565278, 129)
"2018-06-21 19:01:09.633277": days to predict: [6]
"2018-06-21 19:01:09.633427": features shape: (1565278, 129)
"2018-06-21 19:01:09.633511": start predicting on day 6
"2018-06-21 

"2018-06-21 19:15:04.280944": preprocesing finished
"2018-06-21 19:15:04.281046": start training on days [3]
"2018-06-21 19:15:04.281217": using fit without validation
"2018-06-21 19:16:23.745924": built 1000 trees
"2018-06-21 19:16:43.926298":     base features shape: (1565278, 129)
"2018-06-21 19:16:44.592875":     0 features added
"2018-06-21 19:16:44.592969":     base features not included
"2018-06-21 19:16:44.593089":     result shape: (1565278, 129)
"2018-06-21 19:16:44.627302": days to predict: [6]
"2018-06-21 19:16:44.627389": features shape: (1565278, 129)
"2018-06-21 19:16:44.627443": start predicting on day 6
"2018-06-21 19:16:58.870396": predictions shape: (142298, 11)
"2018-06-21 19:16:58.870462": saveing results to res/train_3_test_6
"2018-06-21 19:16:58.878675": results saved
"2018-06-21 19:17:08.587881": preprocesing started
"2018-06-21 19:17:13.450621":     base features shape: (142298, 129)
"2018-06-21 19:17:13.464609":     0 features added
"2018-06-21 19:17:13.464686

In [5]:
# Raw per-run metrics, their mean, and the standard error of that mean.
print(metrics)
print(np.mean(metrics))
# Use the actual number of collected runs instead of repeating the run count
# as a literal, so this stays correct if the number of runs changes.
# NOTE(review): np.std defaults to ddof=0 (population std); consider ddof=1
# if a sample-based standard error is intended.
print(np.std(metrics) / np.sqrt(len(metrics)))

[0.620166229080817, 0.6280713781472671, 0.6177104194402271, 0.6194118966362879, 0.6203237210919406, 0.6252721195126272, 0.6186123034567165, 0.6180543019140391, 0.6259028030193531, 0.627145007268213]
0.6220670179567489
0.0012146723190276977


In [6]:
# Load the saved predictions for the train-day-3 / test-day-6 configuration
# from the "res" output folder (presumably left by the last baseline run,
# since each run clears that folder first).
x = np.load("res/train_3_test_6.npy")

In [7]:
# Show the loaded predictions array as the cell output.
x

array([[0.80860991, 0.81449919, 0.81474932, ..., 0.81365689, 0.81416775,
        0.81146317],
       [0.79542404, 0.80147687, 0.8012288 , ..., 0.79878172, 0.80063883,
        0.79449726],
       [0.51033806, 0.51864286, 0.51889299, ..., 0.51931345, 0.51905381,
        0.51559963],
       ...,
       [0.67678093, 0.68261088, 0.682861  , ..., 0.68339279, 0.68340295,
        0.68188452],
       [0.65341639, 0.65908118, 0.65861657, ..., 0.65999   , 0.65903578,
        0.65864688],
       [0.74294024, 0.74832096, 0.74857108, ..., 0.74807761, 0.74858847,
        0.74639247]])

In [8]:
from collections import Counter

# For every row of x find the column index holding the largest value,
# then count how many rows selected each column.
Counter(x.argmax(axis=1))

Counter({4: 94707,
         5: 10265,
         3: 7997,
         7: 10477,
         2: 1780,
         10: 1449,
         9: 7599,
         1: 1811,
         8: 3320,
         0: 2893})