In [1]:
import numpy as np
import shutil
import os

import sys
sys.path.append("../utils/")
sys.path.append("../calculating_predictions")
sys.path.append("../evaluation/")
from calculate_predictions import calculate_predictions
from evaluate import evaluate
from run_evaluation import *
from argparse import Namespace
from catboost import CatBoostRegressor
from constants import FEATURES_NUMBER

%reload_ext autoreload
%autoreload 2

In [20]:
def get_baseline_metric():
    """Run one train / predict / evaluate cycle and return the resulting metric.

    Builds the ``Namespace`` consumed by ``calculate_predictions``, wipes and
    recreates the output and metric folders, runs ``calculate_predictions``
    followed by ``evaluate``, and reads the metric back from ``metrics.txt``
    inside the metric folder.

    Returns:
        float: the first whitespace-separated token on the first line of
        ``<metric_folder>/metrics.txt``.

    Raises:
        ValueError: if the first line of the metrics file is empty or blank,
            or its first token is not a valid float.
    """
    args = Namespace()
    args.verbose = True
    args.first_feature = 0
    args.last_feature = FEATURES_NUMBER
    args.data_folder = "../../../data/new_days_data_best_features"
    args.out_folder = "res"
    args.type = "regression"
    args.position_features_num = 2
    args.labels_to_substruct = None
    args.metric_folder = "metrics"
    # The ``verbose`` argument supplied by the caller is ignored here:
    # CatBoost's own logging is always switched off.
    args.model_constructor = lambda verbose: CatBoostRegressor(verbose=False)
    args.additional_features = None
    args.add_base_features = True
    args.train_days = [2]
    args.validation_day = None
    args.test_days = [3, 6]

    def clear(folder):
        """Delete ``folder`` (if present) and recreate it empty."""
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.makedirs(folder)

    # Start from empty folders so results of a previous run cannot leak in.
    clear(args.out_folder)
    clear(args.metric_folder)

    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, argmax_positions)

    # Derive the path from args.metric_folder instead of repeating the literal,
    # so changing the folder above cannot silently read a stale file.
    metrics_path = os.path.join(args.metric_folder, "metrics.txt")
    with open(metrics_path) as handler:
        tokens = handler.readline().split()
    if not tokens:
        raise ValueError("no metric found on the first line of " + metrics_path)
    return float(tokens[0])

In [3]:
%%time
# Repeat the full train / predict / evaluate cycle ten times, keeping every metric.
metrics = [get_baseline_metric() for _ in range(10)]

"2018-06-22 19:30:40.047630": preprocesing started
"2018-06-22 19:30:44.865113":     base features shape: (142298, 129)
"2018-06-22 19:30:44.887234":     0 features added
"2018-06-22 19:30:44.887290":     base features not included
"2018-06-22 19:30:44.887310":     result shape: (142298, 129)
"2018-06-22 19:30:44.909677": train features shape: (142298, 129)
"2018-06-22 19:30:47.988030": preprocesing finished
"2018-06-22 19:30:47.988144": start training on days [3]
"2018-06-22 19:30:47.988176": using fit without validation
"2018-06-22 19:31:57.334571": built 1000 trees
"2018-06-22 19:31:57.334740": saving model in path "tmp_819962/tmp_model_trained_on_3"
"2018-06-22 19:32:16.012603":     base features shape: (1565278, 129)
"2018-06-22 19:32:16.270173":     0 features added
"2018-06-22 19:32:16.270256":     base features not included
"2018-06-22 19:32:16.270281":     result shape: (1565278, 129)
"2018-06-22 19:32:16.275095": days to predict: [6]
"2018-06-22 19:32:16.275257": loaded model

"2018-06-22 19:42:43.731183": preprocesing finished
"2018-06-22 19:42:43.731260": start training on days [3]
"2018-06-22 19:42:43.731276": using fit without validation
"2018-06-22 19:43:52.735193": built 1000 trees
"2018-06-22 19:43:52.735297": saving model in path "tmp_876525/tmp_model_trained_on_3"
"2018-06-22 19:44:10.335462":     base features shape: (1565278, 129)
"2018-06-22 19:44:10.573837":     0 features added
"2018-06-22 19:44:10.573910":     base features not included
"2018-06-22 19:44:10.573932":     result shape: (1565278, 129)
"2018-06-22 19:44:10.578051": days to predict: [6]
"2018-06-22 19:44:10.578106": loaded model from: tmp_876525/tmp_model_trained_on_3
"2018-06-22 19:44:10.578126": features shape: (1565278, 129)
"2018-06-22 19:44:10.578143": start predicting on day 6
"2018-06-22 19:44:24.098105": predictions shape: (142298, 11)
"2018-06-22 19:44:24.098284": saveing results to res/train_3_test_6
"2018-06-22 19:44:24.106316": results saved
"2018-06-22 19:44:32.937495"

In [5]:
# Report the per-run metrics, their mean, and the spread of the mean.
print(metrics)
print(np.mean(metrics))
# Divide by the square root of the actual number of runs rather than a
# hard-coded 10, so the figure stays right if the repeat count changes.
# np.std uses its default ddof=0 here.
print(np.std(metrics) / np.sqrt(len(metrics)))

[0.6171951279776535, 0.6052403451762278, 0.6135798226970322, 0.602767283139525, 0.6159323803357952, 0.6151203880760485, 0.6195370765654714, 0.6117425730490287, 0.6176295581403932, 0.6028324634437726]
0.6121577018600949
0.0018920422026448597


In [6]:
# Load the prediction array written by the runs above (the log reports saving
# results to res/train_3_test_6). get_baseline_metric wipes the res folder on
# every call, so this has to be read before another run is started.
x = np.load("res/train_3_test_6.npy")

In [7]:
# Display the loaded predictions; NumPy abbreviates large arrays in the repr.
x

array([[0.82632447, 0.82959779, 0.82983779, ..., 0.82990565, 0.82990565,
        0.8297502 ],
       [0.71998153, 0.71871054, 0.71895054, ..., 0.7190184 , 0.7190184 ,
        0.71687679],
       [0.52731602, 0.53249977, 0.53273977, ..., 0.5327747 , 0.5327747 ,
        0.53261925],
       ...,
       [0.67903581, 0.68255032, 0.68279031, ..., 0.68302456, 0.68302456,
        0.68286911],
       [0.65738812, 0.66090263, 0.66114263, ..., 0.66132716, 0.66132716,
        0.66241094],
       [0.73025359, 0.73348498, 0.73372498, ..., 0.7339095 , 0.7339095 ,
        0.73375405]])

In [10]:
%%time
# Repeat the full train / predict / evaluate cycle ten times, keeping every metric.
# Note: this rebinds `metrics`, replacing the values from the earlier batch of runs.
metrics = [get_baseline_metric() for _ in range(10)]

"2018-06-22 19:52:57.139559": preprocesing started
"2018-06-22 19:53:01.748350":     base features shape: (142298, 130)
"2018-06-22 19:53:01.761046":     0 features added
"2018-06-22 19:53:01.761107":     base features not included
"2018-06-22 19:53:01.761254":     result shape: (142298, 130)
"2018-06-22 19:53:01.773297": train features shape: (142298, 130)
"2018-06-22 19:53:04.893488": preprocesing finished
"2018-06-22 19:53:04.893572": start training on days [3]
"2018-06-22 19:53:04.893673": using fit without validation
"2018-06-22 19:54:21.885890": built 1000 trees
"2018-06-22 19:54:21.886001": saving model in path "tmp_69994/tmp_model_trained_on_3"
"2018-06-22 19:54:40.567241":     base features shape: (1565278, 130)
"2018-06-22 19:54:40.819629":     0 features added
"2018-06-22 19:54:40.819705":     base features not included
"2018-06-22 19:54:40.819729":     result shape: (1565278, 130)
"2018-06-22 19:54:40.823979": days to predict: [6]
"2018-06-22 19:54:40.824220": loaded model 

"2018-06-22 20:05:12.719782": preprocesing finished
"2018-06-22 20:05:12.719892": start training on days [3]
"2018-06-22 20:05:12.719910": using fit without validation
"2018-06-22 20:06:22.209992": built 1000 trees
"2018-06-22 20:06:22.210088": saving model in path "tmp_278659/tmp_model_trained_on_3"
"2018-06-22 20:06:39.969935":     base features shape: (1565278, 130)
"2018-06-22 20:06:40.210719":     0 features added
"2018-06-22 20:06:40.210830":     base features not included
"2018-06-22 20:06:40.210852":     result shape: (1565278, 130)
"2018-06-22 20:06:40.215076": days to predict: [6]
"2018-06-22 20:06:40.215210": loaded model from: tmp_278659/tmp_model_trained_on_3
"2018-06-22 20:06:40.215295": features shape: (1565278, 130)
"2018-06-22 20:06:40.215350": start predicting on day 6
"2018-06-22 20:06:53.942077": predictions shape: (142298, 11)
"2018-06-22 20:06:53.942147": saveing results to res/train_3_test_6
"2018-06-22 20:06:53.950368": results saved
"2018-06-22 20:07:02.820219"

In [11]:
# Report the per-run metrics, their mean, and the spread of the mean.
print(metrics)
print(np.mean(metrics))
# Divide by the square root of the actual number of runs rather than a
# hard-coded 10, so the figure stays right if the repeat count changes.
# np.std uses its default ddof=0 here.
print(np.std(metrics) / np.sqrt(len(metrics)))

[0.617366604427115, 0.6210247440318585, 0.6201406122976244, 0.6191444372160907, 0.6180949507470894, 0.619210432068342, 0.6166916296417266, 0.6136196499287719, 0.6253125652735294, 0.6257398689194367]
0.6196345494551585
0.0011156941681921139


In [12]:
# Load the prediction array written by the most recent batch of runs (the log
# reports saving results to res/train_3_test_6). get_baseline_metric wipes the
# res folder on every call, so this has to be read before another run is started.
y = np.load("res/train_3_test_6.npy")

In [13]:
# Display the loaded predictions; NumPy abbreviates large arrays in the repr.
y

array([[0.79992477, 0.81497371, 0.8151054 , ..., 0.81367537, 0.81397214,
        0.80905637],
       [0.73954984, 0.74204288, 0.74217457, ..., 0.74173845, 0.74262722,
        0.73778534],
       [0.50757537, 0.51495255, 0.51508425, ..., 0.51514589, 0.51604789,
        0.51301063],
       ...,
       [0.68153621, 0.692112  , 0.69279845, ..., 0.69230533, 0.69460895,
        0.690685  ],
       [0.65332973, 0.66012121, 0.66000773, ..., 0.659827  , 0.66163067,
        0.65797779],
       [0.72851919, 0.73570857, 0.73559508, ..., 0.73527473, 0.7350019 ,
        0.73342544]])

In [14]:
# Persist both prediction arrays under predictions/ (np.save appends ".npy").
# np.save does not create missing parent folders, so make sure the target
# directory exists first; otherwise a fresh checkout fails here.
if not os.path.isdir("predictions"):
    os.makedirs("predictions")
np.save("predictions/single_pos", x)
np.save("predictions/double_pos", y)

In [21]:
# Single run with the current definition of get_baseline_metric; the returned
# value is kept in `metric`, and the prediction files are left in the res folder.
metric = get_baseline_metric()

"2018-06-22 20:32:45.006932": preprocesing started
"2018-06-22 20:32:49.948832":     base features shape: (142298, 130)
"2018-06-22 20:32:49.961571":     0 features added
"2018-06-22 20:32:49.961634":     base features not included
"2018-06-22 20:32:49.961658":     result shape: (142298, 130)
"2018-06-22 20:32:49.973659": train features shape: (142298, 130)
"2018-06-22 20:32:53.052009": preprocesing finished
"2018-06-22 20:32:53.052108": start training on days [2]
"2018-06-22 20:32:53.052268": using fit without validation
"2018-06-22 20:34:01.278266": built 1000 trees
"2018-06-22 20:34:01.278356": saving model in path "tmp_690415/tmp_model_trained_on_2"
"2018-06-22 20:34:18.987329":     base features shape: (1565278, 130)
"2018-06-22 20:34:19.239780":     0 features added
"2018-06-22 20:34:19.239870":     base features not included
"2018-06-22 20:34:19.240370":     result shape: (1565278, 130)
"2018-06-22 20:34:36.202310":     base features shape: (1565278, 130)
"2018-06-22 20:34:36.43

In [22]:
# Copy the prediction arrays of the latest run out of res/ (which
# get_baseline_metric wipes on every call) into predictions/.
# The file names presumably encode train day 2 and test days 3 and 6 - confirm
# against calculate_predictions.
tmp3 = np.load("res/train_2_test_3.npy")
tmp6 = np.load("res/train_2_test_6.npy")
# np.save does not create missing parent folders, so make sure the target
# directory exists before writing.
if not os.path.isdir("predictions"):
    os.makedirs("predictions")
np.save("predictions/double_pos_3", tmp3)
np.save("predictions/double_pos_6", tmp6)