In [1]:
import numpy as np
import shutil
import os

import sys
sys.path.append("../utils/")
sys.path.append("../calculating_predictions")
sys.path.append("../evaluation/")
from calculate_predictions import calculate_predictions
from evaluate import evaluate
from run_evaluation import *
from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier

%reload_ext autoreload
%autoreload 2

In [2]:
def clear(folder):
    """Reset ``folder`` to an empty directory.

    If something already exists at ``folder`` the whole tree is removed with
    ``shutil.rmtree``; the directory is then created afresh.

    Parameters
    ----------
    folder : str
        Path of the directory to (re)create. It may be nested: missing parent
        directories are created as well, so ``clear("a/b")`` works even when
        ``a`` does not exist yet.
    """
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs (not mkdir) so a nested target does not fail on a missing parent.
    os.makedirs(folder)

# Root directory under which each run stores its predictions (one sub-folder per run).
out_folder = "regression_predictions"

# Settings shared by every run in this notebook. The namespace is handed to
# calculate_predictions(); data_folder, metric_folder and verbose are also
# forwarded to evaluate(). Per-run fields are filled in later by
# get_regression_stacking_metric().
args = Namespace(
    verbose=True,
    first_feature=0,
    last_feature=-1,
    data_folder="../../../data/new_days_data_best_features/",
    type="regression",
    need_position_feature=True,
    labels_to_substruct=None,
    metric_folder="metrics",
)

def get_regression_stacking_metric(i):
    """Run one stacking experiment for feature set ``i`` and return its metric.

    The module-level ``args`` namespace is updated in place (model
    constructor, additional features, train/test days, output folder). The
    run's output folder and the metric folder are wiped, then
    ``calculate_predictions(args)`` and ``evaluate(...)`` are executed.

    Parameters
    ----------
    i : int or str
        Name of the sub-folder of ``features_models_with_double_pos`` whose
        files are used as additional features. It is also the name of the
        sub-folder of ``out_folder`` used for this run.

    Returns
    -------
    float
        The first whitespace-separated token on the first line of
        ``metrics.txt`` inside ``args.metric_folder`` (the file is expected
        to be produced by ``evaluate`` — not visible from this notebook).
    """
    features_folder = os.path.join("features_models_with_double_pos", str(i))
    # os.listdir returns entries in arbitrary order; sorting keeps the feature
    # column order identical from run to run and from machine to machine.
    feature_files = [
        os.path.join(features_folder, filename)
        for filename in sorted(os.listdir(features_folder))
    ]

    # The constructor accepts a `verbose` argument but always builds a silent model.
    args.model_constructor = lambda verbose: CatBoostRegressor(verbose=False)
    # Same file list for the train day (3) and the test day (6); each day gets its own copy.
    args.additional_features = {day: list(feature_files) for day in [3, 6]}
    args.validation_day = None
    args.train_days = [3]
    args.test_days = [6]

    args.out_folder = os.path.join(out_folder, str(i))

    # Start from empty folders so results of earlier runs cannot be picked up.
    clear(args.out_folder)
    clear(args.metric_folder)

    calculate_predictions(args)
    evaluate(
        args.out_folder,
        args.data_folder,
        args.metric_folder,
        argmax_positions,
        verbose=args.verbose
    )

    # Read the metric from the folder that was actually passed to evaluate(),
    # rather than from a hard-coded "metrics" path.
    with open(os.path.join(args.metric_folder, "metrics.txt")) as handler:
        first_line = next(handler)
    return float(first_line.strip().split()[0])

In [3]:
# Wipe the predictions root before the experiments so no stale run folders remain.
# Uses out_folder (defined with the shared settings) instead of repeating the
# literal, so this reset always targets the directory the runs write into.
clear(out_folder)

In [4]:
%%time
# Experiment 1: base features switched off (the run log reports
# "base features are not included"). One run per feature folder 0, 1, 2.
args.add_base_features = False
metrics = list(map(get_regression_stacking_metric, range(3)))

"2018-06-21 19:45:46.706814": preprocesing started
"2018-06-21 19:45:51.966597":     base features shape: (142298, 129)
"2018-06-21 19:46:01.428115":     28 features added
"2018-06-21 19:46:01.428337":     base features are not included
"2018-06-21 19:46:01.428477":     result shape: (142298, 28)
"2018-06-21 19:46:01.432698": train features shape: (142298, 28)
"2018-06-21 19:46:04.762054": preprocesing finished
"2018-06-21 19:46:04.762149": start training on days [3]
"2018-06-21 19:46:04.762175": using fit without validation
"2018-06-21 19:46:33.029647": built 1000 trees
"2018-06-21 19:46:53.807271":     base features shape: (1565278, 129)
"2018-06-21 19:47:50.387774":     28 features added
"2018-06-21 19:47:50.387877":     base features are not included
"2018-06-21 19:47:50.388041":     result shape: (1565278, 28)
"2018-06-21 19:47:50.446373": days to predict: [6]
"2018-06-21 19:47:50.446529": features shape: (1565278, 28)
"2018-06-21 19:47:50.446614": start predicting on day 6
"2018-

In [5]:
# Mean metric over the runs and its standard error. The sample size is taken
# from the data instead of a hard-coded 3, so the figure stays right if the
# number of runs changes.
# NOTE(review): np.std defaults to ddof=0 (population std); pass ddof=1 if the
# sample-based standard error is what is wanted.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6110096458857548, 0.003096590463097724)

In [6]:
# Load the array saved in the output folder of run 0.
# NOTE(review): presumably the day-6 predictions of the model trained on day 3,
# written by calculate_predictions — confirm the file naming there.
x = np.load("regression_predictions/0/train_3_test_6.npy")

In [7]:
# Display the loaded array (NumPy abbreviates large arrays with "...").
x

array([[0.7327358 , 0.74666235, 0.76281743, ..., 0.75523406, 0.77205019,
        0.7408064 ],
       [0.90794847, 0.90400326, 0.87659188, ..., 0.86470297, 0.87037088,
        0.83443899],
       [0.56054619, 0.57023459, 0.56721917, ..., 0.56735336, 0.58177867,
        0.57183671],
       ...,
       [0.66126011, 0.66018256, 0.65404777, ..., 0.66358035, 0.66192092,
        0.69124051],
       [0.61031019, 0.62050464, 0.62183348, ..., 0.63932921, 0.6419518 ,
        0.64956151],
       [0.76533276, 0.78057517, 0.78178022, ..., 0.78897559, 0.81483373,
        0.77021311]])

In [8]:
from collections import Counter

# For every row of x take the index of its largest column, then count how
# often each index wins.
# NOTE(review): columns presumably correspond to candidate positions — confirm.
Counter(x.argmax(axis=1))

Counter({9: 17076,
         0: 17393,
         4: 19018,
         7: 5654,
         5: 4940,
         10: 42018,
         2: 7451,
         1: 8778,
         3: 6871,
         6: 5545,
         8: 7554})

In [9]:
%%time
# Experiment 2: base features switched on (the run log reports a 157-column
# feature matrix = 129 base + 28 added). One run per feature folder 0, 1, 2.
# Note that this overwrites `metrics` and the run folders from the previous experiment.
args.add_base_features = True
metrics = list(map(get_regression_stacking_metric, range(3)))

"2018-06-21 19:52:56.880167": preprocesing started
"2018-06-21 19:53:01.739304":     base features shape: (142298, 129)
"2018-06-21 19:53:07.674397":     28 features added
"2018-06-21 19:53:07.674476":     base features not included
"2018-06-21 19:53:07.674603":     result shape: (142298, 157)
"2018-06-21 19:53:07.689131": train features shape: (142298, 157)
"2018-06-21 19:53:10.780403": preprocesing finished
"2018-06-21 19:53:10.780499": start training on days [3]
"2018-06-21 19:53:10.780611": using fit without validation
"2018-06-21 19:54:40.104780": built 1000 trees
"2018-06-21 19:55:00.701501":     base features shape: (1565278, 129)
"2018-06-21 19:55:58.923520":     28 features added
"2018-06-21 19:55:58.923617":     base features not included
"2018-06-21 19:55:58.923656":     result shape: (1565278, 157)
"2018-06-21 19:55:58.991771": days to predict: [6]
"2018-06-21 19:55:58.991845": features shape: (1565278, 157)
"2018-06-21 19:55:58.991869": start predicting on day 6
"2018-06-2

In [10]:
# Mean metric over the runs and its standard error. The sample size is taken
# from the data instead of a hard-coded 3, so the figure stays right if the
# number of runs changes.
# NOTE(review): np.std defaults to ddof=0 (population std); pass ddof=1 if the
# sample-based standard error is what is wanted.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6169320462145124, 0.002248081820030615)

In [11]:
# Reload the array from the output folder of run 0. That folder is wiped by
# clear() at the start of every run, so after the preceding cell this should
# hold the results obtained with add_base_features=True.
# NOTE(review): file presumably written by calculate_predictions — confirm.
x = np.load("regression_predictions/0/train_3_test_6.npy")

In [12]:
# Display the reloaded array (NumPy abbreviates large arrays with "...").
x

array([[0.75252491, 0.759719  , 0.75052672, ..., 0.74669501, 0.75526107,
        0.74543769],
       [0.82410894, 0.803911  , 0.80899186, ..., 0.79466424, 0.80096013,
        0.79083266],
       [0.5481255 , 0.55670771, 0.55041139, ..., 0.55723459, 0.57044716,
        0.56498824],
       ...,
       [0.66695584, 0.67313447, 0.67066288, ..., 0.67483936, 0.67366297,
        0.67252227],
       [0.60318267, 0.60992946, 0.60949032, ..., 0.61466861, 0.61510167,
        0.6107557 ],
       [0.76372882, 0.77123028, 0.77096348, ..., 0.77314285, 0.79407453,
        0.77207266]])

In [13]:
from collections import Counter

# For every row of x take the index of its largest column, then count how
# often each index wins.
# NOTE(review): columns presumably correspond to candidate positions — confirm.
Counter(x.argmax(axis=1))

Counter({1: 8825,
         0: 7741,
         4: 53383,
         10: 31330,
         5: 3117,
         9: 15275,
         3: 4717,
         2: 4793,
         6: 4167,
         7: 3992,
         8: 4958})