In [1]:
import numpy as np
import shutil
import os

from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lars, ElasticNet, Perceptron, SGDRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

import sys
sys.path.append("../utils/")
sys.path.append("../calculating_predictions")
sys.path.append("../evaluation/")
from calculate_predictions import calculate_predictions
from evaluate import evaluate
from run_evaluation import *

# NOTE(review): not referenced by any cell visible in this notebook; presumably
# the number of base (non-stacked) features — TODO confirm or remove.
FEATURES_NUMBER = 79

# Load IPython's autoreload extension and set mode 2, so that edits to the
# imported project modules are picked up before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [2]:
# Root folder under which every stacking run writes its predictions
# (one sub-folder per run index).
out_folder = "substructed_target_predictions_with_stacked_features"

# Shared configuration object handed to calculate_predictions(); the stacking
# helper defined below adds further per-run attributes to it.
args = Namespace(
    verbose=True,
    first_feature=0,
    last_feature=-1,
    data_folder="../../../data/best_features_days_data/",
    # The `verbose` argument is accepted but ignored: CatBoost is always
    # constructed with verbose=False.
    model_constructor=lambda verbose: CatBoostRegressor(verbose=False),
    type="regression",
    metric_folder="metrics",
    train_days=[3],
    validation_day=None,
    test_days=[6],
)

def clear(folder):
    """Leave ``folder`` as an existing, empty directory.

    An existing directory tree at ``folder`` is removed first, then the
    directory is created again. Missing parent directories are created as
    well, so nested paths such as ``"a/b/c"`` are accepted.

    Args:
        folder: path of the directory to (re)create.
    """
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs rather than mkdir: mkdir raises FileNotFoundError when any
    # parent directory of `folder` does not exist yet.
    os.makedirs(folder)

def get_stacking_metric(i):
    """Run one stacking experiment and return its evaluation metric.

    Points the module-level ``args`` namespace at the inputs of run ``i``
    (the per-day target files to subtract and the additional feature files),
    runs ``calculate_predictions`` and ``evaluate``, and reads the metric
    back from ``metrics.txt`` inside ``args.metric_folder``.

    Side effects: mutates the module-level ``args``, creates
    ``<out_folder>/<i>`` and wipes ``args.metric_folder``.

    Args:
        i: run index; its string form is used as the sub-folder name under
            "targets_to_substruct", "features_models_with_pos" and
            ``out_folder``.

    Returns:
        float: the first whitespace-separated token on the first line of the
        metrics file.

    Raises:
        ValueError: if the first line of the metrics file has no tokens
            (for example, the file is empty).
    """
    run_name = str(i)

    args.labels_to_substruct = {
        day: os.path.join("targets_to_substruct", run_name, "train_2_test_{}.npy".format(day))
        for day in [3, 6]
    }

    # List the folder once instead of once per day. os.listdir returns entries
    # in arbitrary order, so sort to keep the feature order reproducible.
    features_folder = os.path.join("features_models_with_pos", run_name)
    feature_files = [
        os.path.join(features_folder, filename)
        for filename in sorted(os.listdir(features_folder))
    ]
    # Every day gets its own list object, as before.
    args.additional_features = {day: list(feature_files) for day in [2, 3, 6]}

    args.out_folder = os.path.join(out_folder, run_name)
    # Tolerate re-runs and a missing parent folder instead of failing in mkdir.
    os.makedirs(args.out_folder, exist_ok=True)
    clear(args.metric_folder)

    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, argmax_positions)

    # Read from the folder that was passed to evaluate() rather than from a
    # hard-coded "metrics/" path, so changing args.metric_folder keeps working.
    metrics_path = os.path.join(args.metric_folder, "metrics.txt")
    with open(metrics_path) as handler:
        first_line = handler.readline()

    tokens = first_line.split()
    if not tokens:
        raise ValueError("no metric found on the first line of {}".format(metrics_path))
    return float(tokens[0])

In [3]:
# Start from an empty output folder, then run the stacking experiment for
# run indices 0..14 and collect one metric per run.
clear(out_folder)
metrics = list(map(get_stacking_metric, range(15)))

"2018-06-19 17:40:35.937255": preprocesing started
"2018-06-19 17:40:37.192616": train features shape: (20203, 108)
"2018-06-19 17:40:37.771012": preprocesing finished
"2018-06-19 17:40:37.771103": start training on days [3]
"2018-06-19 17:40:37.771246": using fit without validation
"2018-06-19 17:41:00.726714": built 1000 trees
"2018-06-19 17:41:07.340940": days to predict: [6]
"2018-06-19 17:41:07.341028": features shape: (247423, 108)
"2018-06-19 17:41:07.341151": start predicting on day 6
"2018-06-19 17:41:09.193979": predictions shape: (22493, 11)
"2018-06-19 17:41:09.194151": saveing results
"2018-06-19 17:41:09.195654": results saved
"2018-06-19 17:41:09.196840": predictions_filenames: ['substructed_target_predictions_with_stacked_features/0/train_3_test_6.npy']
"2018-06-19 17:41:09.196878": will evaluate days: [6]
"2018-06-19 17:41:09.197069": predict on file "substructed_target_predictions_with_stacked_features/0/train_3_test_6.npy"
"2018-06-19 17:41:11.209210": calculating me

"2018-06-19 17:45:43.684307": built 1000 trees
"2018-06-19 17:45:50.058045": days to predict: [6]
"2018-06-19 17:45:50.058129": features shape: (247423, 108)
"2018-06-19 17:45:50.058149": start predicting on day 6
"2018-06-19 17:45:51.853655": predictions shape: (22493, 11)
"2018-06-19 17:45:51.853761": saveing results
"2018-06-19 17:45:51.855250": results saved
"2018-06-19 17:45:51.856099": predictions_filenames: ['substructed_target_predictions_with_stacked_features/8/train_3_test_6.npy']
"2018-06-19 17:45:51.856129": will evaluate days: [6]
"2018-06-19 17:45:51.856241": predict on file "substructed_target_predictions_with_stacked_features/8/train_3_test_6.npy"
"2018-06-19 17:45:53.836503": calculating metric
"2018-06-19 17:45:53.837483": preprocesing started
"2018-06-19 17:45:55.072314": train features shape: (20203, 108)
"2018-06-19 17:45:55.650008": preprocesing finished
"2018-06-19 17:45:55.650112": start training on days [3]
"2018-06-19 17:45:55.650148": using fit without valida

In [4]:
# Mean metric over the runs and the standard error of that mean. The run count
# is taken from the data instead of a hard-coded 15, so it stays correct if the
# number of runs changes. np.std uses its default ddof=0 here.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6171912369703024, 0.0041290245871292534)

In [5]:
metrics

[0.5945714937589157,
 0.611625413401515,
 0.6193556625308347,
 0.6303071435287768,
 0.6276150842204564,
 0.6056460485143074,
 0.6433548582501483,
 0.5884795171993076,
 0.6173769412529464,
 0.6043822023133558,
 0.6120680769440913,
 0.603987135157599,
 0.6390045054828375,
 0.639914557298759,
 0.620179914700685]

In [7]:
# Load the day-6 predictions written by run 0 for a quick manual inspection.
predictions_path = "substructed_target_predictions_with_stacked_features/0/train_3_test_6.npy"
x = np.load(predictions_path)

In [8]:
# Display the loaded predictions array (NumPy abbreviates large arrays with "...").
x

array([[ 0.02892922,  0.02817755,  0.02744592, ...,  0.02459248,
         0.0224645 ,  0.01943165],
       [ 0.05440555,  0.05462927,  0.05530382, ...,  0.0565974 ,
         0.05669397,  0.07880135],
       [ 0.0712822 ,  0.06913913,  0.06984357, ...,  0.0682687 ,
         0.06750267,  0.06922367],
       ...,
       [-0.04123444, -0.04159965, -0.05203849, ..., -0.05359113,
        -0.05403944, -0.04931654],
       [ 0.10316526,  0.10170456,  0.10121779, ...,  0.0954819 ,
         0.09024726,  0.0539463 ],
       [-0.11752782, -0.11526424, -0.11352466, ..., -0.15025934,
        -0.15038904, -0.18019056]])