In [1]:
import numpy as np
import shutil
import os

from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier

import sys
sys.path.append("../utils/")
sys.path.append("../calculating_predictions")
sys.path.append("../evaluation/")
from calculate_predictions import calculate_predictions
from evaluate import evaluate
from run_evaluation import *

from matplotlib import pyplot as plt

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Root folder that receives one sub-folder of predictions per run.
out_folder = "substructed_target_predictions"

# Settings object handed to calculate_predictions; built in a single call so
# that every option is visible in one place.
args = Namespace(
    verbose=True,
    first_feature=0,
    last_feature=-1,
    data_folder="../../../data/best_features_days_data/",
    # The `verbose` argument is accepted but ignored: the CatBoost model is
    # always constructed with verbose=False.
    model_constructor=lambda verbose: CatBoostRegressor(verbose=False),
    type="regression",
    need_position_feature=True,
    metric_folder="metrics",
    additional_features=None,
    train_days=[3],
    validation_day=None,
    test_days=[6],
)

def clear(folder):
    """Reset ``folder`` to an empty directory.

    If something already exists at ``folder`` the whole directory tree is
    removed first; the directory is then created again. Missing parent
    directories are created as well, so a nested path such as ``"a/b/c"``
    works even when ``"a/b"`` does not exist yet.

    Args:
        folder: Path of the directory to (re)create.
    """
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs rather than mkdir: mkdir raises FileNotFoundError when any
    # parent directory of `folder` is missing.
    os.makedirs(folder)

def get_stacking_metric(i):
    """Run prediction and evaluation for run ``i`` and return its metric.

    Points ``args.labels_to_substruct`` at the ``.npy`` files stored under
    sub-folder ``str(i)`` of the classification-targets folder, calls
    ``calculate_predictions`` and ``evaluate``, then reads the metric back
    from ``metrics.txt`` inside ``args.metric_folder``.

    Side effects:
        * mutates the module-level ``args`` (``labels_to_substruct`` and
          ``out_folder``);
        * recreates ``<out_folder>/<i>`` as an empty directory, so the
          function can be re-run for the same ``i`` without leftovers;
        * wipes ``args.metric_folder`` before every run.

    Args:
        i: Run index; used as the sub-folder name for both the input label
            files and the output predictions.

    Returns:
        float: the first whitespace-separated token on the first line of the
        metrics file.
    """
    args.labels_to_substruct = {
        day: os.path.join(
            "classification_targets_to_substruct_max_clicks_5_edited",
            str(i),
            "train_2_test_{}.npy".format(day)
        )
        # NOTE(review): presumably these days must cover args.train_days and
        # args.test_days (currently [3] and [6]) - confirm before changing either.
        for day in [3, 6]
    }

    args.out_folder = os.path.join(out_folder, str(i))
    # Recreate instead of a bare mkdir: a plain mkdir raises FileExistsError
    # when this run index has already been executed, and reusing the old
    # folder could leave stale prediction files for evaluate() to pick up.
    clear(args.out_folder)
    clear(args.metric_folder)

    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, argmax_positions)

    # Build the path from args.metric_folder so it stays in sync with the
    # folder that was just passed to evaluate().
    metrics_path = os.path.join(args.metric_folder, "metrics.txt")
    with open(metrics_path) as handler:
        return float(next(handler).strip().split()[0])

In [3]:
# Start from an empty output root, then collect one metric per run index 0..14.
clear(out_folder)
metrics = list(map(get_stacking_metric, range(15)))

"2018-06-20 00:23:43.584034": preprocesing started
"2018-06-20 00:23:44.344154":     base features shape: (20203, 80)
"2018-06-20 00:23:44.347148":     0 features added
"2018-06-20 00:23:44.347224":     result shape: (20203, 80)
"2018-06-20 00:23:44.349866": train features shape: (20203, 80)
"2018-06-20 00:23:44.933449": preprocesing finished
"2018-06-20 00:23:44.933538": start training on days [3]
"2018-06-20 00:23:44.933707": using fit without validation
"2018-06-20 00:24:02.789073": built 1000 trees
"2018-06-20 00:24:04.825129":     base features shape: (247423, 80)
"2018-06-20 00:24:04.837882":     0 features added
"2018-06-20 00:24:04.837981":     result shape: (247423, 80)
"2018-06-20 00:24:04.838419": days to predict: [6]
"2018-06-20 00:24:04.838555": features shape: (247423, 80)
"2018-06-20 00:24:04.838629": start predicting on day 6
"2018-06-20 00:24:06.302407": predictions shape: (22493, 11)
"2018-06-20 00:24:06.302552": saveing results to substructed_target_predictions/0/tra

"2018-06-20 00:26:34.307576":     base features shape: (20203, 80)
"2018-06-20 00:26:34.308865":     0 features added
"2018-06-20 00:26:34.308909":     result shape: (20203, 80)
"2018-06-20 00:26:34.310303": train features shape: (20203, 80)
"2018-06-20 00:26:34.936340": preprocesing finished
"2018-06-20 00:26:34.936442": start training on days [3]
"2018-06-20 00:26:34.936537": using fit without validation
"2018-06-20 00:26:54.208788": built 1000 trees
"2018-06-20 00:26:56.243592":     base features shape: (247423, 80)
"2018-06-20 00:26:56.256079":     0 features added
"2018-06-20 00:26:56.256128":     result shape: (247423, 80)
"2018-06-20 00:26:56.256442": days to predict: [6]
"2018-06-20 00:26:56.256471": features shape: (247423, 80)
"2018-06-20 00:26:56.256485": start predicting on day 6
"2018-06-20 00:26:57.617835": predictions shape: (22493, 11)
"2018-06-20 00:26:57.618148": saveing results to substructed_target_predictions/6/train_3_test_6
"2018-06-20 00:26:57.620640": results s

"2018-06-20 00:29:06.073190":     base features shape: (20203, 80)
"2018-06-20 00:29:06.074431":     0 features added
"2018-06-20 00:29:06.074484":     result shape: (20203, 80)
"2018-06-20 00:29:06.075642": train features shape: (20203, 80)
"2018-06-20 00:29:06.656537": preprocesing finished
"2018-06-20 00:29:06.656638": start training on days [3]
"2018-06-20 00:29:06.656731": using fit without validation
"2018-06-20 00:29:25.110563": built 1000 trees
"2018-06-20 00:29:27.071128":     base features shape: (247423, 80)
"2018-06-20 00:29:27.083536":     0 features added
"2018-06-20 00:29:27.083581":     result shape: (247423, 80)
"2018-06-20 00:29:27.083886": days to predict: [6]
"2018-06-20 00:29:27.083912": features shape: (247423, 80)
"2018-06-20 00:29:27.083927": start predicting on day 6
"2018-06-20 00:29:28.459260": predictions shape: (22493, 11)
"2018-06-20 00:29:28.459400": saveing results to substructed_target_predictions/12/train_3_test_6
"2018-06-20 00:29:28.461139": results 

In [4]:
# Mean metric over all runs and the standard error of that mean
# (std / sqrt(number of runs)). The run count is taken from the data instead
# of being hard-coded, so it stays correct if the number of runs changes.
# NOTE: np.std uses ddof=0 by default (population std); pass ddof=1 if the
# sample-based standard error is wanted.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6142251630831954, 0.004043263082383379)

In [5]:
# Show the individual per-run metric values collected by get_stacking_metric.
metrics

[0.6179361066530301,
 0.5981686362505645,
 0.603525450713837,
 0.5967365178952907,
 0.625215597635857,
 0.6114059823117595,
 0.5947542701444105,
 0.6512354080382206,
 0.6352721149867452,
 0.6163701616277476,
 0.5970557977130345,
 0.6257605455858035,
 0.6029434169425318,
 0.6126024540392578,
 0.6243949857098403]