In [1]:
import numpy as np
import shutil
import os

from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier

import sys
sys.path.append("../utils/")
sys.path.append("../calculating_predictions")
sys.path.append("../evaluation/")
from calculate_predictions import calculate_predictions
from evaluate import evaluate
from run_evaluation import *

from matplotlib import pyplot as plt

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Root folder under which each run writes its predictions (one subfolder per run).
out_folder = "substructed_target_predictions_with_stacked_features"

# Experiment configuration; built in one go instead of attribute-by-attribute.
args = Namespace(
    verbose=True,
    first_feature=0,
    last_feature=-1,
    data_folder="../../../data/best_features_days_data/",
    # The factory accepts a `verbose` argument but always builds a silent regressor.
    model_constructor=lambda verbose: CatBoostRegressor(verbose=False),
    type="regression",
    need_position_feature=True,
    metric_folder="metrics",
    train_days=[3],
    validation_day=None,
    test_days=[6],
)

def clear(folder):
    """Reset `folder` to an existing, empty directory.

    If something already exists at `folder`, the whole directory tree is
    removed first. Missing parent directories are created, so nested paths
    such as "a/b/c" work even when "a" does not exist yet.

    Parameters
    ----------
    folder : str
        Path of the directory to (re)create.
    """
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs rather than mkdir: mkdir raises FileNotFoundError when any
    # parent directory of `folder` is missing.
    os.makedirs(folder)

def get_stacking_metric(i):
    """Run prediction and evaluation for run index `i` and return its metric.

    Mutates the module-level `args`: sets `labels_to_substruct`,
    `additional_features` and `out_folder` for this run, then calls
    `calculate_predictions(args)` followed by `evaluate(...)`.

    Side effects: creates the directory `<out_folder>/<i>` (raises
    FileExistsError if it already exists) and wipes/recreates
    `args.metric_folder`.

    Parameters
    ----------
    i : int
        Run index; its string form selects the per-run subfolders of the
        label and feature directories and names the output subfolder.

    Returns
    -------
    float
        The first whitespace-separated token on the first line of
        `metrics.txt` inside `args.metric_folder`.

    Raises
    ------
    ValueError
        If the first line of the metrics file is missing or blank, or its
        first token is not a valid float.
    """
    run_id = str(i)

    args.labels_to_substruct = {
        day: os.path.join(
            "classification_targets_to_substruct_max_clicks_5_edited",
            run_id,
            "train_2_test_{}.npy".format(day)
        )
        for day in [3, 6]
    }

    # List the feature directory once (it does not depend on the day) and sort
    # it: os.listdir returns entries in arbitrary order, which would make the
    # order of the added features depend on the filesystem.
    features_dir = os.path.join("features_models_with_pos", run_id)
    feature_paths = [
        os.path.join(features_dir, filename)
        for filename in sorted(os.listdir(features_dir))
    ]
    # Each day gets its own list object, as before, so the lists are not aliased.
    args.additional_features = {day: list(feature_paths) for day in [2, 3, 6]}

    args.out_folder = os.path.join(out_folder, run_id)
    os.mkdir(args.out_folder)
    clear(args.metric_folder)

    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, argmax_positions)

    # Read the result from the same folder that was handed to evaluate() rather
    # than a hard-coded "metrics/" path, so changing args.metric_folder keeps
    # working. Assumes evaluate() writes metrics.txt there -- TODO confirm.
    metrics_path = os.path.join(args.metric_folder, "metrics.txt")
    with open(metrics_path) as handler:
        tokens = handler.readline().split()
    if not tokens:
        raise ValueError(
            "no metric found on the first line of {}".format(metrics_path)
        )
    return float(tokens[0])

In [3]:
# Start from an empty output root, then collect one metric per run index 0..14.
clear(out_folder)
metrics = list(map(get_stacking_metric, range(15)))

"2018-06-20 00:41:40.351933": preprocesing started
"2018-06-20 00:41:41.114171":     base features shape: (20203, 80)
"2018-06-20 00:41:42.170437":     28 features added
"2018-06-20 00:41:42.170551":     result shape: (20203, 108)
"2018-06-20 00:41:42.174259": train features shape: (20203, 108)
"2018-06-20 00:41:42.753373": preprocesing finished
"2018-06-20 00:41:42.753472": start training on days [3]
"2018-06-20 00:41:42.753570": using fit without validation
"2018-06-20 00:42:05.062526": built 1000 trees
"2018-06-20 00:42:07.164793":     base features shape: (247423, 80)
"2018-06-20 00:42:11.683578":     28 features added
"2018-06-20 00:42:11.683667":     result shape: (247423, 108)
"2018-06-20 00:42:11.684046": days to predict: [6]
"2018-06-20 00:42:11.684076": features shape: (247423, 108)
"2018-06-20 00:42:11.684091": start predicting on day 6
"2018-06-20 00:42:13.494004": predictions shape: (22493, 11)
"2018-06-20 00:42:13.494188": saveing results to substructed_target_predictions

"2018-06-20 00:45:11.796220": calculating metric
"2018-06-20 00:45:11.834143": preprocesing started
"2018-06-20 00:45:12.591276":     base features shape: (20203, 80)
"2018-06-20 00:45:13.571161":     28 features added
"2018-06-20 00:45:13.571265":     result shape: (20203, 108)
"2018-06-20 00:45:13.573143": train features shape: (20203, 108)
"2018-06-20 00:45:14.168075": preprocesing finished
"2018-06-20 00:45:14.168161": start training on days [3]
"2018-06-20 00:45:14.168182": using fit without validation
"2018-06-20 00:45:36.434401": built 1000 trees
"2018-06-20 00:45:38.528777":     base features shape: (247423, 80)
"2018-06-20 00:45:43.155475":     28 features added
"2018-06-20 00:45:43.155585":     result shape: (247423, 108)
"2018-06-20 00:45:43.155963": days to predict: [6]
"2018-06-20 00:45:43.155990": features shape: (247423, 108)
"2018-06-20 00:45:43.156005": start predicting on day 6
"2018-06-20 00:45:44.951807": predictions shape: (22493, 11)
"2018-06-20 00:45:44.951967": 

"2018-06-20 00:48:47.156564": calculating metric
"2018-06-20 00:48:47.195445": preprocesing started
"2018-06-20 00:48:47.939464":     base features shape: (20203, 80)
"2018-06-20 00:48:49.054298":     28 features added
"2018-06-20 00:48:49.054439":     result shape: (20203, 108)
"2018-06-20 00:48:49.055924": train features shape: (20203, 108)
"2018-06-20 00:48:49.715013": preprocesing finished
"2018-06-20 00:48:49.715106": start training on days [3]
"2018-06-20 00:48:49.715124": using fit without validation
"2018-06-20 00:49:12.139435": built 1000 trees
"2018-06-20 00:49:14.244970":     base features shape: (247423, 80)
"2018-06-20 00:49:19.470462":     28 features added
"2018-06-20 00:49:19.470550":     result shape: (247423, 108)
"2018-06-20 00:49:19.471155": days to predict: [6]
"2018-06-20 00:49:19.471185": features shape: (247423, 108)
"2018-06-20 00:49:19.471200": start predicting on day 6
"2018-06-20 00:49:21.319686": predictions shape: (22493, 11)
"2018-06-20 00:49:21.319786": 

In [4]:
# Mean metric and its standard error over the runs. The denominator uses the
# actual number of collected metrics instead of a hard-coded 15, so the value
# stays correct if the number of runs changes.
# NOTE(review): np.std defaults to ddof=0 (population std); consider ddof=1
# for an unbiased standard error -- left unchanged to keep the reported value.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6192930479413834, 0.004138098368460119)

In [5]:
# Display the per-run metric values, in run-index order.
metrics

[0.617743291912716,
 0.6096255003872714,
 0.6116412120701029,
 0.6027844755958481,
 0.6342387421981568,
 0.6453594978464285,
 0.6201429622780408,
 0.6543979831210561,
 0.5936527401038191,
 0.6205416810025397,
 0.6225770455947988,
 0.5999984828646345,
 0.606823481516028,
 0.6287304765755546,
 0.6211381460537566]