In [5]:
import numpy as np
import shutil
import os

import sys
sys.path.append("../utils/")
sys.path.append("../calculating_predictions")
sys.path.append("../evaluation/")
from calculate_predictions import calculate_predictions
from evaluate import evaluate
from run_evaluation import *
from argparse import Namespace
from catboost import CatBoostRegressor, CatBoostClassifier

%reload_ext autoreload
%autoreload 2

In [7]:
def clear(folder):
    """Reset ``folder`` to an empty directory.

    If something already exists at ``folder`` the whole tree is removed with
    ``shutil.rmtree``; the directory is then created again from scratch.

    Parameters
    ----------
    folder : str
        Path of the directory to (re)create. May be nested: missing parent
        directories are created as well.
    """
    if os.path.exists(folder):
        shutil.rmtree(folder)
    # makedirs (not mkdir) so that nested targets such as "<out_folder>/<i>"
    # work even when the parent directory does not exist yet.
    os.makedirs(folder)

def get_classification_stacking_metric(i, max_clicks=3):
    """Run one classification stacking experiment and return its metric.

    Builds the ``args`` namespace consumed by ``calculate_predictions``,
    using every file found in ``features_models_with_pos/<i>`` as additional
    features for days 3 and 6, with ``train_days = [3]`` and
    ``test_days = [6]``. Predictions are then scored with ``evaluate`` and the
    metric is read back from ``<metric_folder>/metrics.txt``.

    Parameters
    ----------
    i : int
        Index of the feature sub-folder inside ``features_models_with_pos``;
        also used as the sub-folder name for this run's predictions.
    max_clicks : int, optional
        Value stored in ``args.max_clicks`` and used in the output folder
        name. Defaults to 3, which reproduces the previously hard-coded
        configuration.

    Returns
    -------
    float
        First whitespace-separated token on the first line of
        ``metrics.txt``, converted to ``float``.

    Notes
    -----
    Both the output folder for this run and the shared metric folder are
    wiped (via ``clear``) before the run starts.
    """
    features_folder = os.path.join("features_models_with_pos", str(i))
    # os.listdir returns entries in arbitrary order; list once and sort so the
    # additional feature columns are in the same, reproducible order for
    # every day and every run.
    feature_files = [
        os.path.join(features_folder, filename)
        for filename in sorted(os.listdir(features_folder))
    ]

    args = Namespace()
    args.verbose = True
    args.first_feature = 0
    args.last_feature = -1
    args.data_folder = "../../../data/best_features_days_data/"
    args.type = "classification"
    args.max_clicks = max_clicks
    args.need_position_feature = True
    args.labels_to_substruct = None
    args.metric_folder = "metrics"
    # The lambda's own `max_clicks` argument shadows the outer one; it is
    # presumably supplied by calculate_predictions from args.max_clicks
    # (TODO confirm). The `verbose` argument is accepted but ignored.
    args.model_constructor = lambda verbose, max_clicks: CatBoostClassifier(
        verbose=False,
        loss_function='MultiClass',
        classes_count=max_clicks + 2
    )
    # Each day gets its own list object holding the same file paths.
    args.additional_features = {day: list(feature_files) for day in [3, 6]}
    args.validation_day = None
    args.train_days = [3]
    args.test_days = [6]

    out_folder = "classification_predictions_max_clicks_{}".format(max_clicks)
    args.out_folder = os.path.join(out_folder, str(i))
    clear(args.out_folder)
    clear(args.metric_folder)

    calculate_predictions(args)
    evaluate(args.out_folder, args.data_folder, args.metric_folder, expect_argmax_positions)

    # Read the metric from the folder that was actually passed to evaluate,
    # rather than a separately hard-coded "metrics/" path.
    with open(os.path.join(args.metric_folder, "metrics.txt")) as handler:
        return float(next(handler).strip().split()[0])

In [10]:
%%time
# NOTE(review): the captured log below reports predictions with 7 classes,
# which matches classes_count = max_clicks + 2 for max_clicks = 5 (see the
# "mc = 5" summary cell). get_classification_stacking_metric as written uses
# max_clicks = 3, so re-running this cell will not reproduce this output.
metrics = [get_classification_stacking_metric(i) for i in range(15)]

"2018-06-21 11:43:26.252928": preprocesing started
"2018-06-21 11:43:27.045372":     base features shape: (20203, 80)
"2018-06-21 11:43:27.595675":     28 features added
"2018-06-21 11:43:27.595777":     result shape: (20203, 108)
"2018-06-21 11:43:27.598562": train features shape: (20203, 108)
"2018-06-21 11:43:28.255305": preprocesing finished
"2018-06-21 11:43:28.255390": start training on days [3]
"2018-06-21 11:43:28.255580": using fit without validation
"2018-06-21 11:46:14.798724": built 1000 trees
"2018-06-21 11:46:17.011799":     base features shape: (247423, 80)
"2018-06-21 11:46:22.016243":     28 features added
"2018-06-21 11:46:22.016352":     result shape: (247423, 108)
"2018-06-21 11:46:22.027786": days to predict: [6]
"2018-06-21 11:46:22.027870": features shape: (247423, 108)
"2018-06-21 11:46:22.027932": start predicting on day 6
"2018-06-21 11:46:24.176088": predictions shape: (22493, 11, 7)
"2018-06-21 11:46:24.176141": saveing results to res/train_3_test_6
"2018-06

"2018-06-21 12:07:54.724335": preprocesing started
"2018-06-21 12:07:55.510194":     base features shape: (20203, 80)
"2018-06-21 12:07:55.987901":     28 features added
"2018-06-21 12:07:55.988021":     result shape: (20203, 108)
"2018-06-21 12:07:55.990817": train features shape: (20203, 108)
"2018-06-21 12:07:56.590259": preprocesing finished
"2018-06-21 12:07:56.590323": start training on days [3]
"2018-06-21 12:07:56.590338": using fit without validation
"2018-06-21 12:10:51.537456": built 1000 trees
"2018-06-21 12:10:53.812056":     base features shape: (247423, 80)
"2018-06-21 12:10:58.933983":     28 features added
"2018-06-21 12:10:58.934069":     result shape: (247423, 108)
"2018-06-21 12:10:58.944786": days to predict: [6]
"2018-06-21 12:10:58.944842": features shape: (247423, 108)
"2018-06-21 12:10:58.944935": start predicting on day 6
"2018-06-21 12:11:01.089401": predictions shape: (22493, 11, 7)
"2018-06-21 12:11:01.089457": saveing results to res/train_3_test_6
"2018-06

In [11]:
# mc = 5
# Mean of the per-run metrics, and np.std (population, ddof=0) divided by
# sqrt(number of runs). The run count is taken from `metrics` itself so it
# stays correct if the number of runs changes.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6260886234879471, 0.003417892572126529)

In [8]:
%%time
# mc = 3
# One stacking run per feature-folder index 0..14; each call returns a single
# float metric, so `metrics` is a list of 15 floats.
metrics = [get_classification_stacking_metric(i) for i in range(15)]

"2018-06-21 12:50:57.328869": preprocesing started
"2018-06-21 12:50:58.532786":     base features shape: (20203, 80)
"2018-06-21 12:51:00.038791":     28 features added
"2018-06-21 12:51:00.038903":     result shape: (20203, 108)
"2018-06-21 12:51:00.043354": train features shape: (20203, 108)
"2018-06-21 12:51:00.862713": preprocesing finished
"2018-06-21 12:51:00.862785": start training on days [3]
"2018-06-21 12:51:00.862801": using fit without validation
"2018-06-21 12:53:56.391350": built 1000 trees
"2018-06-21 12:53:58.730146":     base features shape: (247423, 80)
"2018-06-21 12:54:03.944807":     28 features added
"2018-06-21 12:54:03.944895":     result shape: (247423, 108)
"2018-06-21 12:54:03.952324": days to predict: [6]
"2018-06-21 12:54:03.952381": features shape: (247423, 108)
"2018-06-21 12:54:03.952400": start predicting on day 6
"2018-06-21 12:54:06.496703": predictions shape: (22493, 11, 5)
"2018-06-21 12:54:06.496876": saveing results to classification_predictions_

"2018-06-21 13:13:47.393521": preprocesing started
"2018-06-21 13:13:48.148250":     base features shape: (20203, 80)
"2018-06-21 13:13:48.643831":     28 features added
"2018-06-21 13:13:48.643915":     result shape: (20203, 108)
"2018-06-21 13:13:48.645569": train features shape: (20203, 108)
"2018-06-21 13:13:49.277156": preprocesing finished
"2018-06-21 13:13:49.277260": start training on days [3]
"2018-06-21 13:13:49.277359": using fit without validation
"2018-06-21 13:15:51.141708": built 1000 trees
"2018-06-21 13:15:53.225658":     base features shape: (247423, 80)
"2018-06-21 13:15:57.807388":     28 features added
"2018-06-21 13:15:57.807493":     result shape: (247423, 108)
"2018-06-21 13:15:57.813985": days to predict: [6]
"2018-06-21 13:15:57.814031": features shape: (247423, 108)
"2018-06-21 13:15:57.814046": start predicting on day 6
"2018-06-21 13:15:59.903696": predictions shape: (22493, 11, 5)
"2018-06-21 13:15:59.903812": saveing results to classification_predictions_

In [9]:
# mc = 3
# Mean of the per-run metrics, and np.std (population, ddof=0) divided by
# sqrt(number of runs). The run count is taken from `metrics` itself so it
# stays correct if the number of runs changes.
np.mean(metrics), np.std(metrics) / np.sqrt(len(metrics))

(0.6324788602306322, 0.004553641190728026)