In [35]:
%matplotlib inline
import numpy as np
import pandas as pd
import time
import catboost
from sklearn import model_selection
import pathlib

# Names of the four box-coordinate columns. The same list is used to select the
# vote columns to aggregate, to rename the answer columns, and to stack the
# per-coordinate feature/target blocks.
COORDINATES = ["Xmin", "Ymin", "Xmax", "Ymax"]

In [36]:
def area(box):
    """Return the signed area of a box given by its two opposite corners.

    ``box`` is indexed positionally: elements 0 and 1 are one corner and
    elements 2 and 3 the opposite corner. No validation is done, so the result
    is zero for a flat box and negative when exactly one side is inverted.
    """
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height


def intersection_over_union(boxes):
    """Compute the intersection-over-union (IoU) of two axis-aligned boxes.

    Parameters
    ----------
    boxes : sequence of 8 numbers
        Two boxes laid out back to back: the first four values describe box A,
        the last four box B, each as (x_min, y_min, x_max, y_max). A pandas
        Series (for example a DataFrame row), a list, a tuple or a NumPy array
        are all accepted.

    Returns
    -------
    float
        Intersection area divided by union area. ``0.0`` is returned when
        either box has a non-positive width or height.

    Raises
    ------
    AssertionError
        If ``boxes`` does not hold exactly 8 values.
    """
    assert len(boxes) == 8
    # list() yields the values positionally for Series, arrays and plain
    # sequences alike, so the function is not tied to pandas inputs.
    coords = list(boxes)
    box_a, box_b = coords[:4], coords[4:]

    width_a, height_a = box_a[2] - box_a[0], box_a[3] - box_a[1]
    width_b, height_b = box_b[2] - box_b[0], box_b[3] - box_b[1]

    # Checking every side (instead of "area == 0") also rejects inverted boxes:
    # a single negative side gives a negative area, which could make the union
    # zero or negative and the ratio NaN or meaningless.
    if width_a <= 0 or height_a <= 0 or width_b <= 0 or height_b <= 0:
        return 0.0

    # Overlap rectangle; a negative extent means the boxes do not overlap.
    inter_width = max(0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_height = max(0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter_area = inter_width * inter_height

    union_area = width_a * height_a + width_b * height_b - inter_area
    return inter_area / float(union_area)

In [37]:
# Training votes, indexed by itemId. make_feat() reads the userId column and
# the COORDINATES columns from this frame.
votes = pd.read_csv("../raw/train_data.csv").set_index("itemId")

In [38]:
# Target boxes for the training items, indexed by itemId (used as labels by
# make_feat()).
answers = pd.read_csv("../raw/train_answers.csv").set_index("itemId")
# Positional rename so targets can be looked up by the same names as the vote
# columns. NOTE(review): assumes the file's four value columns are already in
# Xmin, Ymin, Xmax, Ymax order -- confirm against the raw file.
answers.columns = COORDINATES

In [39]:
# Votes for the test items, indexed by itemId; passed to make_feat() without
# labels to build the prediction features.
votes_test = pd.read_csv("../raw/test_data.csv").set_index("itemId")

In [40]:
def make_feat(df, df_y=None, coordinates=None):
    """Build the stacked per-item feature matrix and, optionally, its targets.

    Votes are aggregated per ``itemId`` (min, max, std, median and mean of each
    coordinate column, plus the number of non-null ``userId`` values). The
    aggregated frame is then repeated once per coordinate name, with a ``name``
    column (the last column) saying which coordinate the row is a sample for.

    Parameters
    ----------
    df : pandas.DataFrame
        Votes indexed by ``itemId``, with a ``userId`` column and one column
        per coordinate.
    df_y : pandas.DataFrame, optional
        Target boxes indexed by ``itemId`` with one column per coordinate.
    coordinates : sequence of str, optional
        Coordinate column names; defaults to the module-level ``COORDINATES``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series or list)
        The stacked features and the stacked targets in the same row order.
        When ``df_y`` is None the second element is an empty list.

    Raises
    ------
    KeyError
        If an item present in ``df`` has no row in ``df_y``.
    """
    if coordinates is None:
        coordinates = COORDINATES
    coordinates = list(coordinates)

    grouped = df.groupby("itemId")
    feat = grouped[coordinates].agg(["min", "max", "std", "median", "mean"])
    feat["count"] = grouped["userId"].count()

    feature_blocks = []
    target_blocks = []
    for name in coordinates:
        block = feat.copy()
        block["name"] = name
        feature_blocks.append(block)
        if df_y is not None:
            # Select targets in the feature row order. groupby sorts the item
            # ids while df_y keeps its file order, and downstream code pairs
            # features with targets by position.
            target_blocks.append(df_y.loc[feat.index, name])

    features = pd.concat(feature_blocks, axis=0)
    if df_y is not None:
        target_blocks = pd.concat(target_blocks, axis=0)
    return features, target_blocks

In [60]:
# Reproducibility and cross-validation settings.
SEED = 284702
FOLDS = 5

# Boosting budget and step size.
ITERATIONS = 20000
LEARNING_RATE = 0.5

# Keyword arguments handed to catboost.CatBoostRegressor for every fold.
CLF_PARAMS = {
    "loss_function": "MAE",
    "eval_metric": None,
    "random_state": SEED,
    "depth": 6,
    "od_type": "Iter",
    # Overfitting-detector wait: a tenth of the iteration budget.
    "od_wait": ITERATIONS // 10,
    # Logging period: one progress line every 1000 iterations.
    "verbose": ITERATIONS // 20,
    "learning_rate": LEARNING_RATE,
    "iterations": ITERATIONS,
    "allow_writing_files": False,
}

In [55]:
def train_catboost(x_train, y_train, x_test, subdir):
    """Train one CatBoost regressor per CV fold and average the test predictions.

    Parameters
    ----------
    x_train, y_train : pandas.DataFrame, pandas.Series
        Training features and targets, paired by position. The last column of
        ``x_train`` is treated as the only categorical feature.
    x_test : pandas.DataFrame
        Test features with the same column layout as ``x_train``.
    subdir : str
        Name of the directory under ``../processed`` that receives ``oof.csv``
        (out-of-fold predictions) and ``sub.csv`` (averaged test predictions).
        This function does not create the directory.

    Returns
    -------
    (pandas.Series, float)
        The test predictions averaged over the folds, and the mean over folds
        of the best validation MAE.

    Side effects: prints the per-fold tree counts and MAE statistics and writes
    the two CSV files described above.
    """
    # The categorical column is addressed by position: the last one.
    cat_features = [len(x_train.columns) - 1]

    def make_pool(data, label=None):
        # Every pool shares the same categorical layout and uses no weights.
        return catboost.Pool(
            data=data,
            label=label,
            cat_features=cat_features,
            weight=None
        )

    pool_test = make_pool(x_test)
    # Float accumulators: starting from an integer 0 makes an int64 Series, and
    # writing float predictions into it through .iloc relies on silent
    # upcasting, which newer pandas versions deprecate.
    y_oof = pd.Series(0.0, index=x_train.index, name="oof")
    y_pred = pd.Series(0.0, index=x_test.index, name="y_pred")
    trees = []
    scores = []

    splitter = model_selection.KFold(FOLDS, shuffle=True, random_state=SEED)
    for index_train, index_valid in splitter.split(x_train):
        pool_train = make_pool(x_train.iloc[index_train], y_train.iloc[index_train])
        pool_valid = make_pool(x_train.iloc[index_valid], y_train.iloc[index_valid])
        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        scores.append(clf.best_score_['validation']['MAE'])
        # Positional assignment: the index labels repeat, so .iloc is required.
        y_oof.iloc[index_valid] = clf.predict(pool_valid)
        # Each fold contributes an equal share to the final test prediction.
        y_pred += clf.predict(pool_test) / FOLDS

    print(f"Количество деревьев: {sorted(trees)}")
    print(f"Среднее количество деревьев: {np.mean(trees):.0f} +/- {np.std(trees):.0f}")
    print(f"MAE на кроссвалидации: " + str(np.round(sorted(scores), 5)))
    print(f"MAE среднее: {np.mean(scores):0.3f} +/- {np.std(scores):0.2f}")

    y_oof.to_csv(f"../processed/{subdir}/oof.csv", header=True)
    y_pred.to_csv(f"../processed/{subdir}/sub.csv", header=True)
    return y_pred, np.mean(scores)

In [57]:
def make_forecast():
    """Run the whole pipeline and write the submission files.

    Builds features from the module-level ``votes``, ``answers`` and
    ``votes_test`` frames, trains with ``train_catboost`` and writes
    ``_sub_full.csv`` (one row per item, one column per coordinate, no header)
    into a timestamped directory under ``../processed``. The directory is
    finally renamed to carry the cross-validation score as a suffix.

    Returns None.
    """
    subdir = time.strftime('%Y-%m-%d_%H-%M')
    path = pathlib.Path(f"../processed/{subdir}")
    # parents=True: works when ../processed does not exist yet.
    # exist_ok=True: a re-run within the same minute (for example after a
    # failed run left the directory behind) does not abort here.
    path.mkdir(parents=True, exist_ok=True)
    x_train, y_train = make_feat(votes, answers)
    x_test, _ = make_feat(votes_test)
    y_pred, score = train_catboost(x_train, y_train, x_test, subdir)

    # make_feat stacks one block of rows per coordinate, so the prediction
    # vector splits into len(COORDINATES) equal, consecutive chunks.
    n_coords = len(COORDINATES)
    size = len(x_test) // n_coords
    chunks = [y_pred.iloc[size * i: size * (i + 1)] for i in range(n_coords)]
    # The chunks share the same item index, so a column-wise concat lines them
    # up as one row per item.
    pd.concat(chunks, axis=1).to_csv(path / "_sub_full.csv", header=False)
    path.rename(path.parent / f"{subdir}-{score:0.1f}")

In [61]:
# Run the full pipeline: build features, train with cross-validation and write
# the out-of-fold, submission and full-submission CSV files.
make_forecast()

0:	learn: 522.8282518	test: 561.2475209	best: 561.2475209 (0)	total: 18.5ms	remaining: 6m 10s
1000:	learn: 343.7461709	test: 373.6502066	best: 373.6502066 (1000)	total: 11.8s	remaining: 3m 43s
2000:	learn: 207.2686248	test: 229.8072278	best: 229.8072278 (2000)	total: 23.5s	remaining: 3m 31s
3000:	learn: 120.1111157	test: 133.8745711	best: 133.8745711 (3000)	total: 35.3s	remaining: 3m 19s
4000:	learn: 74.6032320	test: 82.3915863	best: 82.3915863 (4000)	total: 48.2s	remaining: 3m 12s
5000:	learn: 56.9565174	test: 63.5323659	best: 63.5323659 (5000)	total: 1m	remaining: 3m
6000:	learn: 51.4104466	test: 58.5949877	best: 58.5949877 (6000)	total: 1m 11s	remaining: 2m 47s
7000:	learn: 48.9083984	test: 56.4382644	best: 56.4382644 (7000)	total: 1m 23s	remaining: 2m 35s
8000:	learn: 47.2272763	test: 55.3060355	best: 55.3060355 (8000)	total: 1m 35s	remaining: 2m 23s
9000:	learn: 46.0848154	test: 54.7987140	best: 54.7979764 (8997)	total: 1m 47s	remaining: 2m 11s
10000:	learn: 45.2318840	test: 54.50

Количество деревьев: [9912, 10819, 12080, 14975, 19322]
Среднее количество деревьев: 13422 +/- 3409
MAE на кроссвалидации: [50.71607 52.07535 52.57703 54.38364 54.86593]
MAE среднее: 52.924 +/- 1.52
