In [81]:
%matplotlib inline
import numpy as np
import pandas as pd
import time
import catboost
from sklearn import model_selection
import pathlib

# Bounding-box coordinate column names. make_feat aggregates these columns per
# item, and make_forecast trains one model per entry against "<name>_true".
COORDINATES = ["Xmin", "Ymin", "Xmax", "Ymax"]

In [2]:
def area(box):
    """Return the signed area of a box indexed as (x_min, y_min, x_max, y_max).

    Only positions 0..3 of ``box`` are read. The result is negative when
    exactly one of the two extents is inverted.
    """
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height


def intersection_over_union(boxes):
    """Compute the intersection-over-union (IoU) of two axis-aligned boxes.

    Parameters
    ----------
    boxes : sequence of 8 numbers
        Box A as ``(x_min, y_min, x_max, y_max)`` immediately followed by
        box B in the same layout. A pandas Series (e.g. a DataFrame row),
        a NumPy array, a list or a tuple are all accepted.

    Returns
    -------
    float
        The IoU of the two boxes, in ``[0, 1]``. ``0.0`` is returned when
        either box is degenerate, i.e. its width or height is zero, negative
        or NaN.

    Raises
    ------
    AssertionError
        If ``boxes`` does not contain exactly 8 values.
    """
    assert len(boxes) == 8, "expected 8 values: two (Xmin, Ymin, Xmax, Ymax) boxes"

    # np.asarray handles Series/array/list alike, so the function is not tied
    # to pandas objects exposing ``.values``.
    coords = np.asarray(boxes, dtype=float)
    ax_min, ay_min, ax_max, ay_max, bx_min, by_min, bx_max, by_max = coords

    a_width, a_height = ax_max - ax_min, ay_max - ay_min
    b_width, b_height = bx_max - bx_min, by_max - by_min

    # Check each extent separately rather than the area product: a box that is
    # inverted on both axes has a positive product, and a box inverted on one
    # axis has a negative one, which can drive the union to zero or below.
    # Written as "not (all positive)" so that NaN extents are rejected too.
    if not (a_width > 0 and a_height > 0 and b_width > 0 and b_height > 0):
        return 0.0

    inter_width = max(0.0, min(ax_max, bx_max) - max(ax_min, bx_min))
    inter_height = max(0.0, min(ay_max, by_max) - max(ay_min, by_min))
    inter_area = inter_width * inter_height

    # Both areas are strictly positive and the intersection cannot exceed
    # either of them, so the union is strictly positive here.
    union_area = a_width * a_height + b_width * b_height - inter_area
    return float(inter_area / union_area)

In [12]:
# Training votes, one row per vote, indexed by itemId.
votes = pd.read_csv("../raw/train_data.csv", index_col="itemId")

In [14]:
# Training targets indexed by itemId (make_forecast reads the "<coord>_true" columns).
answers = pd.read_csv("../raw/train_answers.csv", index_col="itemId")

In [17]:
# Test-set votes, indexed by itemId like the training votes.
votes_test = pd.read_csv("../raw/test_data.csv", index_col="itemId")

In [46]:
def make_feat(df):
    """Build per-item features from a frame of votes.

    ``df`` is grouped by ``itemId``. For every column in ``COORDINATES`` the
    min, max, std, median and mean are computed per item, and a ``count``
    column holding the number of non-null ``userId`` values per item is added.
    """
    by_item = df.groupby("itemId")
    stats = by_item[COORDINATES].agg(["min", "max", "std", "median", "mean"])
    # Assigned as a one-column DataFrame so it aligns on the itemId index.
    stats["count"] = by_item[["userId"]].count()
    return stats

In [120]:
# Seed shared by the K-fold splitter and CatBoost; number of CV folds.
SEED = 284702
FOLDS = 5

# Upper bound on boosting rounds and the boosting step size.
ITERATIONS = 20000
LEARNING_RATE = 0.5

# Keyword arguments passed to catboost.CatBoostRegressor in train_catboost.
CLF_PARAMS = {
    "loss_function": "MAE",
    "eval_metric": None,
    "random_state": SEED,
    "depth": 6,
    "od_type": "Iter",
    # Overfitting detector patience: 10% of the iteration budget.
    "od_wait": ITERATIONS // 10,
    # Log a progress line every 5% of the iteration budget.
    "verbose": ITERATIONS // 20,
    "learning_rate": LEARNING_RATE,
    "iterations": ITERATIONS,
    "allow_writing_files": False,
}

In [100]:
def train_catboost(x_train, y_train, x_test, name, subdir):
    """Train a K-fold CatBoost regressor for one target and save its predictions.

    A fresh ``CatBoostRegressor(**CLF_PARAMS)`` is fitted on each of ``FOLDS``
    shuffled folds, using the held-out part as the evaluation set. Held-out
    predictions are collected as out-of-fold predictions, and test predictions
    are averaged over the folds.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target, positionally aligned with ``x_train``.
    x_test : pandas.DataFrame
        Test features.
    name : str
        Target name; used in log messages, Series names and output file names.
    subdir : str
        Sub-directory of ``../processed`` that receives the CSV files. It must
        already exist.

    Returns
    -------
    tuple
        ``(y_pred, mean_score)``: the fold-averaged test predictions as a
        Series indexed like ``x_test``, and the mean over folds of the best
        validation MAE.

    Side effects
    ------------
    Writes ``oof_{name}.csv`` and ``sub_{name}.csv`` into
    ``../processed/{subdir}`` and prints per-target statistics.
    """
    print(f"Прогнозирование {name}")
    pool_test = catboost.Pool(data=x_test)

    # Start from float zeros: the accumulators receive float predictions, and
    # writing floats into an int64 Series through .iloc relies on a silent
    # upcast that pandas has deprecated.
    y_oof = pd.Series(0.0, index=x_train.index, name=f"oof_{name}")
    y_pred = pd.Series(0.0, index=x_test.index, name=name)
    trees = []
    scores = []

    splitter = model_selection.KFold(FOLDS, shuffle=True, random_state=SEED)
    for index_train, index_valid in splitter.split(x_train):
        pool_train = catboost.Pool(
            data=x_train.iloc[index_train],
            label=y_train.iloc[index_train],
        )
        pool_valid = catboost.Pool(
            data=x_train.iloc[index_valid],
            label=y_train.iloc[index_valid],
        )
        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        scores.append(clf.best_score_['validation']['MAE'])
        y_oof.iloc[index_valid] = clf.predict(pool_valid)
        # Each fold contributes an equal share to the final test prediction.
        y_pred += clf.predict(pool_test) / FOLDS

    print(f"Количество деревьев: {sorted(trees)}")
    print(f"Среднее количество деревьев: {np.mean(trees):.0f} +/- {np.std(trees):.0f}")
    print(f"MAE на кроссвалидации: " + str(np.round(sorted(scores), 5)))
    print(f"MAE среднее: {np.mean(scores):0.3f} +/- {np.std(scores):0.2f}")

    y_oof.to_csv(f"../processed/{subdir}/oof_{name}.csv", header=True)
    y_pred.to_csv(f"../processed/{subdir}/sub_{name}.csv", header=True)
    return y_pred, np.mean(scores)

In [113]:
def make_forecast():
    """Run the full pipeline: build features, train one model per coordinate, save results.

    Creates a timestamped directory ``../processed/<YYYY-MM-DD_HH-MM>``, then
    calls ``train_catboost`` once for every name in ``COORDINATES`` using the
    ``"<name>_true"`` column of ``answers`` as the target. The four test
    predictions are written side by side to ``_sub_full.csv`` (no header row).
    Finally the directory is renamed to carry the mean cross-validation MAE
    over the four targets as a suffix.

    Reads the module-level ``votes``, ``answers`` and ``votes_test`` frames.
    Returns ``None``.
    """
    subdir = time.strftime('%Y-%m-%d_%H-%M')
    path = pathlib.Path(f"../processed/{subdir}")
    # parents=True: do not fail when ../processed has not been created yet.
    # exist_ok=True: allow a re-run within the same minute after an aborted
    # attempt left the (not yet renamed) directory behind.
    path.mkdir(parents=True, exist_ok=True)

    x_train = make_feat(votes)
    # Reorder the answers so the targets line up with the feature rows.
    y_train = answers.loc[x_train.index]
    x_test = make_feat(votes_test)

    scores = []
    rez = []
    for name in COORDINATES:
        y_pred, score = train_catboost(x_train, y_train[f"{name}_true"], x_test, name, subdir)
        scores.append(score)
        rez.append(y_pred)

    pd.concat(rez, axis=1).to_csv(path / "_sub_full.csv", header=False)
    # Tag the run directory with its mean validation MAE.
    path.rename(path.parent / f"{subdir}-{np.mean(scores):0.1f}")

In [121]:
# Train all coordinate models and write the submission files (long-running; see the log below).
make_forecast()

Прогнозирование Xmin
0:	learn: 84.3637411	test: 76.6505328	best: 76.6505328 (0)	total: 7.32ms	remaining: 2m 26s
1000:	learn: 42.3355144	test: 45.1991929	best: 45.1991929 (1000)	total: 7.51s	remaining: 2m 22s
2000:	learn: 36.7080378	test: 45.0026379	best: 44.8625122 (1500)	total: 14.9s	remaining: 2m 13s
3000:	learn: 32.7224531	test: 45.4627290	best: 44.8625122 (1500)	total: 22.2s	remaining: 2m 5s
Stopped by overfitting detector  (2000 iterations wait)

bestTest = 44.86251216
bestIteration = 1500

Shrink model to first 1501 iterations.
0:	learn: 80.8744827	test: 90.5627668	best: 90.5627668 (0)	total: 7.4ms	remaining: 2m 27s
1000:	learn: 39.8712803	test: 54.9443583	best: 54.9443583 (1000)	total: 7.34s	remaining: 2m 19s
2000:	learn: 34.4586841	test: 52.9336084	best: 52.9336084 (2000)	total: 14.8s	remaining: 2m 12s
3000:	learn: 30.5026433	test: 52.4763524	best: 52.4680139 (2985)	total: 22.3s	remaining: 2m 6s
4000:	learn: 27.2498395	test: 52.2046414	best: 52.2022569 (3997)	total: 29.5s	remai

4000:	learn: 37.2772022	test: 50.4438913	best: 50.4438913 (4000)	total: 27.1s	remaining: 1m 48s
5000:	learn: 25.8432286	test: 41.9518718	best: 41.9518718 (5000)	total: 34.5s	remaining: 1m 43s
6000:	learn: 22.4734587	test: 40.9982327	best: 40.9948449 (5991)	total: 42s	remaining: 1m 37s
7000:	learn: 19.4820602	test: 40.2268867	best: 40.2216962 (6969)	total: 49.6s	remaining: 1m 32s
8000:	learn: 17.6052455	test: 40.1449840	best: 40.1252593 (7832)	total: 57s	remaining: 1m 25s
9000:	learn: 16.0554806	test: 40.0658837	best: 40.0354366 (8492)	total: 1m 4s	remaining: 1m 18s
10000:	learn: 14.6231287	test: 40.1342563	best: 40.0354366 (8492)	total: 1m 12s	remaining: 1m 12s
Stopped by overfitting detector  (2000 iterations wait)

bestTest = 40.03543655
bestIteration = 8492

Shrink model to first 8493 iterations.
0:	learn: 624.3892528	test: 636.2415968	best: 636.2415968 (0)	total: 10.9ms	remaining: 3m 37s
1000:	learn: 386.4043786	test: 398.3192805	best: 398.3192805 (1000)	total: 4.38s	remaining: 1m 

Stopped by overfitting detector  (2000 iterations wait)

bestTest = 72.21508089
bestIteration = 9306

Shrink model to first 9307 iterations.
Количество деревьев: [4646, 4758, 6061, 9307, 11114]
Среднее количество деревьев: 7177 +/- 2590
MAE на кроссвалидации: [58.40448 62.96144 64.96486 67.34195 72.21508]
MAE среднее: 65.178 +/- 4.58
Прогнозирование Ymax
0:	learn: 949.3472682	test: 930.7491156	best: 930.7491156 (0)	total: 19.5ms	remaining: 6m 29s
1000:	learn: 700.9074348	test: 682.1418636	best: 682.1418636 (1000)	total: 1.6s	remaining: 30.3s
2000:	learn: 454.1797651	test: 434.9203342	best: 434.9203342 (2000)	total: 3.5s	remaining: 31.5s
3000:	learn: 253.7853728	test: 241.2732033	best: 241.2732033 (3000)	total: 10.6s	remaining: 59.8s
4000:	learn: 124.6195644	test: 127.6316599	best: 127.6316599 (4000)	total: 17.9s	remaining: 1m 11s
5000:	learn: 69.0539334	test: 83.6209563	best: 83.6209563 (5000)	total: 25.1s	remaining: 1m 15s
6000:	learn: 50.6548237	test: 71.1408550	best: 71.1408550 (600

1000:	learn: 690.8473568	test: 722.8887172	best: 722.8887172 (1000)	total: 1.48s	remaining: 28.1s
2000:	learn: 443.7119290	test: 477.0325963	best: 477.0325963 (2000)	total: 3.15s	remaining: 28.3s
3000:	learn: 245.2629440	test: 278.0770533	best: 278.0770533 (3000)	total: 9.81s	remaining: 55.6s
4000:	learn: 119.2212914	test: 150.2945986	best: 150.2945986 (4000)	total: 17.1s	remaining: 1m 8s
5000:	learn: 65.1732893	test: 97.7733025	best: 97.7733025 (5000)	total: 24.4s	remaining: 1m 13s
6000:	learn: 48.5819169	test: 77.2645121	best: 77.2645121 (6000)	total: 31.5s	remaining: 1m 13s
7000:	learn: 39.4807267	test: 67.6477484	best: 67.6477484 (7000)	total: 38.7s	remaining: 1m 11s
8000:	learn: 35.3220274	test: 64.9152178	best: 64.9152178 (8000)	total: 46s	remaining: 1m 8s
9000:	learn: 32.6488635	test: 64.0398520	best: 64.0329750 (8991)	total: 53.2s	remaining: 1m 4s
10000:	learn: 30.3924897	test: 63.8382973	best: 63.8157001 (9777)	total: 1m	remaining: 1m
11000:	learn: 28.4686048	test: 63.7448379	