In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import time
import catboost
from sklearn import model_selection
import pathlib

# Bounding-box coordinate column names. make_feat aggregates these columns per item,
# and make_forecast trains one model per entry against the "<name>_true" answer column.
COORDINATES = ["Xmin", "Ymin", "Xmax", "Ymax"]

In [2]:
def area(box):
    """Return the signed area of a box given as ``(xmin, ymin, xmax, ymax)``.

    The result is negative when the box is inverted in exactly one dimension
    (``xmax < xmin`` or ``ymax < ymin``); callers must handle that case.
    """
    return (box[2] - box[0]) * (box[3] - box[1])


def intersection_over_union(boxes):
    """Compute the intersection-over-union of two axis-aligned boxes.

    Parameters
    ----------
    boxes : sequence of 8 numbers
        Two boxes laid out back to back as
        ``(xmin, ymin, xmax, ymax, xmin, ymin, xmax, ymax)``. Any 8-element
        sequence is accepted (pandas Series, list, tuple, numpy array).

    Returns
    -------
    float
        Intersection area divided by union area. ``0.0`` is returned when
        either box has a non-positive area (degenerate or inverted box).
    """
    assert len(boxes) == 8, "expected two (xmin, ymin, xmax, ymax) boxes, 8 values in total"
    # np.asarray reads the values positionally, so label-indexed Series and
    # plain sequences are handled the same way.
    coords = np.asarray(boxes, dtype=float)
    box_a, box_b = coords[:4], coords[4:]

    area_a = area(box_a)
    area_b = area(box_b)

    # A zero area makes the ratio meaningless, and a negative (inverted-box) area
    # can drive the union to zero or below, yielding NaN or a division error.
    if area_a <= 0 or area_b <= 0:
        return 0.0

    # Overlap extent along each axis, clamped at zero for disjoint boxes.
    inter_w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter_area = inter_w * inter_h

    union_area = area_a + area_b - inter_area
    return float(inter_area / union_area)

In [3]:
# Training votes, indexed by itemId so that make_feat can group the rows per item.
votes = (
    pd.read_csv("../raw/train_data.csv")
    .set_index("itemId")
)

In [4]:
# Training answers, indexed by itemId; make_forecast reads its "<coord>_true" columns.
answers = (
    pd.read_csv("../raw/train_answers.csv")
    .set_index("itemId")
)

In [5]:
# Test-set votes, indexed by itemId in the same way as the training votes.
votes_test = (
    pd.read_csv("../raw/test_data.csv")
    .set_index("itemId")
)

In [6]:
def make_feat(df):
    """Aggregate a votes table into one feature row per ``itemId``.

    For every column listed in ``COORDINATES`` the per-item min, max, std,
    median and mean are computed. A ``count`` column is then added holding the
    number of non-null ``userId`` values for the item.

    ``df`` must expose ``itemId`` as a group key (the notebook passes frames
    indexed by ``itemId``) and contain ``userId`` plus the coordinate columns.
    """
    stats = ["min", "max", "std", "median", "mean"]
    features = df.groupby("itemId")[COORDINATES].agg(stats)

    votes_per_item = df[["userId"]].groupby("itemId").count()
    features["count"] = votes_per_item
    return features

In [15]:
# Seed shared by the K-fold splitter and the CatBoost models, and the number of folds.
SEED = 284702
FOLDS = 5

# Boosting budget and step size used by every model.
ITERATIONS = 3000
LEARNING_RATE = 0.03

# Keyword arguments handed to catboost.CatBoostRegressor in train_catboost.
CLF_PARAMS = {
    "loss_function": "RMSE",
    "eval_metric": None,
    "random_state": SEED,
    "depth": 6,
    "od_type": "Iter",
    # Overfitting-detector wait: a tenth of the iteration budget.
    "od_wait": ITERATIONS // 10,
    # Logging period: one line every twentieth of the iteration budget.
    "verbose": ITERATIONS // 20,
    "learning_rate": LEARNING_RATE,
    "iterations": ITERATIONS,
    "allow_writing_files": False,
}

In [8]:
def train_catboost(x_train, y_train, x_test, name, subdir):
    """Cross-validate a CatBoost regressor for one target and predict the test set.

    A ``KFold(FOLDS, shuffle=True, random_state=SEED)`` split of ``x_train`` is
    used. For each fold a ``CatBoostRegressor(**CLF_PARAMS)`` is fitted with the
    held-out part as ``eval_set``; its predictions fill the out-of-fold series
    and its test predictions are averaged over the folds.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target, row-aligned with ``x_train`` (it is sliced with the
        same positional indices).
    x_test : pandas.DataFrame
        Test features.
    name : str
        Target name; used for the returned Series name and the output file names.
    subdir : str
        Existing sub-directory of ``../processed`` that receives the CSV files.

    Returns
    -------
    tuple
        ``(y_pred, mean_score)``: the fold-averaged test prediction Series and
        the mean over folds of ``clf.best_score_['validation']['RMSE']``.

    Side effects
    ------------
    Prints progress, and writes ``oof_{name}.csv`` and ``sub_{name}.csv`` into
    ``../processed/{subdir}``.
    """
    print(f"Прогнозирование {name}")
    pool_test = catboost.Pool(
        data=x_test,
        label=None,
        cat_features=None,
        weight=None
    )
    # Accumulators start as float: the predictions written into them are floats, and
    # an int64 Series would have to be upcast on assignment, which newer pandas
    # deprecates (FutureWarning) and plans to reject.
    y_oof = pd.Series(0.0, index=x_train.index, name=f"oof_{name}")
    y_pred = pd.Series(0.0, index=x_test.index, name=name)
    trees = []
    scores = []

    folds = model_selection.KFold(FOLDS, shuffle=True, random_state=SEED)
    for index_train, index_valid in folds.split(x_train):
        pool_train = catboost.Pool(
            data=x_train.iloc[index_train],
            label=y_train.iloc[index_train],
            cat_features=None,
            weight=None
        )
        pool_valid = catboost.Pool(
            data=x_train.iloc[index_valid],
            label=y_train.iloc[index_valid],
            cat_features=None,
            weight=None
        )
        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        scores.append(clf.best_score_['validation']['RMSE'])
        # Each training row is predicted exactly once, by the fold that held it out.
        y_oof.iloc[index_valid] = clf.predict(pool_valid)
        # Test prediction is the plain average of the per-fold models.
        y_pred += clf.predict(pool_test) / FOLDS

    print(f"Количество деревьев: {sorted(trees)}")
    print(f"Среднее количество деревьев: {np.mean(trees):.0f} +/- {np.std(trees):.0f}")
    print("RMSE на кроссвалидации: " + str(np.round(sorted(scores), 5)))
    print(f"RMSE среднее: {np.mean(scores):0.3f} +/- {np.std(scores):0.2f}")

    y_oof.to_csv(f"../processed/{subdir}/oof_{name}.csv", header=True)
    y_pred.to_csv(f"../processed/{subdir}/sub_{name}.csv", header=True)
    return y_pred, np.mean(scores)

In [9]:
def make_forecast():
    """Train one model per box coordinate and write the combined submission.

    Steps:
      1. Create ``../processed/<timestamp>`` (minute resolution) as output directory.
      2. Build features from the module-level ``votes`` and ``votes_test`` frames
         with ``make_feat`` and align ``answers`` to the training features.
      3. For each name in ``COORDINATES`` call ``train_catboost`` against the
         ``"<name>_true"`` answer column.
      4. Write the per-coordinate test predictions side by side to
         ``_sub_full.csv`` (no header row).
      5. Rename the output directory by appending the mean of the per-coordinate
         scores, formatted with one decimal.

    Returns nothing; all results are written to disk.
    """
    subdir = time.strftime('%Y-%m-%d_%H-%M')
    path = pathlib.Path(f"../processed/{subdir}")
    # parents=True creates ../processed when it does not exist yet (fresh checkout).
    # exist_ok is left at False so a second run within the same minute fails
    # instead of silently overwriting the previous run's files.
    path.mkdir(parents=True)

    x_train = make_feat(votes)
    y_train = answers.loc[x_train.index]
    x_test = make_feat(votes_test)

    scores = []
    predictions = []
    for name in COORDINATES:
        y_pred, score = train_catboost(x_train, y_train[f"{name}_true"], x_test, name, subdir)
        scores.append(score)
        predictions.append(y_pred)

    pd.concat(predictions, axis=1).to_csv(path / "_sub_full.csv", header=False)
    path.rename(path.parent / f"{subdir}-{np.mean(scores):0.1f}")

In [16]:
# Run the full pipeline: trains the per-coordinate models and writes the CSV outputs.
make_forecast()

Прогнозирование Xmin
0:	learn: 111.5402728	test: 104.8273471	best: 104.8273471 (0)	total: 7.48ms	remaining: 22.4s
150:	learn: 59.7302110	test: 65.4232606	best: 65.3350909 (121)	total: 1.25s	remaining: 23.6s
300:	learn: 54.1716804	test: 66.2301222	best: 65.3350909 (121)	total: 2.41s	remaining: 21.6s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 65.33509086
bestIteration = 121

Shrink model to first 122 iterations.
0:	learn: 106.8936397	test: 122.9012358	best: 122.9012358 (0)	total: 7.35ms	remaining: 22.1s
150:	learn: 57.1572373	test: 75.7793896	best: 75.7793896 (150)	total: 1.16s	remaining: 21.8s
300:	learn: 50.4323600	test: 74.3909928	best: 74.3164113 (296)	total: 2.31s	remaining: 20.7s
450:	learn: 45.4360621	test: 74.3209401	best: 74.1893924 (331)	total: 3.46s	remaining: 19.6s
600:	learn: 42.5610015	test: 74.6577149	best: 74.1893924 (331)	total: 4.65s	remaining: 18.5s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 74.18939242
bestIteration = 33

1050:	learn: 65.5284168	test: 92.6245479	best: 92.6120558 (1047)	total: 8.47s	remaining: 15.7s
1200:	learn: 63.5370465	test: 92.5302359	best: 92.4892735 (1193)	total: 9.69s	remaining: 14.5s
1350:	learn: 61.7412171	test: 92.5247951	best: 92.3747233 (1231)	total: 10.8s	remaining: 13.2s
1500:	learn: 60.5730849	test: 92.5852149	best: 92.3747233 (1231)	total: 12.1s	remaining: 12.1s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 92.37472335
bestIteration = 1231

Shrink model to first 1232 iterations.
0:	learn: 528.6775341	test: 497.9772674	best: 497.9772674 (0)	total: 12.6ms	remaining: 37.8s
150:	learn: 85.2322542	test: 87.9049935	best: 87.8667516 (149)	total: 1.81s	remaining: 34.1s
300:	learn: 79.8613951	test: 87.7589870	best: 87.7053550 (259)	total: 3.34s	remaining: 30s
450:	learn: 77.2689489	test: 87.6333775	best: 87.5509464 (399)	total: 4.76s	remaining: 26.9s
600:	learn: 74.4180036	test: 87.9735254	best: 87.5509464 (399)	total: 6.31s	remaining: 25.2s
Stopped by overfi

450:	learn: 74.3913105	test: 78.6146793	best: 78.6021754 (445)	total: 3.35s	remaining: 18.9s
600:	learn: 73.7591883	test: 78.5640736	best: 78.5342545 (564)	total: 4.15s	remaining: 16.6s
750:	learn: 73.0323215	test: 78.5844641	best: 78.5316468 (662)	total: 4.74s	remaining: 14.2s
900:	learn: 71.8856082	test: 78.3352004	best: 78.3267976 (895)	total: 5.65s	remaining: 13.2s
1050:	learn: 71.2663307	test: 78.4138419	best: 78.3267976 (895)	total: 6.38s	remaining: 11.8s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 78.32679756
bestIteration = 895

Shrink model to first 896 iterations.
0:	learn: 953.6968372	test: 992.6929427	best: 992.6929427 (0)	total: 8.03ms	remaining: 24.1s
150:	learn: 80.0436728	test: 99.7533269	best: 99.7533269 (150)	total: 1.28s	remaining: 24.2s
300:	learn: 72.4608240	test: 91.7041464	best: 91.7041464 (300)	total: 2.46s	remaining: 22.1s
450:	learn: 70.9598836	test: 90.8655808	best: 90.8655808 (450)	total: 3.51s	remaining: 19.9s
600:	learn: 69.7686438	t