In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import time
import catboost
from sklearn import model_selection
import pathlib

# Names of the four bounding-box coordinate columns. Used below as the vote
# columns to aggregate, as the column names of the answers frame, and as the
# values of the categorical "name" feature.
COORDINATES = ["Xmin", "Ymin", "Xmax", "Ymax"]

In [2]:
def area(box):
    """Return the signed area of a box laid out as (x_min, y_min, x_max, y_max).

    No validation is done: if a "max" coordinate is smaller than its "min"
    counterpart the corresponding side length is negative.
    """
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height


def intersection_over_union(boxes):
    """Compute the intersection-over-union of two boxes packed in one object.

    ``boxes`` must have length 8 and its slices must expose ``.values``
    (e.g. a pandas Series). Entries 0-3 describe the first box and entries
    4-7 the second one, each as (x_min, y_min, x_max, y_max).

    Returns 0 if either box has zero area, otherwise the intersection area
    divided by the union area.
    """
    assert len(boxes) == 8
    first = boxes[:4].values
    second = boxes[4:].values

    area_first = area(first)
    area_second = area(second)

    # A degenerate box cannot overlap anything; this also avoids 0 / 0 below.
    if area_first == 0 or area_second == 0:
        return 0

    # Corners of the overlapping rectangle (if there is one).
    inter_x_min = max(first[0], second[0])
    inter_y_min = max(first[1], second[1])
    inter_x_max = min(first[2], second[2])
    inter_y_max = min(first[3], second[3])

    # Clamp each side at zero so disjoint boxes give an empty intersection.
    inter_width = max(0, inter_x_max - inter_x_min)
    inter_height = max(0, inter_y_max - inter_y_min)
    overlap = inter_width * inter_height

    union = float(area_first + area_second - overlap)
    return overlap / union

In [3]:
# Training votes, read from CSV and indexed by itemId.
votes = pd.read_csv("../raw/train_data.csv")
votes = votes.set_index("itemId")

In [4]:
# Training answers, indexed by itemId; the remaining columns are renamed to
# the coordinate names so they can be looked up via COORDINATES.
answers = pd.read_csv("../raw/train_answers.csv")
answers = answers.set_index("itemId")
answers.columns = COORDINATES

In [5]:
# Test votes, read from CSV and indexed by itemId.
votes_test = pd.read_csv("../raw/test_data.csv")
votes_test = votes_test.set_index("itemId")

In [6]:
def make_feat(df, df_y=None):
    """Build the per-item feature table, stacked once per coordinate.

    Parameters
    ----------
    df : pandas.DataFrame
        Votes indexed by ``itemId``, with a ``userId`` column and the
        columns listed in ``COORDINATES``.
    df_y : pandas.DataFrame, optional
        Targets indexed by ``itemId`` with one column per entry of
        ``COORDINATES``.

    Returns
    -------
    tuple
        ``(features, targets)``. ``features`` holds, for every coordinate
        name, one copy of the per-item aggregates (min/max/std/median/mean of
        each coordinate plus the vote count) with the coordinate name stored
        in the trailing ``name`` column. ``targets`` is the stacked Series of
        the matching target values in the same row order, or an empty list
        when ``df_y`` is None.

    Raises
    ------
    ValueError
        If ``df_y`` has no row for an item that is present in ``df``.
    """
    grouped = df.groupby("itemId")
    feat = grouped[COORDINATES].agg(["min", "max", "std", "median", "mean"])
    # Number of (non-null) userId entries per item.
    feat["count"] = grouped["userId"].count()

    if df_y is not None:
        missing = feat.index.difference(df_y.index)
        if len(missing) > 0:
            raise ValueError(
                f"df_y has no targets for {len(missing)} item(s) present in df"
            )

    dfs = []
    dfs_y = []
    for name in COORDINATES:
        feat_copy = feat.copy(deep=True)
        # Added last on purpose: downstream code treats the final column as
        # the categorical feature.
        feat_copy["name"] = name
        dfs.append(feat_copy)
        if df_y is not None:
            # Features follow the sorted groupby order, and callers pair
            # features with targets by position, so the targets must be put
            # into exactly the same item order.
            dfs_y.append(df_y[name].reindex(feat.index))
    dfs = pd.concat(dfs, axis=0)
    if df_y is not None:
        dfs_y = pd.concat(dfs_y, axis=0)
    return dfs, dfs_y

In [7]:
# Seed shared by the fold splitter and the model, and the number of CV folds.
SEED = 284702
FOLDS = 5

# Upper bound on boosting iterations and the boosting step size.
ITERATIONS = 100000
LEARNING_RATE = 0.1

# Keyword arguments passed to catboost.CatBoostRegressor. The od_wait window
# and the logging period are both derived from the iteration budget.
CLF_PARAMS = {
    "loss_function": "MAE",
    "eval_metric": None,
    "random_state": SEED,
    "depth": 6,
    "od_type": "Iter",
    "od_wait": ITERATIONS // 10,
    "verbose": ITERATIONS // 20,
    "learning_rate": LEARNING_RATE,
    "iterations": ITERATIONS,
    "allow_writing_files": False,
}

In [8]:
def train_catboost(x_train, y_train, x_test, subdir):
    """Cross-validate a CatBoost regressor and save OOF and test predictions.

    One model per fold is fitted with ``CLF_PARAMS``; each model predicts its
    validation rows (out-of-fold predictions) and the whole test set. Test
    predictions are averaged over the ``FOLDS`` models.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature tables whose last column is the categorical feature.
    y_train : pandas.Series
        Targets, positionally aligned with the rows of ``x_train``.
    subdir : str
        Name of an existing directory under ``../processed`` that receives
        ``oof.csv`` and ``sub.csv``.

    Returns
    -------
    tuple
        ``(y_pred, mean_score)``: the averaged test predictions and the mean
        of the per-fold best validation MAE.
    """
    # The last feature column is passed to CatBoost as categorical.
    cat_features = [len(x_train.columns) - 1]
    pool_test = catboost.Pool(
        data=x_test,
        label=None,
        cat_features=cat_features,
        weight=None
    )
    # Created as float (0.0, not 0): float predictions are written into these
    # Series, and assigning floats into an int Series relies on a silent
    # upcast that recent pandas versions deprecate.
    y_oof = pd.Series(0.0, index=x_train.index, name="oof")
    y_pred = pd.Series(0.0, index=x_test.index, name="y_pred")
    trees = []
    scores = []

    # NOTE(review): when x_train holds one row per (item, coordinate), rows of
    # the same item can land in both the train and validation part of a fold -
    # confirm this is intended.
    splitter = model_selection.KFold(FOLDS, shuffle=True, random_state=SEED)
    for index_train, index_valid in splitter.split(x_train):
        pool_train = catboost.Pool(
            data=x_train.iloc[index_train],
            label=y_train.iloc[index_train],
            cat_features=cat_features,
            weight=None
        )
        pool_valid = catboost.Pool(
            data=x_train.iloc[index_valid],
            label=y_train.iloc[index_valid],
            cat_features=cat_features,
            weight=None
        )
        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        scores.append(clf.best_score_['validation']['MAE'])
        # Positional assignment: the index of x_train may contain repeats.
        y_oof.iloc[index_valid] = clf.predict(pool_valid)
        y_pred += clf.predict(pool_test) / FOLDS

    print(f"Количество деревьев: {sorted(trees)}")
    print(f"Среднее количество деревьев: {np.mean(trees):.0f} +/- {np.std(trees):.0f}")
    print("MAE на кроссвалидации: " + str(np.round(sorted(scores), 5)))
    print(f"MAE среднее: {np.mean(scores):0.3f} +/- {np.std(scores):0.2f}")

    y_oof.to_csv(f"../processed/{subdir}/oof.csv", header=True)
    y_pred.to_csv(f"../processed/{subdir}/sub.csv", header=True)
    return y_pred, np.mean(scores)

In [9]:
def make_forecast():
    """Run the full pipeline and write all artefacts under ``../processed``.

    Creates a directory named after the current time (minute resolution),
    builds train/test features, cross-validates the model, writes the test
    predictions reshaped to one column per coordinate into ``_sub_full.csv``
    (no header), and finally renames the directory so that its name ends with
    the mean validation score.
    """
    subdir = time.strftime('%Y-%m-%d_%H-%M')
    path = pathlib.Path(f"../processed/{subdir}")
    # parents=True: works on a fresh checkout without ../processed.
    # exist_ok=True: tolerates a directory left over from an aborted run that
    # started in the same minute.
    path.mkdir(parents=True, exist_ok=True)
    x_train, y_train = make_feat(votes, answers)
    x_test, _ = make_feat(votes_test)
    y_pred, score = train_catboost(x_train, y_train, x_test, subdir)
    # The test features are stacked once per coordinate, so the predictions
    # consist of len(COORDINATES) consecutive chunks of equal size.
    n_coords = len(COORDINATES)
    size = len(x_test) // n_coords
    rez = [y_pred.iloc[size * i: size * (i + 1)] for i in range(n_coords)]
    pd.concat(rez, axis=1).to_csv(path / "_sub_full.csv", header=False)
    path.rename(path.parent / f"{subdir}-{score:0.1f}")

In [10]:
# Run the whole pipeline: build features, cross-validate the model and write
# the results under ../processed. The training log is printed below.
make_forecast()

0:	learn: 523.0218651	test: 561.4404976	best: 561.4404976 (0)	total: 71.7ms	remaining: 1h 59m 31s
5000:	learn: 343.8341335	test: 373.5539964	best: 373.5539964 (5000)	total: 54.9s	remaining: 17m 23s
10000:	learn: 207.4893351	test: 229.7584983	best: 229.7584983 (10000)	total: 1m 52s	remaining: 16m 48s
15000:	learn: 120.2361265	test: 133.7386259	best: 133.7386259 (15000)	total: 2m 48s	remaining: 15m 53s
20000:	learn: 74.7641515	test: 82.2416344	best: 82.2416344 (20000)	total: 3m 45s	remaining: 15m 3s
25000:	learn: 57.0425464	test: 63.4444803	best: 63.4444803 (25000)	total: 4m 43s	remaining: 14m 8s
30000:	learn: 51.5094258	test: 58.5093368	best: 58.5093368 (30000)	total: 5m 40s	remaining: 13m 13s
35000:	learn: 48.8764935	test: 56.2424234	best: 56.2424234 (35000)	total: 6m 37s	remaining: 12m 18s
40000:	learn: 47.1874248	test: 55.1559454	best: 55.1557730 (39998)	total: 7m 34s	remaining: 11m 21s
45000:	learn: 46.0826515	test: 54.6969503	best: 54.6967887 (44998)	total: 8m 32s	remaining: 10m 26