In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import time
import catboost
from sklearn import model_selection
import pathlib

# Names of the four box-coordinate columns. They are used as the aggregated
# feature columns, as the column names assigned to the answers frame, and as
# the list of targets iterated over when stacking features.
COORDINATES = ["Xmin", "Ymin", "Xmax", "Ymax"]

In [2]:
def area(box):
    """Return the signed area of a box.

    ``box`` is any indexable whose first four entries are read as
    (x_min, y_min, x_max, y_max). The result is negative if exactly one of
    the sides is inverted.
    """
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height


def intersection_over_union(boxes):
    """Compute intersection-over-union for two boxes packed into one row.

    ``boxes`` must hold exactly eight values and support positional slicing
    plus ``.values`` (e.g. a pandas Series): the first four entries are one
    box, the last four the other, each as (x_min, y_min, x_max, y_max).

    Returns 0 when either box has zero area, otherwise the overlap area
    divided by the area of the union.
    """
    assert len(boxes) == 8
    first, second = boxes[:4].values, boxes[4:].values

    area_first = area(first)
    area_second = area(second)

    # A degenerate box cannot overlap anything; this also avoids a 0/0 ratio.
    if area_first == 0 or area_second == 0:
        return 0

    # Corners of the overlapping rectangle.
    x_lo = max(first[0], second[0])
    y_lo = max(first[1], second[1])
    x_hi = min(first[2], second[2])
    y_hi = min(first[3], second[3])

    # Clamp each side at zero so disjoint boxes yield an empty overlap.
    overlap_width = max(0, x_hi - x_lo)
    overlap_height = max(0, y_hi - y_lo)
    overlap = overlap_width * overlap_height

    union = float(area_first + area_second - overlap)
    return overlap / union

In [3]:
# Training votes, indexed by itemId. make_feat reads the "userId" column and
# the COORDINATES columns from this frame.
votes = pd.read_csv("../raw/train_data.csv").set_index("itemId")

In [4]:
# Training targets, indexed by itemId.
answers = pd.read_csv("../raw/train_answers.csv").set_index("itemId")
# Rename the value columns positionally to the COORDINATES names. This requires
# exactly four columns; it presumably also relies on the file listing them in
# Xmin, Ymin, Xmax, Ymax order - TODO confirm against the raw file.
answers.columns = COORDINATES

In [5]:
# Test-set votes, indexed by itemId; features are built from them without labels.
votes_test = pd.read_csv("../raw/test_data.csv").set_index("itemId")

In [6]:
def make_feat(df, df_y=None):
    """Build per-item vote statistics, stacked once per target coordinate.

    For every itemId the votes in ``df`` are aggregated into min, max, std,
    median and mean of each COORDINATES column, plus the number of non-null
    ``userId`` entries. That table is then repeated once per name in
    COORDINATES, with a trailing ``name`` column recording which coordinate
    the copy stands for, and the copies are concatenated row-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Votes indexed by ``itemId`` with a ``userId`` column and the
        COORDINATES columns.
    df_y : pandas.DataFrame, optional
        True coordinates indexed by ``itemId`` with the COORDINATES columns.

    Returns
    -------
    tuple
        ``(features, targets)``. ``features`` is the stacked feature frame.
        ``targets`` is the stacked Series of labels in the same row order as
        ``features``, or an empty list when ``df_y`` is None.

    Raises
    ------
    KeyError
        If ``df_y`` has no row for an item that has votes in ``df``.
    """
    counts = df[["userId"]].groupby("itemId").count()
    feat = df.groupby("itemId")[COORDINATES].agg(["min", "max", "std", "median", "mean"])
    feat["count"] = counts

    dfs = []
    dfs_y = []
    for name in COORDINATES:
        feat_copy = feat.copy(deep=True)
        # Added last so that it ends up as the final column of the frame.
        feat_copy["name"] = name
        dfs.append(feat_copy)
        if df_y is not None:
            # Features follow the (sorted) groupby index, while df_y keeps its
            # file order. Training pairs rows by position, so pick the labels
            # in the feature index order instead of trusting both to match.
            dfs_y.append(df_y.loc[feat.index, name])

    dfs = pd.concat(dfs, axis=0)
    if df_y is not None:
        dfs_y = pd.concat(dfs_y, axis=0)
    return dfs, dfs_y

In [11]:
# Seed shared by the KFold shuffle and the CatBoost models.
SEED = 284702
# Number of cross-validation folds; test predictions are averaged over them.
FOLDS = 5
# Upper bound on boosting iterations per fold; the overfitting detector
# configured below can stop training earlier.
ITERATIONS = 4000
LEARNING_RATE = 0.03

# Keyword arguments passed to catboost.CatBoostRegressor for every fold.
CLF_PARAMS = dict(
    loss_function="RMSE",
    eval_metric=None,  # presumably falls back to the loss function - TODO confirm
    random_state=SEED,
    depth=6,
    od_type="Iter",
    od_wait=ITERATIONS // 10,  # overfitting detector waits 400 iterations
    verbose=ITERATIONS // 20,  # one log line every 200 iterations
    learning_rate=LEARNING_RATE,
    iterations=ITERATIONS,
    allow_writing_files=False
)

In [8]:
def train_catboost(x_train, y_train, x_test, subdir):
    """Cross-validate a CatBoost regressor and save OOF / test predictions.

    A fresh ``CatBoostRegressor(**CLF_PARAMS)`` is fitted on each of FOLDS
    shuffled K-fold splits, using the held-out part as the evaluation set.
    Out-of-fold predictions are collected for the training rows and the test
    predictions of all folds are averaged.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames whose last column is the only categorical feature.
    y_train : pandas.Series
        Targets in the same row order as ``x_train`` (rows are paired by
        position, not by index label).
    subdir : str
        Name of an existing directory under ``../processed`` that receives
        ``oof.csv`` and ``sub.csv``.

    Returns
    -------
    tuple
        ``(y_pred, mean_score)``: the fold-averaged test predictions and the
        mean of the per-fold best validation RMSE.
    """
    cat_features = [len(x_train.columns) - 1]

    def make_pool(data, label=None):
        # Single place where the Pool options shared by all splits are set.
        return catboost.Pool(
            data=data,
            label=label,
            cat_features=cat_features,
            weight=None
        )

    pool_test = make_pool(x_test)
    # Start from float zeros: predictions are floats, and writing them into an
    # integer Series relies on implicit upcasting that pandas has deprecated.
    y_oof = pd.Series(0.0, index=x_train.index, name="oof")
    y_pred = pd.Series(0.0, index=x_test.index, name="y_pred")
    trees = []
    scores = []

    folds = model_selection.KFold(FOLDS, shuffle=True, random_state=SEED)
    for index_train, index_valid in folds.split(x_train):
        pool_train = make_pool(x_train.iloc[index_train], y_train.iloc[index_train])
        pool_valid = make_pool(x_train.iloc[index_valid], y_train.iloc[index_valid])

        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        scores.append(clf.best_score_['validation']['RMSE'])
        y_oof.iloc[index_valid] = clf.predict(pool_valid)
        # Each fold contributes an equal share of the final test prediction.
        y_pred += clf.predict(pool_test) / FOLDS

    print(f"Количество деревьев: {sorted(trees)}")
    print(f"Среднее количество деревьев: {np.mean(trees):.0f} +/- {np.std(trees):.0f}")
    print("RMSE на кроссвалидации: " + str(np.round(sorted(scores), 5)))
    print(f"RMSE среднее: {np.mean(scores):0.3f} +/- {np.std(scores):0.2f}")

    y_oof.to_csv(f"../processed/{subdir}/oof.csv", header=True)
    y_pred.to_csv(f"../processed/{subdir}/sub.csv", header=True)
    return y_pred, np.mean(scores)

In [9]:
def make_forecast():
    """Run the whole pipeline and write the submission files.

    Creates ``../processed/<timestamp>``, builds train and test features from
    the module-level ``votes``, ``answers`` and ``votes_test`` frames, trains
    the cross-validated model, and writes ``_sub_full.csv`` with one row per
    item and one column per coordinate. The output directory is finally
    renamed to ``<timestamp>-<score>``, with the mean CV score at one decimal.
    """
    subdir = time.strftime('%Y-%m-%d_%H-%M')
    path = pathlib.Path(f"../processed/{subdir}")
    # Tolerate a missing "processed" folder and a directory left behind by a
    # run that failed before the final rename within the same minute.
    path.mkdir(parents=True, exist_ok=True)

    x_train, y_train = make_feat(votes, answers)
    x_test, _ = make_feat(votes_test)
    y_pred, score = train_catboost(x_train, y_train, x_test, subdir)

    # The test rows are stacked in equal-sized chunks, one per coordinate;
    # slice them apart and lay the chunks side by side.
    n_targets = len(COORDINATES)
    size = len(x_test) // n_targets
    rez = [y_pred.iloc[size * i: size * (i + 1)] for i in range(n_targets)]
    pd.concat(rez, axis=1).to_csv(path / "_sub_full.csv", header=False)

    path.rename(path.parent / f"{subdir}-{score:0.1f}")

In [12]:
# Run the full pipeline: build features, cross-validate, write the predictions.
make_forecast()

0:	learn: 634.7915679	test: 667.9913942	best: 667.9913942 (0)	total: 9.21ms	remaining: 36.8s
200:	learn: 83.0897683	test: 84.4844400	best: 84.4844400 (200)	total: 3s	remaining: 56.6s
400:	learn: 78.9055187	test: 82.0023810	best: 81.9940437 (396)	total: 4.98s	remaining: 44.7s
600:	learn: 77.0290439	test: 81.2278620	best: 81.2266558 (599)	total: 6.79s	remaining: 38.4s
800:	learn: 75.4022043	test: 80.9274766	best: 80.8951751 (782)	total: 8.44s	remaining: 33.7s
1000:	learn: 74.5857814	test: 80.6427810	best: 80.6427779 (998)	total: 9.71s	remaining: 29.1s
1200:	learn: 73.8809828	test: 80.4827608	best: 80.4678969 (1190)	total: 11.1s	remaining: 25.9s
1400:	learn: 72.8718035	test: 80.2986125	best: 80.2911276 (1394)	total: 12.8s	remaining: 23.7s
1600:	learn: 72.1810662	test: 80.2737008	best: 80.2677634 (1599)	total: 14.5s	remaining: 21.7s
1800:	learn: 71.6037481	test: 80.3632608	best: 80.2677634 (1599)	total: 16.2s	remaining: 19.8s
Stopped by overfitting detector  (400 iterations wait)

bestTest