In [1]:
import pandas as pd
import numpy as np
import tqdm
from scipy import signal
from scipy import stats
import catboost


TEST_SIZE = 150000

In [2]:
def load_data():
    df = pd.read_csv(
            "../input/train.csv",
            names=["x", "y"],
            skiprows=1,
            dtype={"x": "float32", "y": "float32"}
        )
    return df

BIG_FRAME = load_data()

In [3]:
def make_features(df_x):
    feat = dict()
    mean_abs = (df_x - df_x.mean()).abs()
    feat["mean_abs_med"] = mean_abs.median()

    roll_std = df_x.rolling(375).std().dropna()
    feat["std_roll_med_375"] = roll_std.median()

    half = len(roll_std) // 2
    feat["std_roll_half1"] = roll_std.iloc[:half].median()
    feat["std_roll_half2"] = roll_std.iloc[-half:].median()

    welch = signal.welch(df_x)[1]
    for num in [2, 3, 28, 30]:
        feat[f"welch_{num}"] = welch[num]

    feat["ave10"] = stats.trim_mean(df_x, 0.1)

    feat["q05_roll_std_25"] = df_x.rolling(25).std().dropna().quantile(0.05)
    feat["q05_roll_std_375"] = df_x.rolling(375).std().dropna().quantile(0.05)
    feat["q05_roll_std_1500"] = df_x.rolling(1500).std().dropna().quantile(0.05)
    feat["q05_roll_std_1000"] = df_x.rolling(1000).std().dropna().quantile(0.05)
    feat["q01_roll_mean_1500"] = df_x.rolling(1500).mean().dropna().quantile(0.01)
    feat["q99_roll_mean_1500"] = df_x.rolling(1500).mean().dropna().quantile(0.99)

    return feat

In [4]:
def make_train_set():
    df = BIG_FRAME
    data = []
    df_x = df.x
    df_y = df.y
    for loc_end in tqdm.tqdm_notebook(df.index[TEST_SIZE::TEST_SIZE]):
        first_time = df_y.iloc[loc_end - TEST_SIZE]
        last_time = df_y.iloc[loc_end]
        if first_time < last_time:
            continue
        feat = make_features(df_x.iloc[loc_end - TEST_SIZE:loc_end])
        feat["y"] = last_time
        data.append(feat)
    data = pd.DataFrame(data).sort_index(axis=1)
    data = data.sort_values("y").reset_index(drop=True)
    return data.drop(["y"], axis=1), data.y

TRAIN_SET = make_train_set()

HBox(children=(IntProgress(value=0, max=4194), HTML(value='')))

  return np.mean(atmp[sl], axis=axis)





In [5]:
def make_test_set():
    data = []
    seg_id = pd.read_csv(
        "../input/sample_submission.csv"
    ).seg_id
    for name in tqdm.tqdm_notebook(seg_id):
        df = pd.read_csv(
            f"../input/test/{name}.csv",
            names=["x"],
            skiprows=1
        )
        feat = make_features(df.x)
        data.append(feat)
    data = pd.DataFrame(data).sort_index(axis=1)
    data.index = seg_id
    return data

TEST_SET = make_test_set()

HBox(children=(IntProgress(value=0, max=2624), HTML(value='')))

  return np.mean(atmp[sl], axis=axis)





In [6]:
SEED = 284702
FOLDS = 13
ITERATIONS = 8000
LEARNING_RATE = 0.03

CLF_PARAMS = dict(
    loss_function="MAE",
    eval_metric=None,
    random_state=SEED,
    depth=6,
    od_type="Iter",
    od_wait=ITERATIONS // 10,
    verbose=ITERATIONS // 20,
    learning_rate=LEARNING_RATE,
    iterations=ITERATIONS,
    allow_writing_files=False,
    task_type="GPU"
)

In [7]:
def train_catboost():
    x_train, y_train = TRAIN_SET
    x_test = TEST_SET
    pool_test = catboost.Pool(
            data=x_test,
            label=None,
            cat_features=None,
            weight=None
        )
    y_oof = pd.Series(0, index=x_train.index, name="oof_y")
    y_pred = pd.Series(0, index=x_test.index, name="time_to_failure")
    trees = []
    scores = []

    for fold in range(FOLDS):
        index_valid = x_train.index[fold::FOLDS]
        index_train = x_train.index.difference(index_valid)
        pool_train = catboost.Pool(
            data=x_train.loc[index_train],
            label=y_train.loc[index_train],
            cat_features=None,
            weight=None
        )
        pool_valid = catboost.Pool(
            data=x_train.loc[index_valid],
            label=y_train.loc[index_valid],
            cat_features=None,
            weight=None
        )
        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        scores.append(clf.best_score_['validation_0']['MAE'])
        y_oof.loc[index_valid] = clf.predict(pool_valid)
        y_pred += clf.predict(pool_test) / FOLDS

    print(f"Количество деревьев: {sorted(trees)}")
    print(f"Среднее количество деревьев: {np.mean(trees):.0f} +/- {np.std(trees):.0f}")
    print(f"MAE на кроссвалидации: " + str(np.round(sorted(scores), 5)))
    print(f"MAE среднее: {np.mean(scores):0.3f} +/- {np.std(scores):0.3f}")

    y_oof.to_csv(f"oof_mae-{np.mean(scores):0.3f}.csv", header=True)
    y_pred.to_csv(f"sub_mae-{np.mean(scores):0.3f}.csv", header=True)

In [8]:
train_catboost()

0:	learn: 5.6487824	test: 5.6454809	best: 5.6454809 (0)	total: 13.3ms	remaining: 1m 46s
400:	learn: 2.4930412	test: 2.5477594	best: 2.5477594 (400)	total: 4.09s	remaining: 1m 17s
800:	learn: 1.9274369	test: 2.0483273	best: 2.0483273 (800)	total: 8.27s	remaining: 1m 14s
1200:	learn: 1.8633500	test: 2.0047192	best: 2.0047192 (1200)	total: 12.4s	remaining: 1m 10s
1600:	learn: 1.8234658	test: 1.9878798	best: 1.9878798 (1600)	total: 16.5s	remaining: 1m 5s
2000:	learn: 1.7919087	test: 1.9741865	best: 1.9741844 (1999)	total: 20.6s	remaining: 1m 1s
2400:	learn: 1.7644120	test: 1.9646265	best: 1.9646091 (2399)	total: 24.8s	remaining: 57.7s
2800:	learn: 1.7412929	test: 1.9595725	best: 1.9593821 (2797)	total: 28.9s	remaining: 53.6s
3200:	learn: 1.7188725	test: 1.9557180	best: 1.9556305 (3198)	total: 33s	remaining: 49.5s
3600:	learn: 1.6996338	test: 1.9526187	best: 1.9526187 (3600)	total: 37.1s	remaining: 45.4s
4000:	learn: 1.6805677	test: 1.9498862	best: 1.9497103 (3963)	total: 41.3s	remaining: 4