In [2]:
from datetime import datetime, timedelta

import lightgbm as lgb
import numpy as np
import pandas as pd
import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
np.random.seed(4242)

In [4]:
def parse_dt(s):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` string into a naive ``datetime``.

    Raises ``ValueError`` (from ``datetime.strptime``) when ``s`` does not
    match that layout.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(s, timestamp_format)
    return parsed

# Load the cp1251-encoded dataset and replace missing values with 0.
df = pd.read_csv('data/task1/dataset/dataset.csv',  encoding='cp1251')
df = df.fillna(0)
# Parse the timestamp column in one vectorised call instead of a Python-level
# strptime per row; the format is the same strict '%Y-%m-%d %H:%M:%S' layout.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')

In [4]:
# Stop events; the `date` column is parsed by pandas while reading.
# NOTE(review): the rows this notebook previously displayed for this frame
# (2017-11-01, 2017-01-30, 2017-02-24, 2017-07-03, 2017-08-03, ...) are in
# chronological order only when read day-first (11 Jan, 30 Jan, 24 Feb, 7 Mar,
# 8 Mar, ...). The source file may hold day-first dates that get month/day
# swapped whenever the day is <= 12 -- verify against the raw CSV.
stops = pd.read_csv('data/task1/dataset/stops.csv', parse_dates=['date'])
# NOTE(review): later cells read the stop timestamps from `df_y`, which no
# visible cell defines. It is assumed to be this frame so the notebook survives
# Restart & Run All -- confirm.
df_y = stops

In [5]:
# Preview only the first rows rather than rendering the whole frame.
stops.head(10)

Unnamed: 0,date,type
0,2017-11-01 11:03:00,stop
1,2017-01-30 08:51:00,stop
2,2017-02-24 17:45:00,stop
3,2017-07-03 00:16:00,stop
4,2017-08-03 11:23:00,vibrosito
5,2017-08-03 15:56:00,vibrosito
6,2017-10-03 17:55:00,stop
7,2017-03-31 01:42:00,vibrosito
8,2017-01-04 07:45:00,stop
9,2017-03-04 20:48:00,stop


In [24]:
def make_range(df, stop_dt, window=timedelta(hours=1)):
    """Return a boolean mask selecting the rows of ``df`` just before ``stop_dt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'date'`` column comparable to ``stop_dt``.
    stop_dt : datetime-like
        Exclusive upper bound of the window.
    window : datetime.timedelta, optional
        Length of the look-back window. Defaults to one hour.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df``: True where
        ``stop_dt - window <= date < stop_dt``.
    """
    window_start = stop_dt - window
    return (df['date'] >= window_start) & (df['date'] < stop_dt)

# Build `df_no_stops`: the rows of `df` that do not fall inside the window
# before any stop. The mask is accumulated over the unfiltered frame and applied
# once, instead of re-filtering (and copying) the whole frame for every stop.
# The surviving rows are the same either way.
in_any_stop_window = np.zeros(len(df), dtype=bool)
for stop_dt in tqdm.tqdm(df_y['date']):
    in_any_stop_window |= make_range(df, stop_dt).values
df_no_stops = df[~in_any_stop_window]

100%|██████████| 66/66 [02:19<00:00,  2.11s/it]


In [None]:
def make_features(df, periods=(1, 5, 10, 20, 30, 100, 360)):
    """Flatten trailing-window summary statistics of ``df`` into one vector.

    For each ``period`` the last ``period`` rows of ``df`` (all rows when the
    frame is shorter) are summarised per column with mean, min, max, std and
    median, in that order. The ``'date'`` column is excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarise. Only trailing rows are used, so callers presumably
        pass them in chronological order.
    periods : iterable of int, optional
        Trailing window sizes, in rows. Defaults to the original fixed set.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(periods) * 5 * n_value_columns``, laid out
        period by period, statistic by statistic, column by column.

    Notes
    -----
    ``std`` uses the pandas default (sample standard deviation), so a
    single-row window such as ``period=1`` yields NaN for that statistic.
    """
    # Select the value columns once; every window below is a row slice of it.
    values = df[df.columns[df.columns != 'date']]

    blocks = []
    for period in periods:
        window = values.iloc[-period:]
        blocks.append(np.hstack([
            window.mean(axis=0).values,
            window.min(axis=0).values,
            window.max(axis=0).values,
            window.std(axis=0).values,
            window.median(axis=0).values,
        ]))

    return np.hstack(blocks)

def make_false(df_no_stops, n_samples=10000):
    """Build negative (label 0) examples from randomly chosen rows.

    ``n_samples`` row positions are drawn with ``np.random.choice`` (with
    replacement). For each drawn row, the rows of ``df_no_stops`` in the
    ``make_range`` window before that row's timestamp are turned into a feature
    vector with ``make_features``.

    Parameters
    ----------
    df_no_stops : pandas.DataFrame
        Frame with a ``'date'`` column to sample from.
    n_samples : int, optional
        Number of negative examples to draw. Defaults to 10000.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature matrix with one row per sample, and a matching vector of zeros.
    """
    # Read timestamps from the column directly rather than materialising a
    # whole row Series for every sample.
    dates = df_no_stops['date']
    sampled_positions = np.random.choice(df_no_stops.shape[0], n_samples)

    xs = []
    for position in tqdm.tqdm(sampled_positions):
        window = df_no_stops[make_range(df_no_stops, dates.iloc[position])]
        xs.append(make_features(window))

    return np.array(xs), np.zeros(len(xs), dtype=int)


X_false, y_false = make_false(df_no_stops)

 14%|█▎        | 1367/10000 [02:24<15:10,  9.48it/s]

In [11]:
def make_true(df_with_stops, df_y):
    """Build positive (label 1) examples from the window before each stop.

    Parameters
    ----------
    df_with_stops : pandas.DataFrame
        Full frame with a ``'date'`` column.
    df_y : pandas.DataFrame
        Frame whose ``'date'`` column holds the stop timestamps.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature matrix with one row per usable stop, and a matching vector of
        ones.

    Notes
    -----
    Stops whose window contains no rows are skipped, for example a stop that
    lies outside the date range covered by ``df_with_stops``. This replaces the
    earlier positional ``[:-1]`` slice, which silently depended on the
    out-of-range stop being the last row of ``df_y``.
    """
    xs = []
    for stop_dt in df_y['date']:
        window = df_with_stops[make_range(df_with_stops, stop_dt)]
        if window.empty:
            # Nothing recorded before this stop, so no features can be built.
            continue
        xs.append(make_features(window))
    return np.array(xs), np.ones(len(xs), dtype=int)


X_true, y_true = make_true(df, df_y)

In [12]:
# Combine the two classes: negatives first, then positives.
X = np.vstack((X_false, X_true))
# The labels are 1-D, so a plain concatenation is all that is needed.
y = np.concatenate((y_false, y_true))

In [14]:
def make_train_val(X, y, n_splits=3):
    """Yield stratified train/validation splits of ``X`` and ``y``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed by row.
    y : numpy.ndarray
        Label vector aligned with ``X``.
    n_splits : int, optional
        Number of stratified folds. Defaults to 3.

    Yields
    ------
    tuple
        ``(X_train, y_train, X_test, y_test)`` for each fold.
    """
    skf = StratifiedKFold(n_splits=n_splits)
    for train_index, test_index in skf.split(X, y):
        yield X[train_index], y[train_index], X[test_index], y[test_index]

In [25]:
# Cross-validated LightGBM training: one F1 score per stratified fold.
# The hyper-parameters do not change between folds, so they are defined once.
MAX_ROUNDS = 1000
params = {
    'num_leaves': 2**5 - 1,
    'objective': 'binary',
    'max_depth': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'metric': 'binary',
    'num_threads': 4
}

f_scores = []
for X_train, y_train, X_val, y_val in make_train_val(X, y):
    dtrain = lgb.Dataset(
        X_train, label=y_train,
        categorical_feature=[],
    )
    dval = lgb.Dataset(
        X_val, label=y_val, reference=dtrain,
        categorical_feature=[])
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed from
    # lgb.train() in LightGBM 4.0 in favour of the lgb.early_stopping and
    # lgb.log_evaluation callbacks; switch when the installed version needs it.
    # Each fold gets its own copy of the parameter dict.
    bst = lgb.train(
        dict(params), dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=100, verbose_eval=50
    )

    # NOTE(review): the fold scored here is also the one that drives early
    # stopping, so the reported F1 is likely optimistic.
    probabilities = bst.predict(X_val, num_iteration=bst.best_iteration or MAX_ROUNDS)
    # Threshold at 0.5. Integer labels are passed straight to f1_score; the
    # former `.astype(np.bool)` cast raises AttributeError on NumPy >= 1.24,
    # where that alias was removed.
    predicted = (probabilities > 0.5).astype(np.int32)
    f_scores.append(f1_score(y_val, predicted))

In [None]:
# Report the mean of the per-fold F1 scores collected in `f_scores`.
print('Val F1: ', np.mean(f_scores))