In [26]:
import pandas as pd
import lightgbm as lgb
from datetime import datetime, timedelta
import tqdm
import numpy as np
from sklearn.metrics import f1_score
np.random.seed(4242)

In [27]:
import sys
sys.executable

'/home/root_shannon/anaconda3/bin/python3'

In [28]:
def parse_dt(s):
    return datetime.strptime(s, '%Y-%m-%d %H:%M:%S')

df = pd.read_csv('data/task1/dataset.csv',  encoding='cp1251')
df = df.fillna(0)
df['date'] = df['date'].apply(parse_dt)

In [29]:
df_y = pd.read_csv('data/task1/stops.csv')
df_y.date = pd.to_datetime(df_y.date, dayfirst=True)

In [30]:
df_y

Unnamed: 0,date,type
0,2017-01-11 11:03:00,stop
1,2017-01-30 08:51:00,stop
2,2017-02-24 17:45:00,stop
3,2017-03-07 00:16:00,stop
4,2017-03-08 11:23:00,vibrosito
5,2017-03-08 15:56:00,vibrosito
6,2017-03-10 17:55:00,stop
7,2017-03-31 01:42:00,vibrosito
8,2017-04-01 07:45:00,stop
9,2017-04-03 20:48:00,stop


In [31]:
def make_range(df, stop_dt):
    return (df['date'] >= (stop_dt - timedelta(hours=1))) & (df['date'] < stop_dt)

df_no_stops = df.copy()
for stop_dt in tqdm.tqdm(df_y['date']):
    df_no_stops = df_no_stops[~make_range(df_no_stops, stop_dt)]

100%|██████████| 66/66 [01:22<00:00,  1.24s/it]


In [32]:
def make_features(df):
    vecs = []
    columns = df.columns[df.columns != 'date']
    for period in [1, 5, 10, 20, 30, 100, 360]:
        mean = df[-period:][columns].mean(axis=0).values
        min = df[-period:][columns].min(axis=0).values
        max = df[-period:][columns].max(axis=0).values
        std = df[-period:][columns].std(axis=0).values
        median = df[-period:][columns].median(axis=0).values
        feats = np.hstack([mean, min, max, std, median])
        vecs.append(feats)
    
    return np.hstack(vecs)

def make_false(df_no_stops):
    random_work = np.random.choice(df_no_stops.shape[0], 10000)
    xs = []
    ys = []
    
    for work in tqdm.tqdm(random_work):
        rng = df_no_stops[make_range(df_no_stops, df_no_stops.iloc[work]['date'])]
        feature_vector = make_features(rng)
        xs.append(feature_vector)
        ys.append(0)
    
    return np.array(xs), np.array(ys)

X_false, y_false = make_false(df_no_stops)

100%|██████████| 10000/10000 [13:45<00:00, 12.11it/s]


In [42]:
def make_true(df_with_stops, df_y):
    xs = []
    ys = []    
    for stop_dt in df_y['date'][:-1]: # the last date is out of df range
        rng = df_with_stops[make_range(df_with_stops, stop_dt)]
        feature_vector = make_features(rng)
        print(feature_vector)
        print(best_model.predict([feature_vector]))
        xs.append(feature_vector)
        ys.append(1)
    return np.array(xs), np.array(ys)

X_true, y_true = make_true(df, df_y)

[5.09999990e+00 1.04699993e+01 1.02399998e+01 ... 5.25000000e+03
 3.26700012e+02 4.30392575e+00]
[0.65829995]
[5.00000000e+00 9.34999943e+00 9.31999969e+00 ... 5.43000000e+03
 3.43000000e+02 3.33658838e+00]
[0.57164863]
[5.00000000e+00 9.01000023e+00 8.98999977e+00 ... 5.60000000e+03
 3.34200012e+02 3.40390372e+00]
[0.63657334]
[5.00000000e+00 6.87999964e+00 7.03999996e+00 ... 5.53000000e+03
 3.32100006e+02 3.87335348e+00]
[0.84096998]
[5.00000000e+00 9.68999958e+00 9.72999954e+00 ... 5.58000000e+03
 3.33000000e+02 3.84533906e+00]
[0.84660551]
[4.90000010e+00 8.72000027e+00 8.81999969e+00 ... 5.54000000e+03
 3.30000000e+02 3.90544033e+00]
[0.84181218]
[5.00000000e+00 9.23999977e+00 9.30000019e+00 ... 5.53000000e+03
 3.34800018e+02 3.89195299e+00]
[0.80626408]
[4.90000010e+00 8.34999943e+00 8.53999996e+00 ... 5.64000000e+03
 3.35399994e+02 4.37435389e+00]
[0.81102615]
[4.90000010e+00 7.39999962e+00 7.62999964e+00 ... 5.56000000e+03
 3.34500000e+02 4.79094267e+00]
[0.60777903]
[5.0000000

In [34]:
X = np.vstack([X_false, X_true])
y = np.hstack([y_false, y_true]).T

In [35]:
def make_train_val(X, y):
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=3)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        yield X_train, y_train, X_test, y_test

In [36]:
f_scores = []
best_model = None
best_score = None
for X_train, y_train, X_val, y_val in make_train_val(X, y):
    MAX_ROUNDS = 1000
    params = {
        'num_leaves': 2**5 - 1,
        'objective': 'binary',
        'max_depth': 8,
        'min_data_in_leaf': 50,
        'learning_rate': 0.05,
        'feature_fraction': 0.75,
        'bagging_fraction': 0.75,
        'bagging_freq': 1,
        'metric': 'binary',
        'num_threads': 4
    }

    dtrain = lgb.Dataset(
        X_train, label=y_train,
        categorical_feature=[],
    )
    dval = lgb.Dataset(
        X_val, label=y_val, reference=dtrain,
        categorical_feature=[])
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=100, verbose_eval=50
    )
    
    predicted = bst.predict(X_val, num_iteration=bst.best_iteration or MAX_ROUNDS)
    predicted = (predicted > 0.5).astype(np.int32)
    score = f1_score(y_val, predicted.astype(np.bool))
    f_scores.append(score)
    if not best_score or score > score:
        best_score = score
        best_model = bst
        



Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.0528611	valid_1's binary_logloss: 0.0617174
[100]	training's binary_logloss: 0.00823029	valid_1's binary_logloss: 0.0328169
[150]	training's binary_logloss: 0.00154434	valid_1's binary_logloss: 0.0399379
Early stopping, best iteration is:
[99]	training's binary_logloss: 0.0085145	valid_1's binary_logloss: 0.0327629
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.0554304	valid_1's binary_logloss: 0.059394
[100]	training's binary_logloss: 0.00967979	valid_1's binary_logloss: 0.0229007
[150]	training's binary_logloss: 0.00188041	valid_1's binary_logloss: 0.0221189
[200]	training's binary_logloss: 0.000508972	valid_1's binary_logloss: 0.0274896
Early stopping, best iteration is:
[123]	training's binary_logloss: 0.00445163	valid_1's binary_logloss: 0.0209989
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logl

In [37]:
print('Val F1: ', np.mean(f_scores))

Val F1:  0.05808080808080809


In [38]:
from sklearn.externals import joblib
joblib.dump(best_model, 'model.pkl')

['model.pkl']