# Handcrafted models
This notebook explores how far you can get with a simple model using good features.
Features are handpicked based on feature_engineering_v2 and features_tsfresh.

In [1]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

## Load data with selected features

In [2]:
# Candidate feature names for the long-timescale setting.
# NOTE(review): the numeric suffixes (_12, _360, _1000) presumably denote rolling-window
# lengths, and the ordering looks like a ranking (the first entries are the ones annotated
# as "best auroc for long timescale" in `my_selection`) — confirm against the
# feature-engineering notebooks. Not referenced by any other cell in the visible notebook.
features_long_timescale = [
    'abs_diff_clip_anglez_skew_1000', 'abs_diff_clip_anglez_mean_1000', 'abs_diff_clip_anglez_kurt_1000', 'abs_diff_anglez_kurt_1000', 'abs_diff_clip_anglez_skew_360', 'abs_diff_clip_anglez_median_360', 'enmo_mean_1000', 'enmo_std_360', 'abs_diff_clip_anglez_std_360', 'enmo_mean_360', 'enmo_max_1000', 'hour_stat', 'abs_diff_clip_anglez_median_12', 'abs_diff_clip_anglez_min_12', 'abs_diff_anglez_skew_360', 'enmo_std_12', 'abs_diff_anglez_std_360', 'abs_diff_clip_anglez', 'abs_diff_anglez', 'enmo_median_1000', 'sin_anglez_std_12', 'abs_diff_clip_anglez_max_12', 'abs_diff_anglez_max_12', 'enmo_mean_12', 'abs_diff_clip_anglez_min_360', 'enmo', 'hour', 'abs_diff_clip_anglez_skew_12', 'abs_diff_clip_anglez_min_1000', 'enmo_skew_12', 'sin_anglez_min_360', 'enmo_min_12', 'sin_anglez_min_1000', 'sin_anglez_std_1000', 'abs_diff_anglez_max_360', 'enmo_kurt_360', 'abs_diff_anglez_kurt_12', 'sin_anglez_min_12', 'enmo_skew_360', 'enmo_kurt_1000', 'sin_anglez_max_12', 'sin_anglez_max_360', 'abs_diff_anglez_skew_12', 'abs_diff_clip_anglez_max_360', 'abs_diff_anglez_max_1000', 'sin_anglez_std_360', 'sin_anglez_skew_12', 'sin_anglez_median_1000', 'enmo_kurt_12', 'sin_anglez_median_360', 'sin_anglez_max_1000', 'sin_anglez_mean_12', 'abs_diff_clip_anglez_max_1000', 'abs_diff_anglez_kurt_360', 'sin_anglez_skew_1000', 'minute', 'sin_anglez_kurt_360', 'enmo_min_1000', 'enmo_min_360', 'sin_anglez_skew_360', 'sin_anglez_kurt_1000', 'abs_diff_clip_anglez_kurt_12'
] 

# Candidate feature names for the short-timescale setting (same naming scheme as above).
# Not referenced by any other cell in the visible notebook.
features_short_timescale = [
    'abs_diff_clip_anglez_mean_12', 'abs_diff_anglez_max_12', 'abs_diff_clip_anglez_max_12', 'enmo_std_12', 'abs_diff_clip_anglez_median_360', 'abs_diff_anglez_median_12', 'abs_diff_clip_anglez_mean_360', 'abs_diff_clip_anglez_skew_360', 'abs_diff_anglez', 'abs_diff_clip_anglez_kurt_360', 'enmo_mean_12', 'abs_diff_anglez_median_1000', 'abs_diff_clip_anglez_min_12', 'abs_diff_anglez_kurt_360', 'enmo', 'abs_diff_anglez_std_360', 'enmo_mean_360', 'enmo_skew_12', 'enmo_median_360', 'abs_diff_clip_anglez_mean_1000', 'enmo_std_360', 'sin_anglez_kurt_360', 'abs_diff_clip_anglez_skew_1000', 'abs_diff_clip_anglez_kurt_1000', 'enmo_skew_360', 'abs_diff_anglez_kurt_12', 'sin_anglez_min_12', 'enmo_max_360', 'abs_diff_anglez_kurt_1000', 'abs_diff_clip_anglez_skew_12', 'enmo_median_1000', 'sin_anglez_std_360', 'sin_anglez_max_12', 'abs_diff_anglez_std_1000', 'enmo_mean_1000', 'abs_diff_anglez_max_360', 'enmo_min_12', 'sin_anglez_min_360', 'sin_anglez_max_360', 'enmo_std_1000', 'enmo_max_1000', 'sin_anglez_kurt_1000', 'hour_stat', 'enmo_skew_1000', 'enmo_min_360', 'sin_anglez', 'hour', 'sin_anglez_skew_12', 'sin_anglez_min_1000', 'sin_anglez_max_1000', 'abs_diff_clip_anglez_kurt_12', 'sin_anglez_median_360', 'abs_diff_anglez_max_1000', 'sin_anglez_std_1000', 'sin_anglez_median_1000', 'abs_diff_clip_anglez_max_360', 'enmo_min_1000', 'abs_diff_anglez_min_1000', 'abs_diff_anglez_min_360', 'sin_anglez_skew_1000', 'minute', 'sin_anglez_skew_360', 'enmo_kurt_12'
]

In [3]:
# Identifier and label columns that are loaded alongside the features.
info_columns = ['series_id', 'target']

# Hand-picked features, grouped by the reason each one was chosen.
long_timescale_picks = [
    'abs_diff_clip_anglez_skew_1000',  # best auroc for long timescale
    'abs_diff_clip_anglez_mean_1000',
    'enmo_mean_1000',
]
short_timescale_picks = [
    'abs_diff_clip_anglez_mean_12',  # best auroc for short timescale
    'abs_diff_clip_anglez_median_360',  # most important feature for short timescale
    'sin_anglez_median_360',  # 2nd most important feature for short timescale
    'abs_diff_anglez_max_12',
    'enmo_std_12',
]
other_picks = ['hour_stat', 'minute']

# Order matters downstream: it fixes the column order of the loaded frame.
my_selection = long_timescale_picks + short_timescale_picks + other_picks

In [4]:
# Read only the id/label columns plus the hand-picked features from the parquet file.
columns_to_load = info_columns + my_selection
data = pd.read_parquet('../../data/processed/Zzzs_train_features.parquet', columns=columns_to_load)

In [5]:
# Downcast every float64 column to float32, one column at a time so that only a single
# column is duplicated while it is being converted.
cols_64 = data.select_dtypes(include='float64').columns
for column_name in cols_64:
    downcast_values = data[column_name].astype('float32')
    data[column_name] = downcast_values
# Last expression of the cell: its return value is displayed as the cell output.
gc.collect()

630

In [6]:
# Preview the first 10 rows to sanity-check the loaded columns.
data.head(10)

Unnamed: 0,series_id,target,abs_diff_clip_anglez_skew_1000,abs_diff_clip_anglez_mean_1000,enmo_mean_1000,abs_diff_clip_anglez_mean_12,abs_diff_clip_anglez_median_360,sin_anglez_median_360,abs_diff_anglez_max_12,enmo_std_12,hour_stat,minute
0,08db4255286f,1,1.360219,1.858507,0.075354,1.387191,1.451798,-0.463476,4.412199,0.019242,0.959695,0
1,08db4255286f,1,1.360219,1.858507,0.075354,1.387191,1.451798,-0.463476,4.412199,0.019242,0.959695,0
2,08db4255286f,1,1.360219,1.858507,0.075354,1.387191,1.451798,-0.463476,4.412199,0.019242,0.959695,0
3,08db4255286f,1,1.360219,1.858507,0.075354,1.387191,1.451798,-0.463476,4.412199,0.019242,0.959695,0
4,08db4255286f,1,1.360219,1.858507,0.075354,1.387191,1.451798,-0.463476,4.412199,0.019242,0.959695,0
5,08db4255286f,1,1.360219,1.858507,0.075354,1.387191,1.451798,-0.463476,4.412199,0.019242,0.959695,0
6,08db4255286f,1,1.360219,1.858507,0.075354,1.387191,1.451798,-0.463476,4.412199,0.019242,0.959695,0
7,08db4255286f,1,1.360219,1.858507,0.075354,1.573966,1.451798,-0.463476,4.412199,0.018462,0.959695,0
8,08db4255286f,1,1.360219,1.858507,0.075354,1.361925,1.451798,-0.463476,4.412199,0.017542,0.959695,0
9,08db4255286f,1,1.360219,1.858507,0.075354,1.451058,1.451798,-0.463476,4.412199,0.017658,0.959695,0


## Transform features with a logistic regression prediction

In [7]:
# Replace each selected feature with the positive-class probability of a logistic
# regression fitted on that single feature. The replacement is kept only when the
# probabilities separate the target well enough (AUROC above AUROC_THRESHOLD);
# the names of the replaced columns are collected in `transformed_features`.
#
# NOTE(review): each model is fitted and scored on the same rows, so the printed AUROC
# is an in-sample figure, and the fit also sees the rows that the later train/test split
# holds out.
# NOTE(review): `data[feature]` is overwritten in place, so re-running this cell fits on
# already-transformed values; reload `data` before re-running.
AUROC_THRESHOLD = 0.7  # minimum AUROC for a feature to be replaced by its prediction

y = data.target  # the label does not change between iterations

transformed_features = []
for feature in tqdm(my_selection):
    x = data[[feature]]  # double brackets: the estimator expects a 2-D input

    clf = LogisticRegression(random_state=0)
    clf.fit(x, y)

    preds = clf.predict_proba(x)[:, 1]  # column 1 = probability of the positive class
    auroc = roc_auc_score(y, preds)
    print(f'{feature}: {auroc}')
    if auroc > AUROC_THRESHOLD:
        # Store as float32 so the column stays consistent with the earlier float32
        # downcast of the frame, in case the predictions come back as float64.
        data[feature] = preds.astype('float32')
        transformed_features.append(feature)
    gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

abs_diff_clip_anglez_skew_1000: 0.9897042305979566
abs_diff_clip_anglez_mean_1000: 0.9897906749624402
enmo_mean_1000: 0.9683874572226683
abs_diff_clip_anglez_mean_12: 0.9535630054410761
abs_diff_clip_anglez_median_360: 0.9784204796734574
sin_anglez_median_360: 0.5428605793292275
abs_diff_anglez_max_12: 0.9131536866568979
enmo_std_12: 0.9310667914137047
hour_stat: 0.9538928677542042
minute: 0.5000164907065726


## Plot the transformed features and the target

In [33]:
# Interactive plotly line chart of the transformed features together with the target.
display_days = 5
subsample = 10
# 17280 rows are taken per displayed day.
# NOTE(review): presumably 86400 s / 5 s per sample — confirm the sampling interval.
rows_to_show = display_days * 17280
display_set = data[:rows_to_show:subsample]
plotted_columns = transformed_features + ['target']
fig = px.line(display_set, x=range(len(display_set)), y=plotted_columns)
fig.show()

## Train a model on the selected features (transformed where AUROC > 0.7)

In [9]:
# Split by series so that all rows of one series end up on the same side:
# the leading (1 - test_size) share of the unique series ids goes to train, the rest to test.
test_size = 0.2
sids = data.series_id.unique()
n_train_series = int(len(sids) * (1 - test_size))
train_sids = sids[:n_train_series]
train_mask = data.series_id.isin(train_sids)

# NOTE(review): X uses every column in `my_selection`, including the ones that were
# not replaced by a logistic-regression prediction.
X = data[my_selection]
y = data.target

X_train, X_test = X[train_mask], X[~train_mask]
y_train, y_test = y[train_mask], y[~train_mask]

In [22]:
# Force a garbage-collection pass; the value it returns is shown as the cell output.
gc.collect()

0

In [24]:
# Fit a CatBoost classifier with AUC-based early stopping, then score it on the test rows.
# NOTE(review): the eval set that drives early stopping is the same set the final AUROC
# is reported on.
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

catboost_params = dict(
    iterations=1000,
    eval_metric='AUC',
    early_stopping_rounds=5,
    verbose=True,
)
clf = CatBoostClassifier(**catboost_params)
clf.fit(X_train, y_train, eval_set=(X_test, y_test))

test_proba = clf.predict_proba(X_test)
preds = test_proba[:, 1]  # column 1 = probability of the positive class
auroc = roc_auc_score(y_test, preds)
print(f'AUROC: {auroc}')

Learning rate set to 0.313191
0:	test: 0.9937026	best: 0.9937026 (0)	total: 1.16s	remaining: 19m 19s
1:	test: 0.9948688	best: 0.9948688 (1)	total: 2.27s	remaining: 18m 54s
2:	test: 0.9951924	best: 0.9951924 (2)	total: 3.46s	remaining: 19m 11s
3:	test: 0.9953882	best: 0.9953882 (3)	total: 4.57s	remaining: 18m 57s
4:	test: 0.9953716	best: 0.9953882 (3)	total: 5.57s	remaining: 18m 28s
5:	test: 0.9954133	best: 0.9954133 (5)	total: 6.7s	remaining: 18m 29s
6:	test: 0.9955650	best: 0.9955650 (6)	total: 7.82s	remaining: 18m 29s
7:	test: 0.9957070	best: 0.9957070 (7)	total: 8.92s	remaining: 18m 26s
8:	test: 0.9958486	best: 0.9958486 (8)	total: 9.96s	remaining: 18m 16s
9:	test: 0.9958567	best: 0.9958567 (9)	total: 11.1s	remaining: 18m 18s
10:	test: 0.9957277	best: 0.9958567 (9)	total: 12.2s	remaining: 18m 13s
11:	test: 0.9957504	best: 0.9958567 (9)	total: 13.2s	remaining: 18m 5s
12:	test: 0.9957221	best: 0.9958567 (9)	total: 14.2s	remaining: 17m 56s
13:	test: 0.9957469	best: 0.9958567 (9)	total:

## Plot predictions

In [32]:
# Write two interactive line charts of predicted probability vs. target for the test rows:
# a detailed view (every row of the first days) and a subsampled overview of a longer span.

# Rows taken per displayed day.
# NOTE(review): presumably 86400 s / 5 s per sample — confirm the sampling interval.
SAMPLES_PER_DAY = 17280


def _write_prediction_plot(frame, display_days, subsample, html_path):
    """Save a line chart of the `preds` and `target` columns to an HTML file.

    Args:
        frame: DataFrame holding `preds` and `target` columns.
        display_days: number of days to include, counted from the first row.
        subsample: keep every `subsample`-th row of that window.
        html_path: destination file for the interactive chart.

    Returns:
        The plotly figure that was written.
    """
    window = frame[:display_days * SAMPLES_PER_DAY:subsample]
    figure = px.line(window, x=range(len(window)), y=['preds', 'target'])
    figure.write_html(html_path)
    return figure


# Built once and shared by both plots.
df = pd.DataFrame({'preds': preds, 'target': y_test})

# plot with plotly detailed
fig = _write_prediction_plot(
    df, display_days=5, subsample=1, html_path='handcrafted_model_viz_detailed.html'
)

# plot overview
fig = _write_prediction_plot(
    df, display_days=100, subsample=90, html_path='handcrafted_model_viz_overview.html'
)