In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from utils import *
from sequence_utils import *

In [2]:
# Load the full training table, then report its column labels and its
# (rows, columns) shape on separate lines.
train = pd.read_csv("../data/train.csv")
print(train.columns, train.shape, sep="\n")

Index(['acoustic_data', 'time_to_failure'], dtype='object')
(629145480, 2)


In [113]:
import random  # stdlib; imported in this cell so the seeding below is self-contained

# Sampling configuration.
N_SEQ = 15000    # second positional argument of random_sub_sequence_indexes
                 # (presumably the length of each sub-sequence -- TODO confirm)
N_TRAIN = 41000  # passed as n= (presumably the number of sub-sequences drawn -- TODO confirm)
NULL_OFF = 4.5   # constant subtracted from every sequence before feature extraction
SEED = 42        # fixed so that Restart & Run All draws the same sub-sequences

# Pin the global random state before sampling so the draw is repeatable.
# NOTE(review): this assumes random_sub_sequence_indexes uses NumPy's global RNG
# or the stdlib `random` module; if it builds its own generator, seed that instead.
random.seed(SEED)
np.random.seed(SEED)

slices = random_sub_sequence_indexes(train['acoustic_data'], N_SEQ, n=N_TRAIN)

# Each sampled slice of `train` becomes one (features, target) pair; zip(*...)
# transposes the list of pairs into two parallel tuples.
features, target = zip(*[subsequence_to_sample(train[sl]) for sl in slices])

In [114]:
# Sanity check: order the sampled slices, then count how many distinct ones
# there are by comparing their string representations.
sorted_slices = sorted(slices)
len({str(sl) for sl in sorted_slices})

41000

In [137]:
def generate_sequence_features(seq, freq_windows, vari_windows):
    """Summarise one sequence as a single row of rolling-window statistics.

    A ``freq_<w>`` series is built with ``extract_rolling_frequency(seq, n=w)``
    for every ``w`` in ``freq_windows`` and a ``vari_<w>`` series with
    ``extract_rolling_variance(seq, n=w)`` for every ``w`` in ``vari_windows``.
    The first ``max(window)`` rows are dropped, and every remaining series is
    reduced to its mean, median, std, max and min.

    Parameters
    ----------
    seq
        The raw sequence; passed unchanged to the two ``extract_*`` helpers.
    freq_windows : iterable of int
        Window sizes for the rolling-frequency series (may be empty).
    vari_windows : iterable of int
        Window sizes for the rolling-variance series (may be empty).

    Returns
    -------
    pandas.DataFrame
        Exactly one row (index 0) whose columns are named
        ``"<stat>_<series>"``, e.g. ``mean_freq_100`` or ``std_vari_1000``.

    Raises
    ------
    ValueError
        If both ``freq_windows`` and ``vari_windows`` are empty.
    """
    # Materialise once so generators are accepted and can be reused below.
    freq_windows = list(freq_windows)
    vari_windows = list(vari_windows)
    all_windows = freq_windows + vari_windows
    if not all_windows:
        raise ValueError(
            "generate_sequence_features needs at least one window size in "
            "freq_windows or vari_windows"
        )

    # max() over a list also works when only ONE window is configured;
    # max(*windows) would raise TypeError there because max(100) tries to
    # iterate over the int.
    warmup = max(all_windows)

    rolling = pd.DataFrame({
        **{
            f"freq_{w}": extract_rolling_frequency(seq, n=w)
            for w in freq_windows
        },
        **{
            f"vari_{w}": extract_rolling_variance(seq, n=w)
            for w in vari_windows
        }
    })
    # Drop the leading rows positionally (presumably those for which the widest
    # window is not yet fully populated -- TODO confirm against the helpers).
    rolling = rolling.iloc[warmup:]

    # One row per statistic, one column per series -> long format with the
    # columns 'index' (statistic), 'variable' (series name) and 'value'.
    stats = rolling.aggregate([
        'mean',
        'median',
        'std',
        'max',
        'min'
    ]).reset_index().melt(id_vars='index')

    # Label each value "<stat>_<series>" and keep only the value column.
    labels = stats['index'] + '_' + stats['variable']
    stats = stats.set_index(labels).drop(['index', 'variable'], axis=1)

    # Flip to a single wide row with a clean 0-based index.
    return stats.transpose().reset_index(drop=True)

In [138]:
# Rolling-window sizes handed to generate_sequence_features.
FREQ_WINDOWS = [100, 1000]
VARI_WINDOWS = [1000]

# One single-row feature frame per sampled sequence (shifted by NULL_OFF),
# stacked into one table with a fresh 0..n-1 index.
df = pd.concat(
    [
        generate_sequence_features(
            sequence - NULL_OFF,
            freq_windows=FREQ_WINDOWS,
            vari_windows=VARI_WINDOWS,
        )
        for sequence in features
    ],
    ignore_index=True,
)
# Attach the regression target; `target` is positionally aligned with `features`.
df['target'] = target
df.head()

Unnamed: 0,mean_freq_100,median_freq_100,std_freq_100,max_freq_100,min_freq_100,mean_freq_1000,median_freq_1000,std_freq_1000,max_freq_1000,min_freq_1000,mean_vari_1000,median_vari_1000,std_vari_1000,max_vari_1000,min_vari_1000,target
0,0.364731,0.36,0.052343,0.53,0.15,0.367543,0.369,0.021414,0.418,0.317,2206.653,2168.0,136.074215,2656.0,1949.0,8.306198
1,0.322138,0.32,0.080694,0.56,0.11,0.31785,0.314,0.051102,0.396,0.184,3055.401571,2621.0,1440.290113,8717.0,2098.0,4.160896
2,0.144026,0.13,0.062585,0.44,0.04,0.139108,0.132,0.049571,0.274,0.072,25704.206357,11892.5,40389.926357,188818.0,3060.0,1.084297
3,0.350241,0.36,0.067982,0.54,0.12,0.34609,0.359,0.047831,0.414,0.199,2499.284143,2329.0,445.045626,4222.0,2068.0,7.690799
4,0.284365,0.3,0.085345,0.49,0.09,0.282258,0.284,0.059642,0.375,0.141,3884.156214,3006.0,2327.647947,12661.0,2204.0,0.655198


In [139]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Metric name -> scoring callable, invoked as f(y_true, y_pred) by cross_validate.
metric_funcs = dict(
    mse=mean_squared_error,
    mae=mean_absolute_error,
)

def cross_validate(Model, params, data, t_col="target", n_splits=5, metrics=None):
    """Score ``Model(**params)`` with K-fold cross-validation.

    For every fold a fresh model is built, fitted on the training rows and
    scored on the held-out rows with every metric; the per-fold scores are then
    averaged.

    Parameters
    ----------
    Model : callable
        Estimator class or factory; ``Model(**params)`` must return an object
        with ``fit(X, y)`` and ``predict(X)``.
    params : dict
        Keyword arguments for ``Model``. They are echoed in the result as
        strings.
    data : pandas.DataFrame
        Feature columns plus the target column.
    t_col : str, default "target"
        Name of the target column in ``data``.
    n_splits : int, default 5
        Number of folds. ``KFold`` is constructed with ``n_splits`` only, so
        every other setting is scikit-learn's default.
    metrics : dict or None, default None
        Mapping ``name -> f(y_true, y_pred)``. ``None`` uses the module-level
        ``metric_funcs``.

    Returns
    -------
    pandas.DataFrame
        A single row holding the mean of each metric across folds, followed by
        one column per entry of ``params`` containing ``str(value)``.
    """
    scorers = metric_funcs if metrics is None else metrics

    # Split features and target once instead of re-dropping the column twice
    # per fold.
    X = data.drop(t_col, axis=1)
    y = data[t_col]

    fold_scores = []
    for train_idx, test_idx in KFold(n_splits=n_splits).split(data):
        model = Model(**params)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        pred = model.predict(X.iloc[test_idx])
        y_true = y.iloc[test_idx]
        # Only metric values are recorded here. Mixing the hyper-parameters in
        # made the .mean() below fail on non-numeric values, and the averaged
        # values were overwritten by the str(v) assignment anyway.
        fold_scores.append({
            name: score(y_true, pred) for name, score in scorers.items()
        })

    log_df = pd.DataFrame(fold_scores).mean().to_frame().transpose()
    for k, v in params.items():
        log_df[k] = str(v)

    return log_df

In [140]:
from xgboost import XGBRegressor

# Baseline run: XGBRegressor with no hyper-parameter overrides, scored by
# cross_validate on the engineered feature table.
cross_validate(Model=XGBRegressor, params={}, data=df)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


Unnamed: 0,mae,mse
0,2.397724,9.152096
