In [None]:
import pandas as pd
import gc
from joblib import load

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Modeling
from sklearn.tree import DecisionTreeClassifier

# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.options.mode.chained_assignment = None

In [None]:
# Hyperparameters
# Default `periods` of make_features_chunk: used both as the lag of the diff
# features and as the window size of the rolling features.
NUM_STEPS = 20

## Feature Engineering

In [None]:
def make_features_chunk(series_id, periods=NUM_STEPS, path='../data/raw/test_series.parquet'):
    """Load a single series from a parquet file and add the engineered feature columns.

    Parameters
    ----------
    series_id
        Value of the ``series_id`` column to load; it is pushed down to the
        parquet reader as an equality filter so only that series is read.
    periods : int
        Lag used for the ``*_diff`` features and window size of the centered
        rolling statistics.
    path : str
        Parquet file to read the series from. Defaults to the test series file,
        because this notebook scores the series enumerated from that file
        (the previous hard-coded train path did not match the ids being scored).

    Returns
    -------
    pandas.DataFrame
        The rows of the requested series plus the columns ``hour``,
        ``anglez_abs``, ``anglez_diff``, ``enmo_diff``, ``anglez_x_enmo`` and
        ``{anglez,enmo}_rolling_{mean,max,min,std}``.
    """
    df = pd.read_parquet(path, filters=[('series_id', '=', series_id)])

    print(df.shape)
    print('Generating time features')
    # Two-character slice of the timestamp string, kept as a string.
    # Presumably the hour of an ISO-8601 timestamp ("YYYY-MM-DDTHH:...") - TODO confirm.
    df["hour"] = df['timestamp'].str[11:13]

    print('Generating statistical features')
    df["anglez_abs"] = df["anglez"].abs()
    for signal in ('anglez', 'enmo'):
        # Difference to the value `periods` rows earlier; the leading NaNs are back-filled.
        lagged_diff = df.groupby('series_id')[signal].diff(periods=periods)
        df[f"{signal}_diff"] = lagged_diff.bfill().astype('float32')
    df['anglez_x_enmo'] = df['anglez'] * df['enmo']

    print('Generating rolling features')
    # Column creation order matches the original: for each statistic, anglez then enmo.
    for stat in ('mean', 'max', 'min', 'std'):
        for signal in ('anglez', 'enmo'):
            window = df[signal].rolling(periods, center=True)
            # A centered window leaves NaNs at both ends, hence bfill followed by ffill.
            df[f"{signal}_rolling_{stat}"] = getattr(window, stat)().bfill().ffill().astype('float32')

    return df

In [None]:
# Load the test set; make_features only reads its series_id column to enumerate the series.
df_test = pd.read_parquet('../data/raw/test_series.parquet')
# Kaggle-hosted location of the test file (use this line instead when running on Kaggle):
#df_test = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet')

In [None]:
def make_features(test):
    """Build the feature table for every series listed in ``test``.

    Only the ``series_id`` column of ``test`` is read. Each distinct id is
    handed to ``make_features_chunk``, and the resulting per-series frames are
    stacked into a single DataFrame with a fresh 0..n-1 index.
    """
    series_ids = test.series_id.unique()
    total_len = test.series_id.nunique()

    # Release this function's reference to the input before the heavy work starts.
    del test
    gc.collect()

    per_series_frames = []
    for i, current_id in enumerate(series_ids):
        print(f'Step {i+1} of {total_len}')
        per_series_frames.append(make_features_chunk(current_id))
        gc.collect()

    stacked = pd.concat(per_series_frames)
    return stacked.reset_index(drop=True)

In [None]:
# Build the engineered feature table for every series listed in df_test.
test_with_features = make_features(df_test)

In [None]:
# The raw test frame is not used after feature generation; drop it and reclaim memory.
del df_test
gc.collect()

In [None]:
# Columns handed to the classifier's predict_proba in get_events.
features = [
    # raw signals and time of day
    'anglez',
    'enmo',
    'hour',
    # point-wise derived features
    'anglez_abs',
    'anglez_diff',
    'enmo_diff',
    'anglez_x_enmo',
    # centered rolling statistics
    'anglez_rolling_mean',
    'enmo_rolling_mean',
    'anglez_rolling_max',
    'enmo_rolling_max',
    'anglez_rolling_min',
    'enmo_rolling_min',
    'anglez_rolling_std',
    'enmo_rolling_std',
]

In [None]:
# Load the previously saved classifier from the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
dt_classifier = load('dt_classifier.joblib')

## Get the Events from the predictions with smoothing

In [None]:
def get_events(test_series, classifier, feature_cols=None, smoothing_length=12 * 30, min_sleep_steps=12 * 30):
    """
    Takes a time series and a classifier and returns a formatted submission dataframe.

    Parameters
    ----------
    test_series : pandas.DataFrame
        Must contain ``series_id``, ``step`` and the feature columns.
        WARNING: rows are dropped from this frame *in place* as each series is
        processed (to release memory), so it is empty when the function returns.
    classifier
        Object exposing ``predict_proba``; column 0 of its output is treated as
        the "not awake" probability and column 1 as the "awake" probability.
    feature_cols : list of str, optional
        Columns passed to ``predict_proba``. Defaults to the module-level
        ``features`` list.
    smoothing_length : int
        Window (in rows) of the centered rolling mean applied to both
        probabilities. The default ``12 * 30`` is described in the original
        code as 30 minutes.
    min_sleep_steps : int
        Minimum ``wakeup - onset`` distance (in steps) for a period to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per event with columns ``row_id``, ``series_id``, ``step``,
        ``event`` ('onset' or 'wakeup') and ``score``. The columns are present
        even when no event is detected.
    """
    event_columns = ['row_id', 'series_id', 'step', 'event', 'score']
    if feature_cols is None:
        feature_cols = features

    series_ids = test_series['series_id'].unique()
    events = []

    for idx in series_ids:
        # Rows of the current series; the mask is reused below to drop them.
        in_series = (test_series['series_id'] == idx).to_numpy()
        # Explicit copy so the column assignments below never touch a view of the input.
        X = test_series[in_series].copy()

        # Single classifier pass per series (both probability columns come from it).
        proba = classifier.predict_proba(X[feature_cols])
        X['not_awake'] = proba[:, 0]
        X['awake'] = proba[:, 1]

        # Smooth both probabilities; a centered window leaves NaNs at both ends.
        X["score"] = X["awake"].rolling(smoothing_length, center=True).mean().bfill().ffill()
        X["smooth"] = X["not_awake"].rolling(smoothing_length, center=True).mean().bfill().ffill()

        # Binarize the smoothing column
        X["smooth"] = X["smooth"].round()

        # A rise in the binarized signal is an onset, a fall is a wakeup.
        transitions = X['smooth'].diff()
        pred_onsets = X.loc[transitions > 0, 'step'].tolist()
        pred_wakeups = X.loc[transitions < 0, 'step'].tolist()

        # Ensuring all predicted sleep periods begin and end. Either list may be
        # empty (or become empty after trimming), so every min/max is guarded.
        if pred_onsets and pred_wakeups and min(pred_wakeups) < min(pred_onsets):
            pred_wakeups = pred_wakeups[1:]

        if pred_onsets and (not pred_wakeups or max(pred_onsets) > max(pred_wakeups)):
            pred_onsets = pred_onsets[:-1]

        for onset, wakeup in zip(pred_onsets, pred_wakeups):
            # Keeping only sleep periods of at least `min_sleep_steps`.
            if wakeup - onset < min_sleep_steps:
                continue

            # Scoring using mean probability over period (both ends inclusive).
            # NOTE(review): this averages the smoothed *awake* probability, so a
            # confidently detected sleep period gets a low score - confirm intended.
            score = X.loc[X['step'].between(onset, wakeup), 'score'].mean()

            # row_id is the running position of the event in the output.
            events.append({'row_id': len(events), 'series_id': idx, 'step': onset, 'event': 'onset', 'score': score})
            events.append({'row_id': len(events), 'series_id': idx, 'step': wakeup, 'event': 'wakeup', 'score': score})

        # Release the processed rows from the caller's frame to keep memory bounded.
        test_series.drop(test_series.index[in_series], inplace=True)
        del X
        gc.collect()

    return pd.DataFrame(events, columns=event_columns)

In [None]:
# Detect sleep events for every series and write them out as the submission file.
# Note: get_events drops rows from test_with_features in place while it runs.
submissions = get_events(test_with_features, dt_classifier)
submissions.to_csv('submission.csv', sep=',', index=False)