In [35]:
import pandas as pd
import gc
from joblib import load

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Modeling
from sklearn.tree import DecisionTreeClassifier

# Disable warnings
# Turns off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. get_events() adds columns to a boolean-filtered slice of its input,
# which is the pattern that normally triggers this warning.
pd.options.mode.chained_assignment = None

In [None]:
# Hyperparams
# Default `periods` for make_features(): the lag of the *_diff columns and the
# window length of every rolling statistic.
NUM_STEPS = 20

# Columns handed to the classifier in get_events().
features = [
    # raw signals and time of day
    'anglez', 'enmo', 'hour',
    # point-wise and lagged transforms
    'anglez_abs', 'anglez_diff', 'enmo_diff', 'anglez_x_enmo',
    # centred rolling statistics
    'anglez_rolling_mean', 'enmo_rolling_mean',
    'anglez_rolling_max', 'enmo_rolling_max',
    'anglez_rolling_min', 'enmo_rolling_min',
    'anglez_rolling_std', 'enmo_rolling_std',
]

## Feature Engineering

In [None]:
def make_features(series_id, periods=NUM_STEPS):
    """Read one series from the training parquet file and append engineered columns.

    Parameters
    ----------
    series_id
        Value matched against the ``series_id`` column by the parquet read filter.
    periods : int, default NUM_STEPS
        Lag used for the ``*_diff`` columns and window length of every rolling
        statistic.

    Returns
    -------
    pandas.DataFrame
        The rows read for ``series_id`` plus ``hour``, ``anglez_abs``,
        ``anglez_diff``, ``enmo_diff``, ``anglez_x_enmo`` and the
        ``<signal>_rolling_<mean|max|min|std>`` columns.
    """
    # For a Kaggle run, read
    # '/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet' instead.
    series_filter = [('series_id', '=', series_id)]
    df = pd.read_parquet('../data/raw/train_series.parquet', filters=series_filter)

    signals = ('anglez', 'enmo')

    print('Generating time features')
    # Characters 11-12 of the timestamp string, kept as text.
    # Presumably the hour field of an ISO-8601 timestamp; TODO confirm.
    df["hour"] = df['timestamp'].str.slice(11, 13)

    print('Generating statistical features')
    df["anglez_abs"] = df["anglez"].abs()
    for signal in signals:
        lagged = df.groupby('series_id')[signal].diff(periods=periods)
        # The first `periods` rows have no lagged value; back-fill them.
        df[f"{signal}_diff"] = lagged.bfill().astype('float32')
    df['anglez_x_enmo'] = df['anglez'] * df['enmo']

    print('Generating rolling features')
    # Statistic in the outer loop and signal in the inner loop, so columns are
    # created in the order mean, max, min, std with anglez before enmo.
    for stat in ('mean', 'max', 'min', 'std'):
        for signal in signals:
            window = df[signal].rolling(periods, center=True)
            values = getattr(window, stat)()
            # A centred window leaves NaN at both ends; fill from the nearest valid value.
            df[f"{signal}_rolling_{stat}"] = values.bfill().ffill().astype('float32')

    return df

In [50]:
# Read only the id column to find which series exist; the frame is dropped
# again straight away to release memory.
# For a Kaggle run, read
# '/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet' instead.
df_test = pd.read_parquet(
    '../data/raw/train_series.parquet',
    columns=['series_id'],
)

# Keep only the first five distinct series.
series_ids = df_test['series_id'].unique()[:5]

del df_test
gc.collect()

2030

In [None]:
# loading classifier
# NOTE(review): joblib's load() unpickles the file, which can execute arbitrary
# code. Only load a 'dt_classifier.joblib' that comes from a trusted source.
# The path is relative to the notebook's working directory.
dt_classifier = load('dt_classifier.joblib')

## Get the Events from the predictions with smoothing

In [45]:
def get_events(test_series, classifier, smoothing_minutes, feature_cols=None):
    """
    Takes a time series and a classifier and returns a formatted submission dataframe.

    Parameters
    ----------
    test_series : pandas.DataFrame
        Must contain ``series_id``, ``step`` and every feature column.
    classifier : object with ``predict_proba``
        Column 0 of its output is used as the "not awake" probability and
        column 1 as the "awake" probability (assumes the fitted model's class
        order matches; TODO confirm against ``classes_``).
    smoothing_minutes : int
        Rolling-window length in minutes, converted to steps at 12 steps per
        minute.
    feature_cols : list of str, optional
        Columns passed to the classifier. Defaults to the notebook-level
        ``features`` list.

    Returns
    -------
    pandas.DataFrame
        One ``onset`` and one ``wakeup`` row per retained sleep period, with
        columns ``series_id``, ``step``, ``event`` and ``score``. The columns
        are present even when no event is found.
    """
    # 12 steps per minute, matching the "30 minutes" == 12 * 30 filter below.
    steps_per_minute = 12
    min_sleep_steps = steps_per_minute * 30
    event_columns = ['series_id', 'step', 'event', 'score']

    if feature_cols is None:
        feature_cols = features

    smoothing_length = steps_per_minute * smoothing_minutes
    events = []

    for idx in test_series['series_id'].unique():

        # Work on an independent copy so the new columns are not written to a
        # slice of the caller's frame.
        X = test_series[test_series.series_id == idx].copy()

        # Applying classifier to get predictions and scores (one call, both columns)
        proba = classifier.predict_proba(X[feature_cols])
        X['not_awake'] = proba[:, 0]
        X['awake'] = proba[:, 1]

        # A centred rolling mean leaves NaN at both ends; fill from the nearest valid value.
        X["score"] = X["awake"].rolling(smoothing_length, center=True).mean().bfill().ffill()
        X["smooth"] = X["not_awake"].rolling(smoothing_length, center=True).mean().bfill().ffill()

        # Binarize the smoothing column
        X["smooth"] = X["smooth"].round()

        # Getting predicted onset and wakeup time steps:
        # a 0 -> 1 change is an onset, a 1 -> 0 change is a wakeup.
        change = X["smooth"].diff()
        pred_onsets = X.loc[change > 0, 'step'].tolist()
        pred_wakeups = X.loc[change < 0, 'step'].tolist()

        # Ensuring all predicted sleep periods begin and end.
        # Drop a wakeup that precedes the first onset (series starts asleep).
        if pred_onsets and pred_wakeups and min(pred_wakeups) < min(pred_onsets):
            pred_wakeups = pred_wakeups[1:]

        # Drop an onset that has no later wakeup (series ends asleep), including
        # the case where no wakeup remains at all.
        if pred_onsets and (not pred_wakeups or max(pred_onsets) > max(pred_wakeups)):
            pred_onsets = pred_onsets[:-1]

        # Keeping sleep periods longer than 30 minutes
        sleep_periods = [
            (onset, wakeup)
            for onset, wakeup in zip(pred_onsets, pred_wakeups)
            if wakeup - onset >= min_sleep_steps
        ]

        for onset, wakeup in sleep_periods:
            # Scoring using mean probability over period.
            # NOTE(review): this averages the smoothed "awake" column over a
            # predicted sleep period; confirm that is the intended confidence.
            in_period = (X['step'] >= onset) & (X['step'] <= wakeup)
            score = X.loc[in_period, 'score'].mean()

            # Adding sleep event to dataframe
            events.append({'series_id': idx, 'step': onset, 'event': 'onset', 'score': score})
            events.append({'series_id': idx, 'step': wakeup, 'event': 'wakeup', 'score': score})

    # Explicit columns keep the schema stable when no events were found.
    return pd.DataFrame(events, columns=event_columns)

In [46]:
def get_submissions(series_ids, classifier, smoothing_minutes):
    """Build features and detect events for each series, one series at a time.

    Parameters
    ----------
    series_ids : sized iterable
        Series identifiers; each one is passed to ``make_features``.
    classifier : object with ``predict_proba``
        Forwarded to ``get_events``.
    smoothing_minutes : int
        Forwarded to ``get_events``.

    Returns
    -------
    pandas.DataFrame
        The per-series event frames concatenated with a fresh 0..n-1 index.
        When ``series_ids`` is empty, an empty frame with columns
        ``series_id``, ``step``, ``event`` and ``score`` is returned.
    """
    total_len = len(series_ids)
    per_series_events = []

    for i, series_id in enumerate(series_ids):
        print(f'Step {i+1} of {total_len}')
        series_with_features = make_features(series_id)
        per_series_events.append(get_events(series_with_features, classifier, smoothing_minutes))

        # Release the per-series frame before loading the next one.
        del series_with_features
        gc.collect()

    # pd.concat raises ValueError on an empty list, so handle that case explicitly.
    if not per_series_events:
        return pd.DataFrame(columns=['series_id', 'step', 'event', 'score'])

    return pd.concat(per_series_events).reset_index(drop=True)

In [47]:
# Detect events with a 30-minute smoothing window, then turn the positional
# index into the `row_id` column before writing the submission file.
submissions = (
    get_submissions(series_ids, dt_classifier, 30)
    .reset_index()
    .rename(columns={"index": "row_id"})
)
submissions.to_csv('submission.csv', sep=',', index=False)

TypeError: get_submissions() missing 1 required positional argument: 'smoothing_minutes'

# Scoring

In [37]:
import sys
sys.path.append('../')

from scoring.event_detection_matrix import competition_score

In [53]:
# Ground-truth events, restricted to the series that appear in `submissions`.
# Note: this rebinds `series_ids` to a pandas Series of the ids found in the
# current `submissions` frame.
all_events = pd.read_csv('../data/raw/train_events.csv')
series_ids = submissions['series_id'].drop_duplicates()
needed_events = all_events[all_events['series_id'].isin(series_ids)]

In [55]:
# Score the pipeline once per smoothing window (in minutes).
minutes = [30, 45, 60]
scoring = []

for minute in minutes:
    # Note: `submissions` is overwritten on every iteration.
    submissions = (
        get_submissions(series_ids, dt_classifier, minute)
        .reset_index()
        .rename(columns={"index": "row_id"})
    )
    score = competition_score(needed_events, submissions)
    scoring.append(score)

# Report after the sweep so the scores are not interleaved with progress output.
for minute, score in zip(minutes, scoring):
    print(f'{minute} Minutes, score: {score}')

Step 1 of 5
Generating time features
Generating statistical features
Generating rolling features
Step 2 of 5
Generating time features
Generating statistical features
Generating rolling features
Step 3 of 5
Generating time features
Generating statistical features
Generating rolling features
Step 4 of 5
Generating time features
Generating statistical features
Generating rolling features
Step 5 of 5
Generating time features
Generating statistical features
Generating rolling features
Scoring for smoothing minutes at 30: 0.5411282637820242
Step 1 of 5
Generating time features
Generating statistical features
Generating rolling features
Step 2 of 5
Generating time features
Generating statistical features
Generating rolling features
Step 3 of 5
Generating time features
Generating statistical features
Generating rolling features
Step 4 of 5
Generating time features
Generating statistical features
Generating rolling features
Step 5 of 5
Generating time features
Generating statistical features
Ge