In [None]:
!pip install scikit-learn joblib

In [None]:
# Base
from typing import Tuple
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Modeling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Scoring
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Saving
from joblib import dump, load

In [None]:
# Load the engineered-feature table used for training and evaluation.
# The commented-out line reads a "lightweight_10" variant of the same table
# (presumably a smaller sample for quick iteration - confirm before relying on it).
#data = pd.read_parquet('../data/processed/train_with_features_lightweight_10.parquet')
data = pd.read_parquet('../data/processed/train_with_features.parquet')

In [None]:
# Preview the first rows of the loaded table.
data.head()

## Hyperparams

In [None]:
# Column names available in the loaded table; `features` below is a subset of these.
data.columns

In [None]:
# Seed shared by every randomised step in this notebook.
RANDOM_STATE = 42

# Columns passed to the classifiers as model inputs.
features = [
    'anglez',
    'enmo',
    'hour',
    'anglez_abs',
    'anglez_diff',
    'enmo_diff',
    'anglez_x_enmo',
    'anglez_rolling_mean',
    'enmo_rolling_mean',
    'anglez_rolling_max',
    'enmo_rolling_max',
    'anglez_rolling_min',
    'enmo_rolling_min',
    'anglez_rolling_std',
    'enmo_rolling_std',
]

# Label column, kept as a one-element list so selecting it yields a DataFrame.
target_column = ['awake']

## Test, Train Split

In [None]:
def group_test_train_split(samples: pd.DataFrame, group: str, test_size, random_state: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``samples`` so that all rows sharing a ``group`` value end up on the same side.

    The distinct values of the ``group`` column are split with
    ``train_test_split``; each row then follows its group value.

    Args:
        samples: Frame to split; must contain the ``group`` column.
        group: Name of the column whose values must not be shared between the two parts.
        test_size: Forwarded unchanged to ``train_test_split``.
        random_state: Forwarded unchanged to ``train_test_split``.

    Returns:
        ``(test_rows, train_rows)`` - note that the test part comes first.
    """
    distinct_groups = samples[group].drop_duplicates()
    train_groups, test_groups = train_test_split(
        distinct_groups, test_size=test_size, random_state=random_state
    )

    in_test = samples[group].isin(test_groups)
    in_train = samples[group].isin(train_groups)

    return samples.loc[in_test], samples.loc[in_train]

In [None]:
# Hold out 20 % of the series ids; the helper returns the test part first.
test, train = group_test_train_split(data, 'series_id', test_size=0.2, random_state=RANDOM_STATE)

# X_* keep every column (the feature subset is selected at fit/predict time);
# y_* are one-column frames because target_column is a list.
X_train, y_train = train, train[target_column]
X_test, y_test = test, test[target_column]

In [None]:
# Drop only the names that are no longer needed.
# X_train and y_train must stay: the shape check and the fit cell further down
# still read them, and deleting them here makes Restart & Run All fail with NameError.
del data
del train

In [None]:
# Run a garbage-collection pass right after the `del` statements above.
import gc
gc.collect()

In [None]:
# One shape per line: train features, train labels, test features, test labels.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

## Pipeline

In [None]:
# Both pipelines standardise the features before handing them to the estimator.
kn_classifier = Pipeline([
    ('scale', StandardScaler()),
    ('knc', KNeighborsClassifier()),
])

# The tree is seeded with RANDOM_STATE so that refitting on the same data
# gives the same model (scikit-learn permutes features randomly at each split).
dt_classifier = Pipeline([
    ('scale', StandardScaler()),
    ('dtc', DecisionTreeClassifier(random_state=RANDOM_STATE)),
])

## Classification

In [None]:
# Fit scaler + decision tree on the selected feature columns of the training rows.
dt_classifier.fit(X_train[features], y_train)

## Save and load model

In [None]:
# Persist the whole pipeline (scaler and tree) to a file in the working directory.
dump(dt_classifier, 'dt_classifier.joblib')

In [None]:
# Remove the in-memory pipeline; from here on `dt_classifier` only exists
# once the load cell below has run.
del dt_classifier

In [None]:
# Restore the pipeline from disk.
# NOTE: joblib.load unpickles the file and can execute arbitrary code,
# so only load files you created yourself or otherwise trust.
dt_classifier = load('dt_classifier.joblib')

## Prediction

In [None]:
# Hard class labels and per-class probabilities for every test row,
# both in the row order of X_test.
y_pred = dt_classifier.predict(X_test[features])
y_pred_proba = dt_classifier.predict_proba(X_test[features])

In [None]:
# One shape per line: true labels, predicted labels, predicted probabilities.
print(y_test.shape, y_pred.shape, y_pred_proba.shape, sep='\n')

## Scores

In [None]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

In [None]:
# Counts of true vs. predicted labels on the test rows.
cm = confusion_matrix(y_test, y_pred)

# Render the matrix as a heat map with the 'Blues' colour map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

## Get the Events from the predictions without smoothing

In [None]:
def get_events(test_series, classifier, series_ids=None):
    """
    Turn per-step awake/asleep predictions into sleep events, without smoothing.

    The classifier is trained on the ``awake`` label (1 = awake, 0 = asleep).
    A predicted change 1 -> 0 is therefore a sleep onset and a change 0 -> 1 a
    wakeup. Onsets and wakeups are paired in order, and only periods of at
    least 30 minutes (12 steps per minute) are kept.

    Parameters
    ----------
    test_series : pd.DataFrame
        Needs the columns ``series_id`` and ``step`` plus every column named in
        the module-level ``features`` list. The frame is not modified.
    classifier : fitted estimator
        Must provide ``predict`` and ``predict_proba``; column 0 of
        ``predict_proba`` is taken as the probability of being asleep.
    series_ids : iterable, optional
        Series to process. When omitted, only the first series id found in
        ``test_series`` is processed (the behaviour before this parameter
        existed). Pass ``test_series['series_id'].unique()`` to process all.

    Returns
    -------
    pd.DataFrame
        One row per event with the columns ``row_id``, ``series_id``, ``step``,
        ``event`` ('onset' or 'wakeup') and ``score`` (mean asleep probability
        over the period). The columns are present even when no event is found.
    """
    event_columns = ['row_id', 'series_id', 'step', 'event', 'score']
    min_period_steps = 12 * 30  # 30 minutes

    if series_ids is None:
        series_ids = test_series['series_id'].unique()[:1]

    events = []

    for idx in series_ids:

        # Copy the rows so the helper columns are not written into a slice
        # of the caller's frame (avoids SettingWithCopyWarning).
        X = test_series[test_series.series_id == idx].copy()

        # Hard labels for the transitions, asleep probability for the score
        X['prediction'] = classifier.predict(X[features])
        X['probability'] = classifier.predict_proba(X[features])[:, 0]

        # prediction is the awake label: falling = onset, rising = wakeup
        change = X['prediction'].diff()
        pred_onsets = X.loc[change < 0, 'step'].tolist()
        pred_wakeups = X.loc[change > 0, 'step'].tolist()

        # Ensure the pairing starts with an onset and ends with a wakeup.
        # The emptiness checks keep min()/max() from raising when a series
        # contains only one kind of transition.
        if pred_onsets and pred_wakeups and min(pred_wakeups) < min(pred_onsets):
            pred_wakeups = pred_wakeups[1:]

        if pred_onsets and pred_wakeups and max(pred_onsets) > max(pred_wakeups):
            pred_onsets = pred_onsets[:-1]

        for onset, wakeup in zip(pred_onsets, pred_wakeups):
            # Keep only sleep periods of at least 30 minutes
            if wakeup - onset < min_period_steps:
                continue

            # Score the period by its mean asleep probability
            in_period = (X['step'] >= onset) & (X['step'] <= wakeup)
            score = X.loc[in_period, 'probability'].mean()

            for event, step in (('onset', onset), ('wakeup', wakeup)):
                events.append({'row_id': len(events), 'series_id': idx, 'step': step, 'event': event, 'score': score})

    return pd.DataFrame(events, columns=event_columns)

In [None]:
# Build events from the raw predictions (by default get_events only processes
# the first series id in X_test) and write them to CSV without the index.
submissions = get_events(X_test, dt_classifier)
submissions.to_csv('../results/submission_no_smoothing.csv', sep=',', index=False)

## Get the Events from the predictions with smoothing

In [None]:
def get_events_smoothed(test_series, classifier, series_ids=None):
    """
    Turn smoothed asleep probabilities into sleep events.

    The asleep probability (column 0 of ``predict_proba``, i.e. ``awake == 0``)
    is averaged over a centred 30-minute window (12 steps per minute) and then
    rounded to 0/1. A rise 0 -> 1 of that signal is a sleep onset, a fall
    1 -> 0 a wakeup. Onsets and wakeups are paired in order, and only periods
    of at least 30 minutes are kept.

    Parameters
    ----------
    test_series : pd.DataFrame
        Needs the columns ``series_id`` and ``step`` plus every column named in
        the module-level ``features`` list. The frame is not modified.
    classifier : fitted estimator
        Must provide ``predict_proba``.
    series_ids : iterable, optional
        Series to process. When omitted, only the second series id found in
        ``test_series`` is processed (the behaviour before this parameter
        existed). Pass ``test_series['series_id'].unique()`` to process all.

    Returns
    -------
    pd.DataFrame
        One row per event with the columns ``row_id``, ``series_id``, ``step``,
        ``event`` ('onset' or 'wakeup') and ``score`` (mean smoothed asleep
        probability over the period). The columns are present even when no
        event is found.
    """
    event_columns = ['row_id', 'series_id', 'step', 'event', 'score']
    smoothing_length = 12 * 30  # 30 minutes
    min_period_steps = 12 * 30  # 30 minutes

    if series_ids is None:
        series_ids = test_series['series_id'].unique()[1:2]

    events = []

    for idx in series_ids:

        # Copy the rows so the helper columns are not written into a slice
        # of the caller's frame (avoids SettingWithCopyWarning).
        X = test_series[test_series.series_id == idx].copy()

        # A single predict_proba call; column 0 is the asleep probability
        X['not_awake'] = classifier.predict_proba(X[features])[:, 0]

        # Centred rolling mean. The window leaves NaN at both ends, which are
        # filled from the nearest valid value.
        X['score'] = X['not_awake'].rolling(smoothing_length, center=True).mean().bfill().ffill()

        # Binarize the smoothed probability: 1 = asleep, 0 = awake
        X['smooth'] = X['score'].round()

        # Rising asleep signal = onset, falling = wakeup
        change = X['smooth'].diff()
        pred_onsets = X.loc[change > 0, 'step'].tolist()
        pred_wakeups = X.loc[change < 0, 'step'].tolist()

        # Ensure the pairing starts with an onset and ends with a wakeup.
        # The emptiness checks keep min()/max() from raising when a series
        # contains only one kind of transition.
        if pred_onsets and pred_wakeups and min(pred_wakeups) < min(pred_onsets):
            pred_wakeups = pred_wakeups[1:]

        if pred_onsets and pred_wakeups and max(pred_onsets) > max(pred_wakeups):
            pred_onsets = pred_onsets[:-1]

        for onset, wakeup in zip(pred_onsets, pred_wakeups):
            # Keep only sleep periods of at least 30 minutes
            if wakeup - onset < min_period_steps:
                continue

            # Score the period by its mean smoothed asleep probability, so a
            # confidently detected sleep period gets a high score.
            in_period = (X['step'] >= onset) & (X['step'] <= wakeup)
            score = X.loc[in_period, 'score'].mean()

            for event, step in (('onset', onset), ('wakeup', wakeup)):
                events.append({'row_id': len(events), 'series_id': idx, 'step': step, 'event': event, 'score': score})

    return pd.DataFrame(events, columns=event_columns)

In [None]:
# Build events from the smoothed probabilities (by default only the second
# series id in X_test is processed) and preview the result.
# Note: this rebinds `submissions`, replacing the unsmoothed events.
submissions = get_events_smoothed(X_test, dt_classifier)
submissions.head()

In [None]:
# The same slice get_events_smoothed uses by default: the second series id in X_test.
X_test['series_id'].unique()[1:2]

In [None]:
# Write the smoothed events to CSV without the DataFrame index.
submissions.to_csv('../results/submission_smoothing.csv', sep=',', index=False)

## Visualize Events

In [None]:
import plotly.express as px

In [None]:
# Second series id in X_test - the one get_events_smoothed processes by default,
# so the events in `submissions` belong to this participant.
participant_series_id = X_test.series_id.unique()[1]

In [None]:
# All rows of that series, copied so later changes cannot touch `test`.
participant_series = test[test.series_id == participant_series_id].copy()

In [None]:
# anglez over the step counter for the selected participant
fig = px.line(participant_series, x='step', y='anglez', title=f'Schlafdaten für {participant_series_id}')

# One dashed vertical line per event: green for an onset, red for anything else
for event_row in submissions.itertuples(index=False):
    fig.add_vline(
        x=event_row.step,
        line_width=3,
        line_dash='dash',
        line_color='green' if event_row.event == 'onset' else 'red',
    )

fig.show()

## Visualize Awake/Asleep status

In [None]:
# Work on a copy of the test rows and attach the raw predictions.
# y_pred was computed from X_test[features], so it is in the same row order.
visuals = X_test.copy()
visuals['pred_awake'] = y_pred

In [None]:
# Smooth the raw predictions with a centred 30-minute rolling mean
# (12 steps per minute) and binarize the result.
#
# * The window is applied per series_id, so values from one participant never
#   leak into the start or end of the next one.
# * The input is always the raw y_pred, so re-running this cell gives the same
#   result instead of smoothing already-rounded values again.
smoothing_steps = 12 * 30

pred_awake_raw = pd.Series(y_pred, index=visuals.index)
visuals["pred_awake"] = (
    pred_awake_raw
    .groupby(visuals["series_id"].to_numpy(), sort=False)
    .transform(lambda s: s.rolling(smoothing_steps, center=True).mean().bfill().ffill())
    .round()
)

In [None]:
# True vs. predicted awake state for the first 90000 rows of the selected participant
fig = px.line(
    visuals[visuals.series_id == participant_series_id]
    .iloc[:90000]
    .rename(columns={"awake": "Participant Awake", "pred_awake": "Prediction Awake"}),
    x='timestamp',
    y=['Participant Awake', 'Prediction Awake'],
)

In [None]:
# Label the two states on the y axis instead of showing raw 0/1 ticks
fig.update_layout(
    yaxis={
        'tickmode': 'array',
        'tickvals': [0, 1],
        'ticktext': ['Asleep ', 'Awake '],
        'title': 'State',
    },
    xaxis={'title': 'Timestamp'},
)

In [None]:
# Render the awake/asleep comparison figure.
fig.show()