# Datenanalyse

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

import gc

## Daten laden

In [None]:
# Training data: labelled events (CSV) and the accelerometer time series (Parquet).
train_events = pd.read_csv('../../data/raw/train_events.csv')
train_series = pd.read_parquet('../../data/raw/train_series.parquet')

# Test data: the test time series, plus the sample submission file loaded
# under the name `test_events`.
test_series = pd.read_parquet('../../data/raw/test_series.parquet')
test_events = pd.read_csv('../../data/raw/sample_submission.csv')

In [None]:
# Split the raw timestamp string once at the 'T' separator and reuse both halves.
event_stamp_parts = train_events['timestamp'].str.split('T', expand=True)
train_events['date'] = event_stamp_parts[0]
# Keep only the part of the time string before the first '-'.
train_events['time'] = event_stamp_parts[1].str.split('-', expand=True)[0]

# Rebuild the timestamp from date and time and derive the hour of day from it.
train_events['timestamp'] = pd.to_datetime(train_events['date'] + ' ' + train_events['time'])
train_events['hour'] = train_events['timestamp'].dt.hour

In [None]:
# Preview the first five event rows.
train_events.head(n=5)

In [None]:
# Preview the first five time-series rows.
train_series.head(n=5)

## Quantitative Analyse

In [None]:
# Number of distinct series ids (a missing id would count as its own value).
n_series = train_series['series_id'].nunique(dropna=False)
print(f'Anzahl Zeitreihen: {n_series}')

In [None]:
# Report the number of missing values for each inspected time-series column.
for column_name in ('series_id', 'step', 'anglez', 'enmo'):
    missing_count = train_series[column_name].isna().sum()
    print(f'Anzahl unbestimmter Werte in {column_name}: {missing_count}')

In [None]:
# Report the number of missing values for each inspected event column.
for column_name in ('series_id', 'night', 'event', 'timestamp'):
    missing_count = train_events[column_name].isna().sum()
    print(f'Anzahl unbestimmter Werte in {column_name}: {missing_count}')

In [None]:
# Per series: True if at least one event row has no step.
series_has_NaN = train_events.groupby('series_id')['step'].apply(lambda x: x.isnull().any())

# Count the True/False entries directly. value_counts() orders its result by
# frequency, so positional access would attach the labels to whichever class
# happens to be larger and would fail when only one class is present.
series_with_gaps = int(series_has_NaN.sum())
series_without_gaps = len(series_has_NaN) - series_with_gaps

print('Anzahl Zeitreihen mit Unterbrüche: ', series_with_gaps)
print('Anzahl Zeitreihen ohne Unterbrüche: ', series_without_gaps)

In [None]:
# Collect the first event row of every series and list the distinct event types found there.
first_event_rows = train_events.groupby('series_id').head(1)
print('Prüfe dass alle Serien mit einem "onset" Event starten: ')
print(first_event_rows["event"].unique())

In [None]:
# Collect the last event row of every series and list the distinct event types found there.
last_event_rows = train_events.groupby('series_id').tail(1)
print('Prüfe dass alle Serien mit einem "wakeup" Event enden: ')
print(last_event_rows["event"].unique())

In [None]:
# All series ids that occur in the time-series data.
series_id_list = list(train_series['series_id'].unique())

# From here on train_events only contains rows without any missing value.
train_events = train_events.dropna()
events_series_id_list = list(train_events['series_id'].unique())

# Series that have sensor data but no complete event row left.
ids_with_data = set(series_id_list)
ids_with_events = set(events_series_id_list)
series_without_events = list(ids_with_data - ids_with_events)

print('Serien ohne Events: \n', series_without_events)

## `anglez`

z-angle is a metric derived from individual accelerometer components that is commonly used in sleep detection, and refers to the angle of the arm relative to the vertical axis of the body.

In [None]:
participant_series_id = '038441c925bb'

# Work on independent copies of this participant's rows so that the columns
# added below do not touch the full data frames.
participant_series = train_series.loc[train_series['series_id'] == participant_series_id].copy()
participant_events = train_events.loc[train_events['series_id'] == participant_series_id].copy()

# Split the raw timestamp string once at the 'T' separator and reuse both halves.
series_stamp_parts = participant_series['timestamp'].str.split('T', expand=True)
participant_series['date'] = series_stamp_parts[0]
# Keep only the part of the time string before the first '-'.
participant_series['time'] = series_stamp_parts[1].str.split('-', expand=True)[0]

# Rebuild the timestamp from date and time and derive the hour of day from it.
participant_series['timestamp'] = pd.to_datetime(participant_series['date'] + ' ' + participant_series['time'])
participant_series['hour'] = participant_series['timestamp'].dt.hour

In [None]:
fig = px.line(participant_series, x='timestamp', y='anglez', title=f'Schlafdaten für {participant_series_id}')

# One dashed vertical marker per complete event row: orange for onset, red otherwise.
for event_row in participant_events.dropna().itertuples(index=False):
    marker_color = 'orange' if event_row.event == 'onset' else 'red'
    fig.add_vline(x=event_row.timestamp, line_width=3, line_dash='dash', line_color=marker_color)

# Custom legend: empty placeholder traces that carry the marker colours and names.
for legend_color, legend_name in (('orange', "onset event"), ('red', "wakeup event")):
    legend_entry = go.Scatter(
        x=[None],
        y=[None],
        line=dict(color=legend_color, width=2, dash='dash'),
        name=legend_name,
    )
    fig.add_trace(legend_entry)

fig.update_layout(legend_title="Events")

fig.show()

### Moving Average

In [None]:
# Centered 20-sample moving average. The window leaves NaN at both ends,
# which are filled from the nearest valid value (backward, then forward).
anglez_smoothed = participant_series['anglez'].rolling(20, center=True).mean()
participant_series['anglez_mean'] = anglez_smoothed.bfill().ffill().to_numpy()

# Restrict the plot to the first 15,000 samples and the first two events.
participant_series_vis = participant_series.iloc[:15_000]
participant_events_vis = participant_events.iloc[:2]

fig = go.Figure(layout=go.Layout(title=f'Schlafdaten für {participant_series_id}'))
fig.add_trace(
    go.Scatter(
        x=participant_series_vis['timestamp'],
        y=participant_series_vis['anglez'],
        mode='lines',
        name='anglez',
    )
)
fig.add_trace(
    go.Scatter(
        x=participant_series_vis['timestamp'],
        y=participant_series_vis['anglez_mean'],
        mode='lines',
        name='anglez moving average',
    )
)

# One dashed vertical marker per complete event row: orange for onset, red otherwise.
for event_row in participant_events_vis.dropna().itertuples(index=False):
    marker_color = 'orange' if event_row.event == 'onset' else 'red'
    fig.add_vline(x=event_row.timestamp, line_width=3, line_dash='dash', line_color=marker_color)

fig.show()

### Verteilung

In [None]:
# Frequency of every observed anglez value, ordered by value, drawn as an area chart.
anglez_frequencies = participant_series['anglez'].value_counts().sort_index()
anglez_frequencies.plot.area()

### Verteilung durch den Tag

In [None]:
# Mean anglez per hour of day, drawn as a bar chart.
anglez_hourly_mean = participant_series.groupby('hour')['anglez'].mean()
anglez_hourly_mean.plot.bar()

## `enmo`
ENMO is the Euclidean Norm Minus One of all accelerometer signals, with negative values rounded to zero. While no standard measure of acceleration exists in this space, this is one of the several commonly computed features.

In [None]:
fig = px.line(participant_series, x='timestamp', y='enmo', title=f'Schlafdaten für {participant_series_id}')

# One dashed vertical marker per complete event row: orange for onset, red otherwise.
for event_row in participant_events.dropna().itertuples(index=False):
    marker_color = 'orange' if event_row.event == 'onset' else 'red'
    fig.add_vline(x=event_row.timestamp, line_width=3, line_dash='dash', line_color=marker_color)

# Custom legend: empty placeholder traces that carry the marker colours and names.
for legend_color, legend_name in (('orange', "onset event"), ('red', "wakeup event")):
    legend_entry = go.Scatter(
        x=[None],
        y=[None],
        line=dict(color=legend_color, width=2, dash='dash'),
        name=legend_name,
    )
    fig.add_trace(legend_entry)

fig.update_layout(legend_title="Events")

fig.show()

### Moving Average

In [None]:
# Centered 20-sample moving average. The window leaves NaN at both ends,
# which are filled from the nearest valid value (backward, then forward).
enmo_smoothed = participant_series['enmo'].rolling(20, center=True).mean()
participant_series['enmo_mean'] = enmo_smoothed.bfill().ffill().to_numpy()

# Restrict the plot to the first 15,000 samples and the first two events.
participant_series_vis = participant_series.iloc[:15_000]
participant_events_vis = participant_events.iloc[:2]

fig = go.Figure(layout=go.Layout(title=f'Schlafdaten für {participant_series_id}'))
fig.add_trace(
    go.Scatter(
        x=participant_series_vis['timestamp'],
        y=participant_series_vis['enmo'],
        mode='lines',
        name='enmo',
    )
)
fig.add_trace(
    go.Scatter(
        x=participant_series_vis['timestamp'],
        y=participant_series_vis['enmo_mean'],
        mode='lines',
        name='enmo moving average',
    )
)

# One dashed vertical marker per complete event row: orange for onset, red otherwise.
for event_row in participant_events_vis.dropna().itertuples(index=False):
    marker_color = 'orange' if event_row.event == 'onset' else 'red'
    fig.add_vline(x=event_row.timestamp, line_width=3, line_dash='dash', line_color=marker_color)

fig.show()

### Verteilung

In [None]:
# Frequency of every observed enmo value, ordered by value, drawn as a line chart.
enmo_frequencies = participant_series['enmo'].value_counts().sort_index()
enmo_frequencies.plot.line()

### Verteilung durch den Tag

In [None]:
# Mean enmo per hour of day, drawn as a bar chart.
enmo_hourly_mean = participant_series.groupby('hour')['enmo'].mean()
enmo_hourly_mean.plot.bar()

## Korrelation zwischen `anglez` und `enmo`

In [None]:
# Pairwise correlation of the two sensor features over all series, shown as an annotated heatmap.
sensor_columns = ['anglez', 'enmo']
corr = train_series[sensor_columns].corr()
sns.heatmap(data=corr, annot=True, cmap="Blues")

## Länge der Zeitreihen

In [None]:
# Number of rows per series, ordered from shortest to longest, drawn as an area chart.
rows_per_series = train_series['series_id'].value_counts().sort_values()
rows_per_series.plot.area()

## `onset` Event Verteilung

In [None]:
# Number of onset events per hour of day, drawn as a bar chart.
onset_train_events = train_events.loc[train_events['event'] == 'onset']
onset_hour_counts = onset_train_events['hour'].dropna().value_counts().sort_index()
onset_hour_counts.plot.bar()

## `wakeup` Event Verteilung

In [None]:
# Number of wakeup events per hour of day, drawn as a bar chart.
# The frame gets its own name so that it no longer overwrites (and mislabels)
# the onset selection.
wakeup_train_events = train_events[train_events.event == 'wakeup']
wakeup_train_events['hour'].dropna().value_counts().sort_index().plot(kind='bar')

## Wiederholungen

In [None]:
participant_series_id = '5aad18e7ce64'

# Copy this participant's rows and keep only the first 150,000 samples.
participant_series = train_series.loc[train_series['series_id'] == participant_series_id].copy().iloc[:150_000]
participant_events = train_events.loc[train_events['series_id'] == participant_series_id].copy()

# Split the raw timestamp string once at the 'T' separator and reuse both halves.
series_stamp_parts = participant_series['timestamp'].str.split('T', expand=True)
participant_series['date'] = series_stamp_parts[0]
# Keep only the part of the time string before the first '-'.
participant_series['time'] = series_stamp_parts[1].str.split('-', expand=True)[0]
participant_series['timestamp'] = pd.to_datetime(participant_series['date'] + ' ' + participant_series['time'])

In [None]:
fig = px.line(participant_series, x='timestamp', y='enmo')

# One dashed vertical marker per complete event row: orange for onset, red otherwise.
for event_row in participant_events.dropna().itertuples(index=False):
    marker_color = 'orange' if event_row.event == 'onset' else 'red'
    fig.add_vline(x=event_row.timestamp, line_width=3, line_dash='dash', line_color=marker_color)

# Custom legend: empty placeholder traces that carry the marker colours and names.
for legend_color, legend_name in (('orange', "onset event"), ('red', "wakeup event")):
    legend_entry = go.Scatter(
        x=[None],
        y=[None],
        line=dict(color=legend_color, width=2, dash='dash'),
        name=legend_name,
    )
    fig.add_trace(legend_entry)

fig.update_layout(legend_title="Events")

fig.show()

##  Unknown state

In [None]:
# Drop the reference to the raw time series before the processed data is loaded.
del train_series

# Run a garbage collection pass right away.
gc.collect()

In [None]:
# Load the processed training data; the cells below read its `unknown` column.
train_series = pd.read_parquet('../../data/processed/train.parquet')

In [None]:
participant_series_id = '038441c925bb'

# First 120,000 samples and first 12 events of this participant.
series_mask = train_series['series_id'] == participant_series_id
events_mask = train_events['series_id'] == participant_series_id
participant_series = train_series[series_mask].iloc[:120_000]
participant_events = train_events[events_mask].iloc[:12]

In [None]:
fig = px.line(participant_series, x='timestamp', y='anglez')

# One dashed vertical marker per complete event row: orange for onset, red otherwise.
for row in participant_events.dropna().itertuples(index=False):
    color = 'orange' if row.event == 'onset' else 'red'
    fig.add_vline(x=row.timestamp, line_width=3, line_dash='dash', line_color=color)

# Rows flagged as unknown, in the order in which they appear in the series.
unknown_events = participant_series.loc[participant_series['unknown'] == 1, ['step', 'timestamp']]

# A new run starts wherever a flagged step does not directly follow the
# previous flagged step. The first flagged row has no predecessor and therefore
# always opens a run, including a run that begins at step 0.
run_id = unknown_events['step'].diff().ne(1).cumsum()
unknown_runs = unknown_events.groupby(run_id)['timestamp'].agg(['first', 'last'])

# Shade every run of consecutive unknown steps from its first to its last timestamp.
for run_start, run_end in zip(unknown_runs['first'], unknown_runs['last']):
    fig.add_vrect(x0=run_start, x1=run_end,
                  annotation_text="unknown", annotation_position="top left",
                  fillcolor="magenta", opacity=0.25, line_width=0)

# Custom legend: empty placeholder traces that carry the marker colours and names.
for legend_color, legend_name in (('orange', "onset event"), ('red', "wakeup event")):
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            line=dict(color=legend_color, width=2, dash='dash'),
            name=legend_name,
        )
    )
fig.update_layout(legend_title="Events", width=1500, height=400)

fig.show()