# Datenaufbereitung

In [1]:
import pandas as pd
import numpy as np

import gc

## Daten laden

In [2]:
# Load the event annotations; the cells below read its 'series_id', 'event' and 'step' columns.
events = pd.read_csv('../../data/raw/events.csv')

## Daten zusammenführen

In [3]:
def get_awake_asleep_label(series, events):
    """Return ``series`` with an integer ``awake`` column (1 = awake, 0 = asleep).

    Every event row is encoded as the state that holds *up to and including*
    that event: ``onset`` -> 1 (awake until sleep onset) and ``wakeup`` -> 0
    (asleep until wake-up). After a left merge on ``step`` the gaps are
    back-filled, so each step takes the value of the next event at or after
    it. Steps after the final event have no following event and are labelled
    awake (1).

    Args:
        series: DataFrame with a ``step`` column.
        events: DataFrame with ``step`` and ``event`` columns. It is not
            modified. Event values other than 'onset'/'wakeup' map to NaN and
            are therefore treated as "no event at this step".

    Returns:
        A new DataFrame: ``series`` left-merged with the ``awake`` labels.
    """
    # Build the lookup table as a fresh frame so the caller's `events` (often a
    # filtered slice of a bigger frame) is neither mutated nor triggers
    # SettingWithCopyWarning.
    event_labels = pd.DataFrame({
        'step': events['step'],
        'awake': events['event'].map({'onset': 1, 'wakeup': 0}),
    })

    merged = pd.merge(series, event_labels, on='step', how='left')

    # bfill propagates the *next* event backwards; whatever is still missing
    # lies after the last event and counts as awake.
    merged['awake'] = merged['awake'].bfill().fillna(1).astype('int')

    return merged

In [4]:
def get_critical_point_label(series, events):
    """Add Gaussian-shaped target columns around every onset and wakeup event.

    Two float columns, ``onset_critical_event_point`` and
    ``wakeup_critical_event_point``, are added to ``series`` (in place) and
    initialised to 0.0. For each event of the matching type, the curve from
    ``generate_gaussian_distribution()`` is written into the rows around the
    event step so that the centre of the curve lands on the event itself.

    Windows that would extend before the first row or past the last row are
    clipped on the affected side and the matching part of the curve is used,
    so the peak stays on the event step. Where windows of the same event type
    overlap, the larger value is kept.

    Args:
        series: DataFrame that is modified in place. Rows are addressed with
            label-based ``.loc[min_step:max_step]``.
            NOTE(review): this assumes the index is the default 0..n-1 range
            and that index label == step - confirm against the caller.
        events: DataFrame with ``event`` ('onset'/'wakeup') and ``step`` columns.

    Returns:
        The same ``series`` object with the two label columns added.
    """
    gaussian_distribution = generate_gaussian_distribution()
    # Number of steps on either side of an event (360 for the 721-point curve).
    half_window = (len(gaussian_distribution) - 1) // 2
    last_step = len(series) - 1

    for event_name in ('onset', 'wakeup'):
        column = f'{event_name}_critical_event_point'
        series[column] = 0.0

        for current_step in events.loc[events['event'] == event_name, 'step']:
            window_start = int(current_step) - half_window
            min_step = max(window_start, 0)
            max_step = min(int(current_step) + half_window, last_step)
            if min_step > max_step:
                # The whole window lies outside the series (or the series is empty).
                continue

            # Skip the part of the curve that was clipped off at the start so
            # the peak still coincides with the event step.
            offset = min_step - window_start
            window = gaussian_distribution[offset:offset + (max_step - min_step) + 1]

            # Keep the higher value so a neighbouring event's tail cannot
            # overwrite a value closer to another event's peak.
            existing = series.loc[min_step:max_step, column].to_numpy()
            series.loc[min_step:max_step, column] = np.maximum(existing, window)

    return series

def generate_gaussian_distribution(num_points=720 + 1, max_value=1.0):
    """Return a symmetric Gaussian-shaped curve sampled at ``num_points`` positions.

    The curve is evaluated at the integer positions ``0 .. num_points - 1``
    and centred on the middle position ``(num_points - 1) / 2``. For an odd
    ``num_points`` the middle sample therefore equals ``max_value`` exactly
    and the curve is mirror-symmetric around it.

    Args:
        num_points: Number of samples. The default 721 (= 720 + 1) gives 360
            samples on either side of the peak.
        max_value: Height of the peak.

    Returns:
        1-D ``numpy.ndarray`` of length ``num_points``.
    """
    # Centre of the index range 0..num_points-1. Using num_points / 2 here
    # would shift the peak half a sample to the right and break the symmetry.
    mean = (num_points - 1) / 2
    std_dev = num_points / 6  # Adjust the standard deviation as needed

    x = np.arange(num_points)
    y = max_value * np.exp(-(x - mean) ** 2 / (2 * std_dev ** 2))

    return y

In [5]:
def get_merged_series(series_id, events):
    """Load one series from parquet and attach all event-derived label columns.

    Args:
        series_id: Identifier used to filter both the parquet file and ``events``.
        events: DataFrame with ``series_id``, ``event`` and ``step`` columns
            covering all series. It is not modified.

    Returns:
        DataFrame for the requested series as produced by
        ``get_awake_asleep_label`` followed by ``get_critical_point_label``.
    """
    series = pd.read_parquet('../../data/raw/series.parquet', filters=[('series_id','=',series_id)])

    # Rows containing any NaN are dropped before 'step' is cast to int.
    # `.assign` yields a new frame, so no column is ever set on a slice of
    # `events` (the original chained assignment could raise
    # SettingWithCopyWarning).
    events_filtered = (
        events
        .query('series_id == @series_id')
        .dropna()
        .assign(step=lambda frame: frame['step'].astype('int'))
    )

    series = get_awake_asleep_label(series, events_filtered)
    series = get_critical_point_label(series, events_filtered)

    return series

In [6]:
%%time

# Per-series labelled frames, collected here and concatenated once after the loop.
train_data = []

# Number of distinct series ids, used only for the progress message.
total_len = events.series_id.nunique()

for i, series_id in enumerate(events.series_id.unique()):
    print(f'Step {i+1} of {total_len} ({series_id})')
    train = get_merged_series(series_id, events)
    train_data.append(train)
    # `del` only removes the loop-local name; the frame itself stays alive
    # because `train_data` still references it.
    del train
    gc.collect()

# Stack all series into one frame and replace the per-series indices with a fresh 0..n-1 index.
train = pd.concat(train_data).reset_index(drop=True)

Step 1 of 277 (038441c925bb)
Step 2 of 277 (03d92c9f6f8a)
Step 3 of 277 (0402a003dae9)
Step 4 of 277 (04f547b8017d)
Step 5 of 277 (05e1944c3818)
Step 6 of 277 (062cae666e2a)
Step 7 of 277 (062dbd4c95e6)
Step 8 of 277 (08db4255286f)
Step 9 of 277 (0a96f4993bd7)
Step 10 of 277 (0cd1e3d0ed95)
Step 11 of 277 (0ce74d6d2106)
Step 12 of 277 (0cfc06c129cc)
Step 13 of 277 (0d0ad1e77851)
Step 14 of 277 (0dee4fda51c3)
Step 15 of 277 (0ec9fc461819)
Step 16 of 277 (0ef7d94fde99)
Step 17 of 277 (0f572d690310)
Step 18 of 277 (0f9e60a8e56d)
Step 19 of 277 (10469f6765bf)
Step 20 of 277 (1087d7b0ff2e)
Step 21 of 277 (10f8bc1f7b07)
Step 22 of 277 (12d01911d509)
Step 23 of 277 (1319a1935f48)
Step 24 of 277 (137771d19ca2)
Step 25 of 277 (137b99e936ab)
Step 26 of 277 (13b4d6a01d27)
Step 27 of 277 (148471991ffb)
Step 28 of 277 (154fe824ed87)
Step 29 of 277 (16fe2798ed0f)
Step 30 of 277 (1716cd4163b2)
Step 31 of 277 (1762ab70ec76)
Step 32 of 277 (188d4b7cd28b)
Step 33 of 277 (18a0ca03431d)
Step 34 of 277 (18b

Step 269 of 277 (f7eb179216c2)
Step 270 of 277 (f88e18cb4100)
Step 271 of 277 (f8a8da8bdd00)
Step 272 of 277 (f981a0805fd0)
Step 273 of 277 (fa149c3c4bde)
Step 274 of 277 (fb223ed2278c)
Step 275 of 277 (fbf33b1a2c10)
Step 276 of 277 (fcca183903b7)
Step 277 of 277 (fe90110788d2)
CPU times: total: 2min 2s
Wall time: 2min 50s


In [7]:
# Preview the first rows of the combined frame.
train.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,num_series_id,awake,onset_critical_event_point,wakeup_critical_event_point
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217,1,1,0.0,0.0
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215,1,1,0.0,0.0
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216,1,1,0.0,0.0
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213,1,1,0.0,0.0
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215,1,1,0.0,0.0


## Aufbereitete Daten speichern

In [8]:
# Write the combined, labelled frame to parquet.
# NOTE(review): the output lands in data/raw next to the input files - confirm this location is intended.
train.to_parquet('../../data/raw/series_merged.parquet')