In [2]:
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path

In [6]:
PATH_DATA_RAW = Path("/beegfs/ws/0/s4610340-sleep_states/kaggle-detect_sleep_states/data/raw")

In [9]:
events_df = pd.read_csv(PATH_DATA_RAW / "train_events.csv")
events_df = events_df.pivot(columns="event", index=["series_id", "night"], values="step").dropna()
events_df

Unnamed: 0_level_0,event,onset,wakeup
series_id,night,Unnamed: 2_level_1,Unnamed: 3_level_1
038441c925bb,1,4992.0,10932.0
038441c925bb,2,20244.0,27492.0
038441c925bb,3,39996.0,44400.0
038441c925bb,4,57240.0,62856.0
038441c925bb,6,91296.0,97860.0
...,...,...,...
fe90110788d2,30,505116.0,511284.0
fe90110788d2,31,522852.0,529104.0
fe90110788d2,32,538956.0,547152.0
fe90110788d2,33,556560.0,560604.0


In [10]:
import hydra
import detect_sleep_states.config

with hydra.initialize_config_dir(version_base=None,
                                 config_dir="/beegfs/ws/0/s4610340-sleep_states/kaggle-detect_sleep_states/config"):
    cfg: detect_sleep_states.config.TrainConfig = hydra.compose(config_name="train")

cfg

{'seed': 42, 'exp_name': 'dummy', 'run_name': 'dummy', 'batch_size': 32, 'num_workers': 12, 'duration': 5760, 'downsample_rate': 2, 'upsample_rate': 1, 'n_chunks_visualize': 20, 'trainer': {'epochs': 50, 'accelerator': 'auto', 'use_amp': True, 'debug': False, 'gradient_clip_val': 1.0, 'accumulate_grad_batches': 1, 'monitor': 'val_loss', 'monitor_mode': 'min', 'check_val_every_n_epoch': 1}, 'aug': {'mixup_prob': 0.0, 'mixup_alpha': 0.4, 'cutmix_prob': 0.0, 'cutmix_alpha': 0.4}, 'pp': {'score_th': 0.1, 'distance': 360}, 'labels': ['awake', 'event_onset', 'event_wakeup'], 'target_labels_idx': [1, 2], 'features': ['anglez', 'enmo', 'hour_sin', 'hour_cos'], 'optimizer': {'lr': 0.0005}, 'scheduler': {'num_warmup_steps': 0}, 'dir': {'data_dir': '/beegfs/ws/0/s4610340-sleep_states/kaggle-detect_sleep_states/data/raw', 'processed_dir': '/beegfs/ws/0/s4610340-sleep_states/kaggle-detect_sleep_states/data/processed', 'output_dir': '/beegfs/ws/0/s4610340-sleep_states/kaggle-detect_sleep_states/outp

In [35]:
from tqdm import tqdm

series_lf = pl.scan_parquet(
    PATH_DATA_RAW / f"train_series.parquet",
)

# preprocess
series_df = (
    series_lf
    .with_columns(
        pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z"),
    )
    .filter(
        pl.col("series_id").is_in(cfg.split.train_series_ids)
    )
    .select(
        [
            pl.col("series_id"),
            pl.col("timestamp"),
            pl.col("step"),
            pl.col("anglez"),
            pl.col("enmo"),
        ]
    )
    .collect(streaming=True)
    .sort(by=["series_id", "timestamp"])
)
n_unique = series_df.get_column("series_id").n_unique()

counts = []

this_series_df: pl.DataFrame
for series_id, this_series_df in tqdm(series_df.group_by("series_id"), total=n_unique):
    series_counts = {
        "total_count": this_series_df.shape[0],
        "asleep_count": 0,
        "awake_count": 0,
        "onset_count": 0,
        "wakeup_count": 0
    }

    if series_id in events_df.index.get_level_values("series_id"):

        series_events = events_df.loc[series_id]

        series_counts["onset_count"] = series_events.shape[0]
        series_counts["wakeup_count"] = series_events.shape[0]

        for night, night_series in series_events.iterrows():
            series_counts["asleep_count"] += this_series_df.filter([
                pl.col("step").is_between(night_series["onset"], night_series["wakeup"], closed="none")
            ]).shape[0]

    series_counts["nulls"] = this_series_df.null_count().sum_horizontal().sum()

    series_counts["awake_count"] += (
                this_series_df.shape[0] - series_counts["asleep_count"] - series_counts["onset_count"] - series_counts[
            "wakeup_count"])

    counts.append(series_counts)

df_counts = pd.DataFrame(counts)
df_counts


100%|██████████| 221/221 [00:06<00:00, 31.73it/s]


Unnamed: 0,total_count,asleep_count,awake_count,onset_count,wakeup_count,nulls
0,391320,73665,317625,15,15,0
1,639000,41284,597700,8,8,0
2,778680,96874,681778,14,14,0
3,606240,199531,406651,29,29,0
4,759240,159607,599575,29,29,0
...,...,...,...,...,...,...
216,376380,158990,217346,22,22,0
217,405900,123473,282389,19,19,0
218,617400,181846,435502,26,26,0
219,354420,118722,235662,18,18,0


In [36]:
total_counts = df_counts.sum(axis=0)
total_counts

total_count     100363860
asleep_count     23841204
awake_count      76515024
onset_count          3816
wakeup_count         3816
nulls                   0
dtype: int64

In [27]:
total_counts["onset_count"] / (
            total_counts["wakeup_count"] + total_counts["awake_count"] + total_counts["asleep_count"])

3.802310010944196e-05

In [28]:
total_counts["asleep_count"] / (
            total_counts["wakeup_count"] + total_counts["awake_count"] + total_counts["awake_count"])

0.1557903811407389

In [29]:
total_counts["awake_count"] / (
            total_counts["wakeup_count"] + total_counts["awake_count"] + total_counts["asleep_count"])

0.7624052456573256

In [30]:
1 / (3.8e-05)

26315.78947368421

In [54]:
import numpy as np
from pandas.api.indexers import FixedForwardWindowIndexer

# preprocess

series_df = (
    series_lf
    .with_columns(
        pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z"),
    )
    .select(
        [
            pl.col("series_id"),
            pl.col("timestamp"),
            pl.col("step"),
            pl.col("anglez"),
            pl.col("enmo"),
        ]
    )
    .collect(streaming=True)
    .sort(by=["series_id", "timestamp"])
)

n_unique = series_df.get_column("series_id").n_unique()

df_train_events = pl.from_pandas(
    pd.read_csv(PATH_DATA_RAW / "train_events.csv")
    .dropna()
    .astype({
        "step": np.uint32
    })
).with_columns(
    pl.col("event").map_dict({"onset": 1, "wakeup": 0}).alias("awake")
)

window_size = 8640
th = 50

for series_id, this_series_df in tqdm(series_df.group_by("series_id"), total=n_unique):
    govno = this_series_df.get_column("anglez").to_pandas()
    govno_rolling = govno.rolling(FixedForwardWindowIndexer(window_size=window_size),
                                  min_periods=window_size).max().rename("anglez_forward_max")

    this_series_df = (
        this_series_df
        .with_columns(
            pl.col("anglez").rolling_max(window_size=window_size, min_periods=window_size).alias("anglez_backward_max"),
            pl.from_pandas(govno_rolling)
        )
        .join(df_train_events.select([pl.col("step", "awake")]), on="step", how="left")
        .with_columns(
            pl.col("awake").fill_null(strategy="backward"),
        )
    )
    this_series_df = (
        this_series_df
        .with_columns(
            pl
            .when((pl.col("anglez_backward_max") < th) | (pl.col("anglez_forward_max") < th))
            .then(pl.col("awake"))
            .alias("awake")
            .fill_null(1)
        )
        .select(
            "series_id",
            "timestamp",
            "step",
            "anglez",
            "enmo",
            pl.col("awake") != 2,
        )
        .drop(
            "awake"
        )
    )

    print(this_series_df)

    break

  0%|          | 0/277 [00:03<?, ?it/s]

shape: (622_366, 5)
┌──────────────┬─────────────────────────┬────────┬──────────┬────────┐
│ series_id    ┆ timestamp               ┆ step   ┆ anglez   ┆ enmo   │
│ ---          ┆ ---                     ┆ ---    ┆ ---      ┆ ---    │
│ str          ┆ datetime[μs, UTC]       ┆ u32    ┆ f32      ┆ f32    │
╞══════════════╪═════════════════════════╪════════╪══════════╪════════╡
│ 55b7f5c99930 ┆ 2018-11-13 16:45:00 UTC ┆ 0      ┆ -1.9646  ┆ 0.0544 │
│ 55b7f5c99930 ┆ 2018-11-13 16:45:05 UTC ┆ 1      ┆ -3.4437  ┆ 0.063  │
│ 55b7f5c99930 ┆ 2018-11-13 16:45:10 UTC ┆ 2      ┆ -5.4503  ┆ 0.0732 │
│ 55b7f5c99930 ┆ 2018-11-13 16:45:15 UTC ┆ 3      ┆ -12.8095 ┆ 0.054  │
│ …            ┆ …                       ┆ …      ┆ …        ┆ …      │
│ 55b7f5c99930 ┆ 2018-12-19 14:14:40 UTC ┆ 620276 ┆ -2.702   ┆ 0.0404 │
│ 55b7f5c99930 ┆ 2018-12-19 14:14:45 UTC ┆ 620277 ┆ -8.1532  ┆ 0.0444 │
│ 55b7f5c99930 ┆ 2018-12-19 14:14:50 UTC ┆ 620278 ┆ -3.346   ┆ 0.0748 │
│ 55b7f5c99930 ┆ 2018-12-19 14:14:55 UTC ┆ 6


