In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl

from tqdm import tqdm

In [2]:
# Extended signals data
timestamps = ([f"2022-01-01 {hour:02d}:00:00" for hour in range(24)] + 
             ["2022-01-02 00:00:00"] +
             [f"2022-01-02 {hour:02d}:00:00" for hour in range(1, 13)])

df_signals = pl.DataFrame({
    "series_id": ["A"] * 37 + ["B"] * 37 + ["C"] * 37,
    "timestamp": timestamps * 3,
    "step": list(range(1, 38)) * 3
})

# Extended events data
df_events = pl.DataFrame({
    "series_id": ["A", "A", "A", "A", "B", "B", "C"],
    "night": [1, 1, 1, 1, 1, 1, 1],
    "event": ["onset", "wakeup", "onset", "wakeup", "onset", "wakeup", "onset"],
    "timestamp": ["2022-01-01 02:00:00", "2022-01-01 14:00:00","2022-01-01 22:00:00", "2022-01-02 08:00:00", "2022-01-01 03:00:00",
                 "2022-01-01 15:00:00", "2022-01-01 04:00:00"],
    "step": [3, 15, 23, 33, 4, 16, 5]
})


In [3]:
#df_signals.write_csv("signals.csv")
df_events.write_csv("events.csv")

In [4]:
# # Detect and remove mismatched onsets/wakeups
# mismatches = df_events.group_by(['series_id', 'night']).agg(
#     (pl.col('event') == 'onset').sum().alias('onset'),
#     (pl.col('event') == 'wakeup').sum().alias('wakeup')
# ).filter(pl.col('onset') != pl.col('wakeup')).select(pl.all().exclude('onset', 'wakeup'))
# df_events = df_events.join(mismatches, on=['series_id', 'night'], how='anti')


In [5]:
df_events

series_id,night,event,timestamp,step
str,i64,str,str,i64
"""A""",1,"""onset""","""2022-01-01 02:…",3
"""A""",1,"""wakeup""","""2022-01-01 14:…",15
"""A""",1,"""onset""","""2022-01-01 22:…",23
"""A""",1,"""wakeup""","""2022-01-02 08:…",33
"""B""",1,"""onset""","""2022-01-01 03:…",4
"""B""",1,"""wakeup""","""2022-01-01 15:…",16
"""C""",1,"""onset""","""2022-01-01 04:…",5


In [9]:
events_wide = df_events.pivot(index=['series_id'], columns='event', values='timestamp', aggregate_function='first')
result = df_signals.join(events_wide, on='series_id').with_columns(
    state = pl.col('timestamp').is_between('onset', 'wakeup')
)
print(events_wide.head())

shape: (3, 3)
┌───────────┬─────────────────────┬─────────────────────┐
│ series_id ┆ onset               ┆ wakeup              │
│ ---       ┆ ---                 ┆ ---                 │
│ str       ┆ str                 ┆ str                 │
╞═══════════╪═════════════════════╪═════════════════════╡
│ A         ┆ 2022-01-01 02:00:00 ┆ 2022-01-01 14:00:00 │
│ B         ┆ 2022-01-01 03:00:00 ┆ 2022-01-01 15:00:00 │
│ C         ┆ 2022-01-01 04:00:00 ┆ null                │
└───────────┴─────────────────────┴─────────────────────┘


In [31]:
df_signals=df_signals.with_columns(
    timestamp=pl.col('timestamp').str.strptime(pl.Datetime,"%Y-%m-%d %H:%M:%S")
    )
df_events=df_events.with_columns(
    timestamp=pl.col('timestamp').str.strptime(pl.Datetime,"%Y-%m-%d %H:%M:%S")
    )

In [48]:
a = (
    df_signals
    .join_asof(
        df_events,
        on='step',
        by='series_id',
        strategy='backward',
        )
    .with_columns(
        state= pl.when((pl.col('event')=='wakeup')).then(True).otherwise(False).cast(pl.Boolean)
        )
).write_csv("test.csv")

In [40]:
a.write_csv("test.csv")

In [9]:
# Add onset_step and wakeup_step to df_events
df_events = df_events.with_columns(
    onset_step = pl.when(pl.col('event') == 'onset').then(pl.col('step')),
    wakeup_step = pl.when(pl.col('event') == 'wakeup').then(pl.col('step'))
)

# Merge df_signals and df_events
df = df_signals.join(df_events, on=['series_id', 'timestamp', 'step'], how='left')

# Sort by series_id and step
df = df.sort(['series_id', 'step'])

# Forward fill onset_step and wakeup_step
df = df.with_columns(
    onset_step = pl.col('onset_step').backward_fill(),
    wakeup_step = pl.col('wakeup_step').backward_fill()
)

# Calculate state column
df = df.with_columns(
    state = pl.when(
        (pl.col('step') >= pl.col('onset_step')) 
        & 
        (pl.col('step') < pl.col('wakeup_step'))
        ).then(False).fill_null(True)
)

print(df.drop('night','timestamp'))
df.write_csv('test.csv')

shape: (111, 6)
┌───────────┬──────┬───────┬────────────┬─────────────┬───────┐
│ series_id ┆ step ┆ event ┆ onset_step ┆ wakeup_step ┆ state │
│ ---       ┆ ---  ┆ ---   ┆ ---        ┆ ---         ┆ ---   │
│ str       ┆ i64  ┆ str   ┆ i64        ┆ i64         ┆ bool  │
╞═══════════╪══════╪═══════╪════════════╪═════════════╪═══════╡
│ A         ┆ 1    ┆ null  ┆ 3          ┆ 15          ┆ true  │
│ A         ┆ 2    ┆ null  ┆ 3          ┆ 15          ┆ true  │
│ A         ┆ 3    ┆ onset ┆ 3          ┆ 15          ┆ false │
│ A         ┆ 4    ┆ null  ┆ 23         ┆ 15          ┆ true  │
│ …         ┆ …    ┆ …     ┆ …          ┆ …           ┆ …     │
│ C         ┆ 34   ┆ null  ┆ null       ┆ null        ┆ true  │
│ C         ┆ 35   ┆ null  ┆ null       ┆ null        ┆ true  │
│ C         ┆ 36   ┆ null  ┆ null       ┆ null        ┆ true  │
│ C         ┆ 37   ┆ null  ┆ null       ┆ null        ┆ true  │
└───────────┴──────┴───────┴────────────┴─────────────┴───────┘
