In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
path = "/Users/phil/philclarkphd/sleep/sleep_data/feature_store/features_2024-06-19_14-52-26.csv"
df = pd.read_csv(path, index_col=0)
df.head()

Unnamed: 0,EEG_std,EEG_ss,EEG_amp,EMG_std,EMG_ss,EMG_events,delta_rel,theta_rel,theta_over_delta,ID,day,ID_day,epoch,score
0,1.808552,3.270094,1.869975,0.540343,0.292754,0.0,0.210136,0.186241,0.886288,ZZ01,BL,ZZ01_BL,0,Non REM
1,1.043677,1.088574,1.069317,0.455598,0.20757,1.0,0.338948,0.200555,0.591698,ZZ01,BL,ZZ01_BL,1,Non REM
2,1.510736,2.280886,1.478451,0.471602,0.222428,1.0,0.257066,0.262096,1.01957,ZZ01,BL,ZZ01_BL,2,Non REM
3,1.437728,2.067189,1.450124,0.478292,0.228766,0.0,0.271698,0.144509,0.531874,ZZ01,BL,ZZ01_BL,3,Non REM
4,1.669482,2.786269,1.710151,0.477523,0.228029,0.0,0.222431,0.164077,0.737653,ZZ01,BL,ZZ01_BL,4,Non REM


In [3]:
df.shape

(42802, 14)

In [4]:
def train_test_split(
    df: pd.DataFrame,
    train_size: float = 0.8,
    time_series_idx: str = "epoch",
    group_col: str = "ID_day",
) -> tuple:
    """
    Splits data into train and test sets. Splitting is done equally across all values in the group_col.
    Splits done in time-series fashion - e.g. both train and test are continuous values wrt the time_series_idx.

    Args:
        df (pd.DataFrame):  A pandas dataframe with the data, group var, and some time-series indicator.
        train_size (float): Proportion (between 0 and 1) of data to ues in the training set. Test set will be the
        remainder. Defaults to 0.8.
        time_series_idx (str): Column name to use to order rows prior to splitting train and test. Defaults to 'epoch'.
        group_col (str): Column name to use for grouping data to ensure equal sampling across subjects. Defaults to
        'ID_day'.

    Returns:
        (pd.DataFrame): training data.
        (pd.DataFrame): test data.
    """

    # First enforce correct order of time_col by sorting the values within each group of group_col
    df = df.sort_values(by=[group_col, time_series_idx])

    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    
    for group in df[group_col].unique():
        group_df = df.loc[df[group_col] == group]
        total_rows = len(group_df)
        train_rows = total_rows * train_size
        
        train_set = group_df.loc[:train_rows]
        test_set = group_df.loc[train_rows:]
    
        train_df = pd.concat([train_df,train_set])
        test_df = pd.concat([test_df,test_set])

    return train_df, test_df

In [5]:
train, test = train_test_split(df)

In [26]:
train.score.value_counts()

score
Wake        15508
Non REM     13786
REM          3027
Unscored     1924
Name: count, dtype: int64

In [27]:
test.score.value_counts()

score
Wake        6072
Non REM     1405
Unscored     950
REM          133
Name: count, dtype: int64