© 2024 Nokia
Licensed under the BSD 3 Clause Clear License  
SPDX-License-Identifier: BSD-3-Clause-Clear

In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import torch
import os

In [37]:
# Sequence length handed to get_data() as `seq_len` when the splits are windowed.
# With 100 the windowing code takes 50 lagged and 50 leading values around each row,
# so every window ends up 101 values wide (see the printed window shapes further down).
timesteps = 100
# NOTE(review): `channels` is not referenced by any other cell visible in this notebook;
# the windowed arrays carry 5 feature channels — confirm whether this is still needed.
channels = 1

In [38]:
# Demographic attributes, reduced to one row per mesaid.
# The path is assembled with os.path.join so the cell works on any OS (the previous
# hard-coded backslashes only resolved on Windows), and the frame is built in a single
# non-inplace chain so re-running the cell always starts from the file on disk.
demographics = (
    pd.read_csv(os.path.join("..", "datasets", "MESA", "demographics.csv"), delimiter=";")
    .loc[:, ['mesaid', 'nsrr_age', 'nsrr_sex', 'nsrr_race']]
    .drop_duplicates(subset=['mesaid'])
)
demographics.head()

Unnamed: 0,mesaid,nsrr_age,nsrr_sex,nsrr_race
0,1,70.0,female,white
1,2,83.0,female,white
2,6,57.0,female,hispanic
3,10,57.0,male,white
4,12,80.0,male,white


In [39]:
# Load the task-1 train and test tables.
# os.path.join replaces the Windows-only "..\\datasets\\MESA\\..." literals so the same
# relative location resolves on Linux/macOS as well; on Windows the resulting path is unchanged.
train = pd.read_csv(os.path.join("..", "datasets", "MESA", "dftrain_task1.csv"))
test = pd.read_csv(os.path.join("..", "datasets", "MESA", "dftest_task1.csv"))

In [40]:
# Preview the first 1000 rows of the test table; the notebook display elides the middle
# rows, so only the head and tail of that slice are actually rendered.
test.head(1000)

Unnamed: 0,mesaid,linetime,marker,interval,binterval,activity,whitelight,redlight,greenlight,bluelight,gt,gt_sleep_block,wake
0,1080,1900-01-01 23:00:00,0.0,REST-S,1,0.0,0.01,0.0049,0.000,0.00,True,1,0.0
1,1080,1900-01-01 23:00:30,0.0,REST-S,1,0.0,0.01,0.0049,0.000,0.00,True,1,0.0
2,1080,1900-01-01 23:01:00,0.0,REST-S,1,41.0,0.01,0.0049,0.000,0.00,True,1,1.0
3,1080,1900-01-01 23:01:30,0.0,REST-S,1,0.0,0.01,0.0049,0.000,0.00,True,1,0.0
4,1080,1900-01-01 23:02:00,0.0,REST-S,1,0.0,0.01,0.0049,0.000,0.00,True,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1080,1900-01-02 07:17:30,0.0,ACTIVE,0,0.0,10.87,0.7680,0.996,0.41,False,0,0.0
996,1080,1900-01-02 07:18:00,0.0,ACTIVE,0,186.0,43.24,3.4200,3.900,1.41,False,0,1.0
997,1080,1900-01-02 07:18:30,0.0,ACTIVE,0,0.0,82.76,8.5000,7.000,2.47,False,0,0.0
998,1080,1900-01-02 07:19:00,0.0,ACTIVE,0,0.0,84.10,8.5000,7.150,2.47,False,0,0.0


In [41]:
# Standardise the "activity" column. Missing readings are replaced by 0.0 both when the
# scaler is fitted and when it is applied; the other sensor columns are left as loaded.
# NOTE(review): at this point `train` still contains the subjects that split_data()
# later moves into `val`, so the statistics are fitted on train+validation rows.
# NOTE(review): the cell overwrites train/test in place — running it twice re-fits and
# re-scales values that were already standardised.
scaler = StandardScaler().fit(train[["activity"]].fillna(0.0))

for frame in (train, test):
    frame["activity"] = scaler.transform(frame[["activity"]].fillna(0.0))

In [42]:
# split into train and validation
def split_data(df, percent=0.2, seed=42):
    """Split a frame into two parts by subject id (``mesaid``).

    All rows belonging to one ``mesaid`` end up on the same side of the split, so no
    subject is shared between the two returned frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``mesaid`` column.
    percent : float, default 0.2
        Fraction of the unique ids (rounded down) assigned to the held-out part.
    seed : int, default 42
        Seed for the id shuffle. The default reproduces the permutation that the
        previous ``np.random.seed(42); np.random.shuffle(...)`` sequence produced.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(remaining_rows, held_out_rows)``, each keeping the original row order and index.
    """
    uids = df.mesaid.unique()

    # A private RandomState yields the same shuffle as seeding the global generator,
    # but does not reset NumPy's global RNG state for the rest of the session.
    np.random.RandomState(seed).shuffle(uids)

    holdout_count = int(uids.shape[0] * percent)
    uids_holdout, uids_train = uids[:holdout_count], uids[holdout_count:]

    dftrain = df[df['mesaid'].isin(uids_train)]
    dfholdout = df[df['mesaid'].isin(uids_holdout)]
    return dftrain, dfholdout
# Hold out part of the subjects in `train` as the validation set (default fraction of split_data).
# NOTE(review): `train` is rebound to the reduced frame, so re-running this cell alone
# splits the already-reduced training set again; reload `train` before re-running.
train, val = split_data(train)

In [43]:
# Left-join the demographic columns onto every split (rows without a demographics
# match are kept), then keep only the id and the `wake` column for the list files.
listfile_columns = ['mesaid', 'wake']

merged_train = pd.merge(train, demographics, on='mesaid', how='left')
merged_val = pd.merge(val, demographics, on='mesaid', how='left')
merged_test = pd.merge(test, demographics, on='mesaid', how='left')

train_listfile = merged_train.loc[:, listfile_columns]
val_listfile = merged_val.loc[:, listfile_columns]
test_listfile = merged_test.loc[:, listfile_columns]

In [44]:
# Write the per-split list files next to the source data, without the index column.
# Paths are built with os.path.join instead of hard-coded backslashes so the export
# lands in ../datasets/MESA on every OS (the resulting path is unchanged on Windows).
train_listfile.to_csv(os.path.join("..", "datasets", "MESA", "train_listfile.csv"), index=False)
val_listfile.to_csv(os.path.join("..", "datasets", "MESA", "val_listfile.csv"), index=False)
test_listfile.to_csv(os.path.join("..", "datasets", "MESA", "test_listfile.csv"), index=False)

In [45]:
# Report (rows, columns) of the three tabular splits.
shape_template = "Shapes:\nTrain: ({}, {}) - Validation: ({}, {}) - Test: ({}, {})"
print(shape_template.format(*train.shape[:2], *val.shape[:2], *test.shape[:2]))

Shapes:
Train: (1449147, 13) - Validation: (365497, 13) - Test: (452015, 13)


In [92]:
from time import sleep
def extract_x_y(df, seq_len, mesaid, feature="activity"):
    """Build one window of neighbouring values per row for a single subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``mesaid``, ``gt`` and the requested ``feature`` column.
    seq_len : int
        Nominal sequence length; ``int(seq_len / 2 + 1) - 1`` lags and the same number
        of leads are taken (seq_len=100 -> 50 + 50, i.e. 101 columns with the centre).
    mesaid
        Subject id whose rows are selected.
    feature : str, default "activity"
        Column that is windowed.

    Returns
    -------
    x : numpy.ndarray, shape (n_rows, 1 + 2 * n_shifts)
        Column order is: current value, lag 1..k (earlier rows), lead 1..k (later rows).
        Positions that fall outside the subject's rows — and any NaN already present
        in ``feature`` — are filled with -1.
    y : numpy.ndarray
        One ``[1]`` / ``[0]`` entry per row, from the truthiness of ``gt``.
    """
    subject = df.loc[df["mesaid"] == mesaid, [feature, "gt"]]
    signal = subject[feature]

    # Same arithmetic as before; kept as-is so window width does not change.
    offsets = range(1, int(seq_len / 2 + 1))

    # Collect every shifted series first and build the frame once, instead of inserting
    # ~100 columns one at a time into an existing DataFrame (which fragments it).
    columns = {feature: signal}
    for s in offsets:
        columns["shift_%d" % s] = signal.shift(s)
    for s in offsets:
        columns["shift_-%d" % s] = signal.shift(-s)

    x = pd.DataFrame(columns).fillna(-1).values
    y = np.array([[1] if v else [0] for v in subject["gt"]])
    return x, y

def get_data(df, seq_len):
    """Window every subject in ``df`` for each sensor feature and stack the features as channels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``mesaid``, ``gt`` and the five feature columns listed below.
    seq_len : int
        Forwarded to ``extract_x_y`` for every subject/feature pair.

    Returns
    -------
    x_channels : numpy.ndarray, shape (n_rows, window_width, 5)
        Channel order follows ``features``; rows are ordered by first appearance of
        each ``mesaid`` in ``df``, then by row order within the subject.
    y : numpy.ndarray
        Concatenated labels as returned by ``extract_x_y`` (identical for every feature).

    Raises
    ------
    ValueError
        If ``df`` contains no ``mesaid`` groups.
    """
    features = ["activity", "whitelight", "redlight", "greenlight", "bluelight"]

    # Group once: each subject's rows are handed to extract_x_y directly, so it no longer
    # has to scan the full frame for every subject. sort=False keeps first-appearance
    # order, matching df.mesaid.unique().
    per_subject = df.groupby("mesaid", sort=False)
    if len(per_subject) == 0:
        raise ValueError("get_data() needs at least one mesaid in df")

    channels = []
    labels = None
    for feature in features:
        print("Feature: {}".format(feature))
        x_parts, y_parts = [], []
        for mid, subject_rows in tqdm(per_subject):
            # Pass the feature for *every* subject; previously the first subject fell
            # back to the "activity" default for all non-activity channels.
            x_tmp, y_tmp = extract_x_y(subject_rows, seq_len, mid, feature=feature)
            x_parts.append(x_tmp)
            y_parts.append(y_tmp)
        # One concatenate per feature instead of growing the array inside the loop.
        channels.append(np.concatenate(x_parts))
        labels = np.concatenate(y_parts)

    return np.stack(channels, axis=-1), labels

In [93]:
def _window_split(banner, frame):
    """Print the banner for a split, then window it with the notebook-wide `timesteps`."""
    print(banner)
    return get_data(frame, timesteps)


# Each call returns (windows, labels); get_data already adds the channel axis, so no
# further reshaping is needed afterwards.
x_train, y_train = _window_split("\nWindowing training data...\n", train)
x_val, y_val = _window_split("\nWindowing validation data...\n", val)
x_test, y_test = _window_split("\nWindowing test data...\n", test)


Windowing training data...

Feature: activity


100%|██████████| 1163/1163 [03:17<00:00,  5.89it/s]


Feature: whitelight


100%|██████████| 1163/1163 [03:19<00:00,  5.83it/s]


Feature: redlight


100%|██████████| 1163/1163 [03:18<00:00,  5.86it/s]


Feature: greenlight


100%|██████████| 1163/1163 [03:18<00:00,  5.87it/s]


Feature: bluelight


100%|██████████| 1163/1163 [03:08<00:00,  6.18it/s]



Windowing validation data...

Feature: activity


100%|██████████| 289/289 [00:18<00:00, 15.29it/s]


Feature: whitelight


100%|██████████| 289/289 [00:17<00:00, 16.08it/s]


Feature: redlight


100%|██████████| 289/289 [00:21<00:00, 13.62it/s]


Feature: greenlight


100%|██████████| 289/289 [00:19<00:00, 14.64it/s]


Feature: bluelight


100%|██████████| 289/289 [00:18<00:00, 15.49it/s]



Windowing test data...

Feature: activity


100%|██████████| 362/362 [00:27<00:00, 13.08it/s]


Feature: whitelight


100%|██████████| 362/362 [00:28<00:00, 12.80it/s]


Feature: redlight


100%|██████████| 362/362 [00:26<00:00, 13.80it/s]


Feature: greenlight


100%|██████████| 362/362 [00:26<00:00, 13.88it/s]


Feature: bluelight


100%|██████████| 362/362 [00:26<00:00, 13.82it/s]


In [94]:
# Report the shape of the windowed arrays for each split.
window_shapes = [arr.shape for arr in (x_train, x_val, x_test)]
print("Shapes:\nTrain: ({}) - Validation: ({}) - Test: ({})".format(*window_shapes))

Shapes:
Train: ((1449147, 101, 5)) - Validation: ((365497, 101, 5)) - Test: ((452015, 101, 5))


In [95]:
# Count how often each `gt` value occurs. Only the test split is inspected here;
# the train and validation splits are not summarised in this cell.
print("Label Distribution")
test['gt'].value_counts()

Label Distribution


True     262499
False    189516
Name: gt, dtype: int64

In [96]:
def _to_tensor_dict(x, y):
    """Wrap one split as ``{'samples': tensor, 'labels': tensor}``.

    ``x`` is expected as (n_windows, window_width, n_channels), which is the layout the
    windowing step prints above; the last two axes are swapped so samples come out as
    (n_windows, n_channels, window_width). ``y`` is flattened to one label per window.
    """
    samples = torch.from_numpy(x).permute(0, 2, 1)
    # reshape(-1) instead of squeeze(): squeeze() would also drop the sample axis when a
    # split holds exactly one window, producing a 0-d label tensor.
    labels = torch.from_numpy(np.asarray(y).reshape(-1))
    return {'samples': samples, 'labels': labels}


# TRAIN
train_tensor = _to_tensor_dict(x_train, y_train)
train_samples, train_labels = train_tensor['samples'], train_tensor['labels']

# VAL
val_tensor = _to_tensor_dict(x_val, y_val)
val_samples, val_labels = val_tensor['samples'], val_tensor['labels']

# TEST
test_tensor = _to_tensor_dict(x_test, y_test)
test_samples, test_labels = test_tensor['samples'], test_tensor['labels']

In [97]:
# SAVE AS .PT
# Build the target directory with os.path.join so it resolves on any OS; the trailing ""
# keeps the directory separator at the end, so on Windows `path` is the same string as
# the previous hard-coded literal.
path = os.path.join("..", "datasets", "MESA", "")
torch.save(train_tensor, os.path.join(path, "train.pt"))
torch.save(val_tensor, os.path.join(path, "val.pt"))
torch.save(test_tensor, os.path.join(path, "test.pt"))