First step is preprocessing. For now, to begin, we will focus solely on S2, and on dividing the dictionary into segmented windows of 60s, with 30s of step/overlap (this could in theory be modified as a sort of data hyperparam).

Preprocessing includes 3-97% winsorization, which clips extreme values in the data to the 3rd and 97th percentiles. It would also be beneficial for performance to downsample the signals (700 Hz is far more than needed). All RespiBAN (chest) signals are recorded at 700 Hz and can be downsampled: ECG to 70 Hz, ACC & EMG to 10 Hz, and EDA, TEMP & RESP to 3.5 Hz. The Empatica (wrist) sampling rates are reasonable enough to keep unchanged, except for ACC, which the code below downsamples by a factor of 4 (to 8 Hz). I am skipping the Butterworth low-pass filter.

Finally, I apply min-max normalization to each signal, so that every signal ends up in the range [0, 1].

In [35]:
# first, open the pkl file
import pickle
import numpy as np

# Participant id; it selects which WESAD/S<id>/S<id>.pkl recording is loaded.
subject = 2

file_path = f"WESAD/S{subject}/S{subject}.pkl"
# NOTE: unpickling executes whatever the file contains, so only load dataset
# files obtained from a trusted source.
with open(file_path, mode='rb') as pkl_handle:
    # presumably the dataset pickles were written by Python 2, which is why the
    # 8-bit strings are decoded as latin1 -- TODO confirm
    data = pickle.load(pkl_handle, encoding='latin1')

In [36]:
# Quick look at the loaded recording: the signal names available per device,
# the number of label samples, and the number of chest ECG samples.
chest_signals = data['signal']['chest']
wrist_signals = data['signal']['wrist']
print(chest_signals.keys())
print(wrist_signals.keys())
print(len(data['label'])) # by default, labels are also sampled at 700 Hz.
print(len(chest_signals['ECG']))
# Bare expression: shown as the cell's output when run in a notebook.
chest_signals['ECG']

dict_keys(['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'])
dict_keys(['ACC', 'BVP', 'EDA', 'TEMP'])
4255300
4255300


array([[ 0.02142334],
       [ 0.02032471],
       [ 0.01652527],
       ...,
       [-0.00544739],
       [ 0.00013733],
       [ 0.0040741 ]])

In [37]:
from scipy.stats import mstats

# Decimation step per signal: preprocess() keeps every k-th sample.
# Chest signals start at 700 Hz, so a step of 10 gives 70 Hz, 70 gives 10 Hz
# and 200 gives 3.5 Hz. A step of 1 leaves a signal at its recorded rate.
_chest_steps = {"ECG": 10, "ACC": 70, "EMG": 70, "EDA": 200, "Temp": 200, "Resp": 200}
_wrist_steps = {"ACC": 4, "BVP": 1, "EDA": 1, "TEMP": 1}

# Flattened to "<device>/<signal>" keys, which is how preprocess() looks them up.
downsampling_factor = {
    **{f"chest/{name}": step for name, step in _chest_steps.items()},
    **{f"wrist/{name}": step for name, step in _wrist_steps.items()},
}

#3% winsorization + downsampling + min-max normalization of signals in data
def preprocess(arrayobj, device, signal, factors=None):
    """Winsorize, downsample and min-max scale one recorded signal.

    Args:
        arrayobj: Array of samples for one signal. Downsampling slices along
            its first axis.
        device: Device key; combined with ``signal`` as ``"<device>/<signal>"``
            to look up the decimation step.
        signal: Signal key.
        factors: Optional mapping from ``"<device>/<signal>"`` to the
            decimation step. Defaults to the module-level
            ``downsampling_factor`` table.

    Returns:
        The processed array, scaled so that its minimum is 0 and its maximum
        is 1. A constant signal is returned as all zeros.

    Raises:
        KeyError: If ``"<device>/<signal>"`` is missing from the factor table.
    """
    if factors is None:
        factors = downsampling_factor
    step = factors[f"{device}/{signal}"]

    # Clip the lowest and highest 3% of the values to the nearest kept value.
    # NOTE(review): no `axis` is passed here and min/max below are taken over
    # the whole array, so a multi-column signal (ACC, presumably) is clipped
    # and scaled with one range shared by all columns -- confirm this is
    # intended.
    clipped = mstats.winsorize(arrayobj, limits=[0.03, 0.03])

    # Plain decimation: keep every `step`-th sample, no low-pass filter first.
    decimated = clipped[::step]

    min_value, max_value = np.min(decimated), np.max(decimated)
    span = max_value - min_value
    # A constant signal has max == min. Dividing by that zero span would not
    # give a usable 0-1 signal, so divide by 1 and return all zeros instead.
    if span == 0:
        span = 1
    return (decimated - min_value) / span


# Overwrite every raw signal in `data` with its preprocessed version, printing
# the value range before and after so the rescaling can be checked by eye.
# NOTE: this modifies `data` in place; running it twice on the same `data`
# would downsample the already-downsampled signals again.
for device, device_signals in data['signal'].items():
    for signal in list(device_signals):
        raw_signal = device_signals[signal]
        print(f"Old range for {device}/{signal}:")
        print([np.min(raw_signal), np.max(raw_signal)])

        processed_signal = preprocess(raw_signal, device, signal)
        device_signals[signal] = processed_signal

        print(f"New range for {device}/{signal}:")
        print([np.min(processed_signal), np.max(processed_signal)])

Old range for chest/ACC:
[np.float64(-1.1354000568389893), np.float64(2.0297999382019043)]
New range for chest/ACC:
[np.float64(0.0), np.float64(1.0)]
Old range for chest/ECG:
[np.float64(-1.499542236328125), np.float64(1.4993133544921875)]
New range for chest/ECG:
[np.float64(0.0), np.float64(1.0)]
Old range for chest/EMG:
[np.float64(-0.4149627685546875), np.float64(0.3009796142578125)]
New range for chest/EMG:
[np.float64(0.0), np.float64(1.0)]
Old range for chest/EDA:
[np.float64(0.263214111328125), np.float64(7.576751708984375)]
New range for chest/EDA:
[np.float64(0.0), np.float64(1.0)]
Old range for chest/Temp:
[np.float32(28.045258), np.float32(34.37039)]
New range for chest/Temp:
[np.float32(0.0), np.float32(1.0)]
Old range for chest/Resp:
[np.float64(-27.90374755859375), np.float64(27.37884521484375)]
New range for chest/Resp:
[np.float64(0.0), np.float64(1.0)]
Old range for wrist/ACC:
[np.float64(-128.0), np.float64(127.0)]
New range for wrist/ACC:
[np.float64(0.0), np.float

In [38]:
# Number of label samples, then how many samples carry each label id
# (position k in the printed array is the count for label k).
label_track = data['label']
print(len(label_track))
print(np.bincount(label_track))


4255300
[2142701  800800  430500  253400  537599       0   45500   44800]


Now the task is to divide the data and labels into windows, and then save the windows to a python list object, which can then be pickled for easy reloading later.

In [20]:
windows = []
# Sampling rate in Hz of each signal as it is sliced below, i.e. after the
# downsampling done during preprocessing. These values have to stay consistent
# with `downsampling_factor`.
frequencies = {
    "chest/ECG": 70,
    "chest/ACC": 10,
    "chest/EMG": 10,
    "chest/EDA": 3.5,
    "chest/Temp": 3.5,
    "chest/Resp": 3.5,
    "wrist/ACC": 8,
    "wrist/BVP": 64,
    "wrist/EDA": 4,
    "wrist/TEMP": 4,
}

# Labels are sampled at 700 Hz, so with the defaults below one window covers
# 60 * 700 label samples and consecutive windows start 30 * 700 samples apart.
labelfreq = 700
# both in seconds
windowsize = 60
windowstep = 30

label_window_len = windowsize * labelfreq
label_step_len = windowstep * labelfreq

# Count exactly the windows that fit completely inside the recording. The
# count is derived from the window length, so it stays correct when
# `windowsize` / `windowstep` are changed (a fixed "- 2" is only meaningful
# when the window is twice the step, and even then it drops the last complete
# window).
n_windows = max(0, (len(data["label"]) - label_window_len) // label_step_len + 1)

for i in range(n_windows):
    label_start = i * label_step_len
    labels = data['label'][label_start:label_start + label_window_len]
    label = np.bincount(labels).argmax() # finds the most common element of an integer array

    # Keep only windows whose majority label is 1, 2 or 3 (counted further
    # down as baseline, stress and amusement). Labels 0, 5, 6, 7 are ignored,
    # and label 4 (meditation) is dropped too, as it is usually not considered.
    # Checking this first avoids slicing signals for discarded windows.
    if label in [0, 4, 5, 6, 7]:
        continue

    datawindow = {'signal': {}}
    for device in data['signal'].keys():
        datawindow['signal'][device] = {}
        for signal in data['signal'][device].keys():
            fkey = f"{device}/{signal}"
            # Same time span as the label slice, expressed in this signal's
            # own sample rate.
            start = int(i * frequencies[fkey] * windowstep)
            stop = int(i * frequencies[fkey] * windowstep + frequencies[fkey] * windowsize)
            datawindow['signal'][device][signal] = np.array(data['signal'][device][signal][start:stop])

    datawindow['label'] = label
    datawindow['subject'] = data['subject']
    windows.append(datawindow)
print("done")


done


In [21]:
# Print the majority label of every kept window, one per line, in window order.
for kept_window in windows:
    window_label = int(kept_window['label'])
    print(window_label)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3


In [22]:
import os

# Save this subject's windows so they can be reloaded without re-running the
# preprocessing above.
output_path = f'windows/S{subject}windows.pkl'
# Create the target folder on first use instead of failing with FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as file:
    pickle.dump(windows, file)

# f-string so the message shows the real path rather than the literal template.
print(f"List has been pickled and saved to '{output_path}'.")

List has been pickled and saved to f'windows/S{subject}windows.pkl'.


Now, each window is stored as a dictionary in the .pkl file corresponding to the subject. This window data can be opened and used in training. To process another subject, just change the `subject` variable and re-run the cells above.

Below is a loop to create a pkl list for all windows across all subjects.

In [39]:
import pickle
import numpy as np
from scipy.stats import mstats


subjects = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]
windows = []

# Decimation step per signal: preprocess() keeps every k-th sample.
downsampling_factor = {
    "chest/ECG": 10,
    "chest/ACC": 70,
    "chest/EMG": 70,
    "chest/EDA": 200,
    "chest/Temp": 200,
    "chest/Resp": 200,
    "wrist/ACC": 4,
    "wrist/BVP": 1,
    "wrist/EDA": 1,
    "wrist/TEMP": 1,
}

# Sampling rate in Hz of each signal after the downsampling above; used to
# convert window times into sample indices. Must stay consistent with
# `downsampling_factor`.
frequencies = {
    "chest/ECG": 70,
    "chest/ACC": 10,
    "chest/EMG": 10,
    "chest/EDA": 3.5,
    "chest/Temp": 3.5,
    "chest/Resp": 3.5,
    "wrist/ACC": 8,
    "wrist/BVP": 64,
    "wrist/EDA": 4,
    "wrist/TEMP": 4,
}

# Labels are sampled at 700 Hz, so with the defaults below one window covers
# 60 * 700 label samples and consecutive windows start 30 * 700 samples apart.
labelfreq = 700
# in seconds
windowsize = 60
windowstep = 30

label_window_len = windowsize * labelfreq
label_step_len = windowstep * labelfreq


#3% winsorization + downsampling + min-max normalization of signals in data
def preprocess(arrayobj, device, signal, factors=None):
    """Winsorize, downsample and min-max scale one recorded signal.

    Args:
        arrayobj: Array of samples for one signal. Downsampling slices along
            its first axis.
        device: Device key; combined with ``signal`` as ``"<device>/<signal>"``
            to look up the decimation step.
        signal: Signal key.
        factors: Optional mapping from ``"<device>/<signal>"`` to the
            decimation step. Defaults to ``downsampling_factor``.

    Returns:
        The processed array, scaled so that its minimum is 0 and its maximum
        is 1. A constant signal is returned as all zeros.

    Raises:
        KeyError: If ``"<device>/<signal>"`` is missing from the factor table.
    """
    if factors is None:
        factors = downsampling_factor
    step = factors[f"{device}/{signal}"]

    # Clip the lowest and highest 3% of the values, then keep every
    # `step`-th sample (plain decimation, no low-pass filter first).
    clipped = mstats.winsorize(arrayobj, limits=[0.03, 0.03])
    decimated = clipped[::step]

    min_value, max_value = np.min(decimated), np.max(decimated)
    span = max_value - min_value
    # A constant signal has max == min. Dividing by that zero span would not
    # give a usable 0-1 signal, so divide by 1 and return all zeros instead.
    if span == 0:
        span = 1
    return (decimated - min_value) / span


for subject in subjects:

    file_path = f"WESAD/S{subject}/S{subject}.pkl"
    # NOTE: unpickling executes whatever the file contains, so only load
    # dataset files obtained from a trusted source.
    with open(file_path, 'rb') as file:
        data = pickle.load(file, encoding='latin1')

    # Replace every raw signal with its preprocessed version.
    for device in data['signal'].keys():
        for signal in data['signal'][device].keys():
            data['signal'][device][signal] = preprocess(data['signal'][device][signal], device, signal)

    # Count exactly the windows that fit completely inside this recording.
    # The count is derived from the window length, so it stays correct when
    # `windowsize` / `windowstep` are changed (a fixed "- 2" is only
    # meaningful when the window is twice the step, and even then it drops
    # the last complete window).
    n_windows = max(0, (len(data["label"]) - label_window_len) // label_step_len + 1)

    for i in range(n_windows):
        label_start = i * label_step_len
        labels = data['label'][label_start:label_start + label_window_len]
        label = np.bincount(labels).argmax() # finds the most common element of an integer array

        # Keep only windows whose majority label is 1, 2 or 3 (counted
        # further down as baseline, stress and amusement). Labels 0, 5, 6, 7
        # are ignored, and label 4 (meditation) is dropped too, as it is
        # usually not considered. Checking this first avoids slicing signals
        # for discarded windows.
        if label in [0, 4, 5, 6, 7]:
            continue

        datawindow = {'signal': {}}
        for device in data['signal'].keys():
            datawindow['signal'][device] = {}
            for signal in data['signal'][device].keys():
                fkey = f"{device}/{signal}"
                # Same time span as the label slice, expressed in this
                # signal's own sample rate.
                start = int(i * frequencies[fkey] * windowstep)
                stop = int(i * frequencies[fkey] * windowstep + frequencies[fkey] * windowsize)
                datawindow['signal'][device][signal] = np.array(data['signal'][device][signal][start:stop])

        datawindow['label'] = label
        datawindow['subject'] = data['subject']
        windows.append(datawindow)
    print("done")

output_path = 'allwindows.pkl'
with open(output_path, 'wb') as file:
    pickle.dump(windows, file)

print(f"List has been pickled and saved to '{output_path}'.")

done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
List has been pickled and saved to f'allwindows.pkl'.


In [40]:
print(len(windows)) # total number of kept windows across all subjects; only windows whose majority label is 1, 2 or 3 survive the filter above

3320


In [41]:
# Class balance of the kept windows. Each class is matched on its own label id
# (1 = baseline, 2 = stress, 3 = amusement) rather than treating "anything
# else" as amusement, so the counts stay correct if the label filter is ever
# relaxed (e.g. to keep label 4).
window_labels = [int(window['label']) for window in windows]
baseline = window_labels.count(1)
stress = window_labels.count(2)
amusement = window_labels.count(3)

print(baseline)
print(stress)
print(amusement)

1761
1000
559
