In [13]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import mne
import seaborn as sns
import nolds
from scipy import stats
from scipy.signal import welch
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.svm import SVC
from sklearn.impute import KNNImputer
from sklearn.metrics import f1_score, balanced_accuracy_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from tqdm import tqdm

In [148]:
# Notebook-wide configuration.
SEED = 42
SAMPLING_RATE = 256  # multiplied by 5 further down to get the window length in rows

# NOTE(review): absolute, machine-specific location; the labels live in a
# "labels" sub-directory of the dataset folder.
data_folder = "/Users/leluwy/Desktop/ETH/AICenterProjects/our_dataset"
labels_folder = data_folder + "/labels"

In [149]:
SEED = 42

# Add all subjects here; the first two will be chosen for test.
subjects = [
    'lea',
    'bjoern',
    'finn',
    'sarah',
    'aurora',
    'derek',
    'ronan',
    'dimi',
]

In [150]:
# Load, for every subject, the event table (labels + timestamps) and the
# pre-processed recording into subj_data[subject].
subj_data = {}
for subj in subjects:
    print(subj)

    # Event file is whitespace-delimited. sep=r"\s+" is the supported spelling
    # of the deprecated delim_whitespace=True.
    events = pd.read_csv(labels_folder + "/events_" + subj + ".txt", sep=r"\s+")
    # Drop rows whose "number" field is the literal string "condition".
    events = events[events["number"] != "condition"]

    recording_path = data_folder + "/" + subj + "_pre_processed_data.txt"
    if subj == 'aurora':
        # aurora is another format: whitespace-delimited
        df = pd.read_csv(recording_path, sep=r"\s+")
    else:
        # read_csv default delimiter (comma)
        df = pd.read_csv(recording_path)

    # The "number" column is used as the label code and the "type" column as
    # the timestamp, exactly as the events file names them.
    subj_data[subj] = {
        "labels": events["number"].to_numpy().astype(float),
        "timestamps": events["type"].to_numpy().astype(float),
        "data": df,
    }

lea
bjoern
finn
sarah
aurora
derek
ronan
dimi


In [151]:
# Sanity check: the first event of every subject must carry label 100;
# abort on the first subject for which it does not.
for x in subjects:
    first_label = subj_data[x]['labels'][0]
    if not (first_label == 100):
        raise Exception("Something wrong with labels for " + x)

In [152]:
def split_data(data, labels, timestamps):
    """Cut ``data`` into labelled segments delimited by consecutive timestamps.

    Event ``i`` (for ``i >= 1``) closes the segment that starts at
    ``timestamps[i - 1]`` and ends at ``timestamps[i]``; both bounds are
    truncated to ``int`` and used as a positional slice into ``data``.
    Segments closed by an event labelled 100 are dropped.

    Args:
        data: sliceable container (e.g. a DataFrame or a list).
        labels: indexable sequence of event codes, aligned with ``timestamps``.
        timestamps: indexable sequence of slice positions into ``data``.

    Returns:
        Tuple ``(x, y)``: ``x`` is the list of slices, ``y`` the list of
        class ids (195 -> 1, 196 -> 2, any other kept code -> 0).

    Raises:
        IndexError: if ``timestamps`` is empty or shorter than ``labels``.
    """
    def encode(code):
        # Defensive guard: the loop below never passes 100 in.
        if code == 100:
            raise Exception("Must skip labels with value 100!")
        return 1 if code == 195 else (2 if code == 196 else 0)

    segments, targets = [], []
    seg_start = timestamps[0]
    for idx in range(1, len(labels)):
        seg_end = timestamps[idx]
        code = labels[idx]
        if code != 100:
            segments.append(data[int(seg_start):int(seg_end)])
            targets.append(encode(code))
        # The next segment always begins where this event occurred.
        seg_start = seg_end
    return (segments, targets)

In [153]:
# Per subject: (list of data segments, list of class ids) as returned by split_data.
processed_subjects = {
    name: split_data(
        subj_data[name]['data'],
        subj_data[name]['labels'],
        subj_data[name]['timestamps'],
    )
    for name in subjects
}

In [197]:
# Build fixed-length windows: for every paragraph of every subject take the
# last three consecutive, non-overlapping windows of max_len rows (counted from
# the end of the paragraph) and give each window the paragraph's label.
subjects_numpy = []
labels_numpy = []
max_len = int(SAMPLING_RATE*5)  # window length in rows (5 s if SAMPLING_RATE is in Hz)
n_windows = 3
for s in subjects:
    # Use the current subject's segments; previously 'lea' was hard-coded here,
    # so lea's data was duplicated once per subject.
    paragraphs, paragraph_labels = processed_subjects[s]
    for i, paragraph in enumerate(paragraphs):
        # Drop the 'Time' column once per paragraph; rows are time steps.
        signal = paragraph.drop(columns=['Time']).to_numpy()
        n_rows = signal.shape[0]
        if n_rows < n_windows * max_len:
            # np.stack below would fail on ragged windows with an opaque message.
            raise ValueError(
                "Paragraph %d of subject %s has %d rows, need at least %d"
                % (i, s, n_rows, n_windows * max_len)
            )
        for k in range(n_windows):
            # k = 0 is the last window, k = 1 the one before it, and so on.
            stop = n_rows - k * max_len
            window = signal[stop - max_len:stop]
            # Transpose to (columns, rows) so the window length is the last axis.
            subjects_numpy.append(window.transpose())
            labels_numpy.append(np.array(paragraph_labels[i]))
subjects_numpy = np.stack(subjects_numpy)
labels_numpy = np.stack(labels_numpy)

In [209]:
# Expect (number of windows, columns left after dropping 'Time', window length).
print(tuple(subjects_numpy.shape))

(696, 64, 1280)


In [212]:
# scaling
# Flatten every window to one row so StandardScaler standardises each
# (column, time-step) position across windows, then restore the 3-D shape.
n_samples = subjects_numpy.shape[0]  # derived from the data instead of hard-coded
original_shape = subjects_numpy.shape
subjects_numpy_2d = subjects_numpy.reshape((n_samples, -1))
scaler = StandardScaler()
# fit_transform returns the scaled array and leaves its input untouched, so
# the result must be kept (it was previously discarded).
# NOTE(review): the scaler is fitted on all windows before the train/test
# split, so test statistics leak into the scaling - confirm this is acceptable.
subjects_numpy = scaler.fit_transform(subjects_numpy_2d).reshape(original_shape)

In [213]:
# dataset
X_test = subjects_numpy[:174]
y_test = labels_numpy[:174]
X_train = subjects_numpy[174:]
y_train = labels_numpy[:174]

In [215]:
# Persist the full feature and label arrays; np.save appends ".npy" to the names.
np.save(file="X", arr=subjects_numpy)
np.save(file="y", arr=labels_numpy)