### Installing dependencies

                                                    Project details                                                                
                                 
The data represents various brain activities: resting, math & story tasks, working memory, and motor tasks.

    The 'Intra' folder contains data from one subject, while the 'Cross' folder includes multiple subjects.

Each file is a matrix of shape 248 x 35624, where 248 represents the number of sensors, and 35624 represents time steps.

The files have the following format: “taskType subjectIdentifier number.h5”
where taskType can be rest, task motor, task story math, and task working memory.

In practice, these tasks correspond to the activities performed by the subjects:

    • Resting Task
Recording the subjects’ brain while in a relaxed resting
state.

    • Math & Story Task
Subject performs mental calculation and language
processing task.

    • Working Memory task
Subject performs a memorization task.

    • Motor Task
Subject performs a motor task, typically moving fingers
or feets

In [11]:
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch import FloatTensor, LongTensor
from typing import Tuple, List, Callable, Optional
from sklearn.metrics import accuracy_score
import os
import numpy as np
from tqdm import tqdm
import random
import pandas as pd

Reading data:

In [12]:
def get_dataset_name(file_name_with_dir):
    filename_without_dir = file_name_with_dir.split('/')[-1]
    temp = filename_without_dir.split('_')[:-1]
    dataset_name = "_".join(temp)
    return dataset_name

## Functions for data preprocessing

In [13]:
# min-max scaling
def minmax(trial):
    min = trial.min()
    max = trial.max()
    normalisedTrial = (trial - min)/(max-min)
    return normalisedTrial

#Z-score normalisation OPTIONAL
def zscore(trial):
    mean = trial.mean()
    sd = trial.std()
    normalisedTrial = (trial - mean)/sd 
    return normalisedTrial

#downsamples data by totaltimesteps/factor
def downsample(trial, factor):
    ds_trial = trial[:,::factor]
    return ds_trial



In [14]:
def preprocess_files(files = None, path = 'Final Project data/Cross/train', downsampling = 30):
    label_to_int = {'rest': 0, 'task_motor': 1, 'task_story_math': 2, 'task_working_memory': 3}

    cross_data_train = [] # Store data
    cross_data_train_labels = [] # Store labels (based on filename)

    if files == None:
        files = os.listdir(path)

    for file in files:
        file_path = f'{path}/{file}'
        
        with h5py.File(file_path, 'r') as h5_file:
            # obtain labels
            dataset_name = get_dataset_name(file_path)
            label = dataset_name.split('_')
            label.remove(label[len(label)-1])
            label = '_'.join(label)
            cross_data_train_labels.append(label_to_int[label])
            
            # obtain X_data
            matrix = h5_file.get(dataset_name)[()]
            normalisedMatrix = downsample(zscore(matrix), downsampling) # apply minmax normalisation and downsampling
            cross_data_train.append(normalisedMatrix.T) # Transpose
             
    X = torch.from_numpy(np.array(cross_data_train)).float()
    y = torch.tensor(cross_data_train_labels)        
            
    return X, y

## RNN model

In [15]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        # RNN layer
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Forward pass through RNN
        rnn, _ = self.rnn(x)
        
        # Only take the output from the final time step
        output = self.fc(rnn[:, -1, :])
        return output
    

In [16]:
# hyperparams: lr, hidden_size, downsampling
def train(path, lr = 0.001, hidden_size = 200, downsampling = 30, print_results = True):
    random.seed = 123 # Set seed for reproducability
    input_size = 248
    output_size = 4
    network = RNN(input_size, hidden_size, output_size)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(network.parameters(), lr=lr)

    files = os.listdir(path)
    random.shuffle(files) # Shuffle order of files
    current_samples = []
    n = 8
    batch_index = 1

    for i, file in tqdm(enumerate(files)):
        current_samples.append(file)
        if len(current_samples) == n or i == (len(files)-1):
            if print_results:
                print(f"training batch {batch_index}...")
            X_train, y_train = preprocess_files(current_samples, path=path, downsampling=downsampling) 
            current_samples = []
            
            network.train()
            opt.zero_grad()
            output = network(X_train)
            loss = loss_fn(output, y_train)
            loss.backward()
            opt.step()
            
            batch_index += 1
    return network

In [17]:
# testing:
def test(network, paths):
    for path in paths:
        files = os.listdir(path)
        X, y = preprocess_files(files, path, 1)
        network.eval()

        test_output = network(X).detach().numpy()
        pred = np.argmax(test_output, axis=1) # to numpy
        y = y.numpy()
        return accuracy_score(pred, y)


In [18]:
def tune_hyperparams_rnn(data_type, lr_list, hidden_size_list, downsampling_list):
    path_training = f'Final Project data/{data_type}/train'
    if data_type == 'Cross':
        paths_testing = [ 'Final Project data/Cross/test1',  'Final Project data/Cross/test2',  'Final Project data/Cross/test3']
    else:
        paths_testing = ['Final Project data/Intra/test']
        
    results = pd.DataFrame(columns = ['lr', 'hidden_size', 'downsampling', 'acc'])
    
    for lr in lr_list:
        for hidden_size in hidden_size_list:
            for downsampling in downsampling_list:
                network = train(path_training, lr=lr, hidden_size=hidden_size, downsampling=downsampling, print_results=False)
                acc = test(network=network, paths=paths_testing)
                results = pd.concat([results, pd.DataFrame({'lr':[lr], 'hidden_size':[hidden_size], 'downsampling':[downsampling], 'acc':[acc]})])
    results.to_csv(f'results/{data_type}_results.csv')
    return results

In [19]:
tune_hyperparams_rnn('Cross', lr_list=[0.01, 0.001, 0.0001], hidden_size_list=[100, 150, 200, 250], downsampling_list=[1, 5, 15, 30])

0it [00:00, ?it/s]

64it [05:28,  5.13s/it]
  results = pd.concat([results, pd.DataFrame({'lr':[lr], 'hidden_size':[hidden_size], 'downsampling':[downsampling], 'acc':[acc]})])
64it [00:43,  1.48it/s]
64it [00:26,  2.38it/s]
64it [00:23,  2.70it/s]
64it [06:11,  5.80s/it]
64it [00:51,  1.25it/s]
64it [00:28,  2.22it/s]
64it [00:24,  2.63it/s]
64it [06:15,  5.87s/it]
64it [00:51,  1.24it/s]
64it [00:29,  2.19it/s]
64it [00:25,  2.47it/s]
64it [06:59,  6.56s/it]
64it [01:05,  1.03s/it]
64it [00:33,  1.89it/s]
64it [00:27,  2.31it/s]
64it [05:14,  4.92s/it]
64it [00:43,  1.48it/s]
64it [00:26,  2.40it/s]
64it [00:23,  2.69it/s]
64it [06:13,  5.84s/it]
64it [00:51,  1.24it/s]
64it [00:28,  2.21it/s]
64it [00:25,  2.53it/s]
64it [06:28,  6.07s/it]
64it [00:54,  1.17it/s]
64it [00:30,  2.13it/s]
64it [00:25,  2.55it/s]
64it [07:16,  6.83s/it]
64it [01:03,  1.00it/s]
64it [00:34,  1.86it/s]
64it [00:27,  2.31it/s]
64it [05:23,  5.06s/it]
64it [00:42,  1.49it/s]
64it [00:26,  2.39it/s]
64it [00:23,  2.68it/s]
64i

Unnamed: 0,lr,hidden_size,downsampling,acc
0,0.01,100,1,0.8125
0,0.01,100,5,0.625
0,0.01,100,15,0.625
0,0.01,100,30,0.875
0,0.01,150,1,0.5
0,0.01,150,5,0.75
0,0.01,150,15,0.875
0,0.01,150,30,0.5625
0,0.01,200,1,0.6875
0,0.01,200,5,0.6875


In [20]:
tune_hyperparams_rnn('Intra', lr_list=[0.01, 0.001, 0.0001], hidden_size_list=[100, 150, 200, 250], downsampling_list=[1, 5, 15, 30])

32it [02:42,  5.07s/it]
  results = pd.concat([results, pd.DataFrame({'lr':[lr], 'hidden_size':[hidden_size], 'downsampling':[downsampling], 'acc':[acc]})])
32it [00:22,  1.42it/s]
32it [00:14,  2.28it/s]
32it [00:12,  2.66it/s]
32it [03:06,  5.83s/it]
32it [00:26,  1.20it/s]
32it [00:14,  2.15it/s]
32it [00:11,  2.67it/s]
32it [03:21,  6.29s/it]
32it [00:28,  1.13it/s]
32it [00:14,  2.14it/s]
32it [00:13,  2.39it/s]
32it [03:35,  6.74s/it]
32it [00:31,  1.03it/s]
32it [00:16,  1.92it/s]
32it [00:14,  2.15it/s]
32it [02:42,  5.08s/it]
32it [00:22,  1.40it/s]
32it [00:13,  2.42it/s]
32it [00:11,  2.71it/s]
32it [03:08,  5.90s/it]
32it [00:25,  1.26it/s]
32it [00:14,  2.22it/s]
32it [00:13,  2.34it/s]
32it [03:17,  6.19s/it]
32it [00:28,  1.14it/s]
32it [00:15,  2.10it/s]
32it [00:12,  2.56it/s]
32it [03:36,  6.77s/it]
32it [00:30,  1.06it/s]
32it [00:16,  1.99it/s]
32it [00:13,  2.41it/s]
32it [02:39,  4.98s/it]
32it [00:22,  1.40it/s]
32it [00:13,  2.31it/s]
32it [00:12,  2.61it/s]
32i

Unnamed: 0,lr,hidden_size,downsampling,acc
0,0.01,100,1,1.0
0,0.01,100,5,1.0
0,0.01,100,15,1.0
0,0.01,100,30,1.0
0,0.01,150,1,1.0
0,0.01,150,5,1.0
0,0.01,150,15,1.0
0,0.01,150,30,1.0
0,0.01,200,1,1.0
0,0.01,200,5,1.0
