### Installing dependencies

                                                    Project details                                                                
                                 
The data represents various brain activities: resting, math & story tasks, working memory, and motor tasks.

    The 'Intra' folder contains data from one subject, while the 'Cross' folder includes multiple subjects.

Each file is a matrix of shape 248 x 35624, where 248 represents the number of sensors, and 35624 represents time steps.

The files have the following format: “taskType subjectIdentifier number.h5”
where taskType can be rest, task motor, task story math, and task working memory.

In practice, these tasks correspond to the activities performed by the subjects:

    • Resting Task
Recording the subjects’ brain while in a relaxed resting
state.

    • Math & Story Task
Subject performs mental calculation and language
processing task.

    • Working Memory task
Subject performs a memorization task.

    • Motor Task
Subject performs a motor task, typically moving fingers
or feets

In [10]:
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch import FloatTensor, LongTensor
from typing import Tuple, List, Callable, Optional
from sklearn.metrics import accuracy_score
import os
import numpy as np
from tqdm import tqdm
import random

ModuleNotFoundError: No module named 'pandas'

Reading data:

In [2]:
def get_dataset_name(file_name_with_dir):
    filename_without_dir = file_name_with_dir.split('/')[-1]
    temp = filename_without_dir.split('_')[:-1]
    dataset_name = "_".join(temp)
    return dataset_name

## Functions for data preprocessing

In [3]:
# min-max scaling
def minmax(trial):
    min = trial.min()
    max = trial.max()
    normalisedTrial = (trial - min)/(max-min)
    return normalisedTrial

#Z-score normalisation OPTIONAL
def zscore(trial):
    mean = trial.mean()
    sd = trial.std()
    normalisedTrial = (trial - mean)/sd 
    return normalisedTrial

#downsamples data by totaltimesteps/factor
def downsample(trial, factor):
    ds_trial = trial[:,::factor]
    return ds_trial



In [4]:
def preprocess_files(files = None, path = 'Final Project data/Cross/train', downsampling = 30):
    label_to_int = {'rest': 0, 'task_motor': 1, 'task_story_math': 2, 'task_working_memory': 3}

    cross_data_train = [] # Store data
    cross_data_train_labels = [] # Store labels (based on filename)

    if files == None:
        files = os.listdir(path)

    for file in files:
        file_path = f'{path}/{file}'
        
        with h5py.File(file_path, 'r') as h5_file:
            # obtain labels
            dataset_name = get_dataset_name(file_path)
            label = dataset_name.split('_')
            label.remove(label[len(label)-1])
            label = '_'.join(label)
            cross_data_train_labels.append(label_to_int[label])
            
            # obtain X_data
            matrix = h5_file.get(dataset_name)[()]
            normalisedMatrix = downsample(zscore(matrix), downsampling) # apply minmax normalisation and downsampling
            cross_data_train.append(normalisedMatrix.T) # Transpose
             
    X = torch.from_numpy(np.array(cross_data_train)).float()
    y = torch.tensor(cross_data_train_labels)        
            
    return X, y

## RNN model

In [5]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        # RNN layer
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Forward pass through RNN
        rnn, _ = self.rnn(x)
        
        # Only take the output from the final time step
        output = self.fc(rnn[:, -1, :])
        return output
    

In [6]:
# hyperparams: lr, hidden_size, downsampling
def train(path, lr = 0.001, hidden_size = 200, downsampling = 30):
    random.seed = 123 # Set seed for reproducability
    input_size = 248
    output_size = 4
    network = RNN(input_size, hidden_size, output_size)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(network.parameters(), lr=lr)

    files = os.listdir(path)
    random.shuffle(files) # Shuffle order of files
    current_samples = []
    n = 8
    batch_index = 1

    for i, file in tqdm(enumerate(files)):
        current_samples.append(file)
        if len(current_samples) == n or i == (len(files)-1):
            print(f"training batch {batch_index}...")
            X_train, y_train = preprocess_files(current_samples, downsampling=downsampling) 
            current_samples = []
            
            network.train()
            opt.zero_grad()
            output = network(X_train)
            loss = loss_fn(output, y_train)
            loss.backward()
            opt.step()
            
            batch_index += 1
    return network

In [7]:
# testing:
def test(network, paths):
    for path in paths:
        files = os.listdir(path)
        X, y = preprocess_files(files, path, 1)
        network.eval()

        test_output = network(X).detach().numpy()
        pred = np.argmax(test_output, axis=1) # to numpy
        y = y.numpy()
        return accuracy_score(pred, y)


In [8]:
path = 'Final Project data/Cross/train'
network = train(path=path)

0it [00:00, ?it/s]

training batch 1...


8it [00:03,  2.18it/s]

training batch 2...


16it [00:07,  2.00it/s]

training batch 3...


24it [00:10,  2.28it/s]

training batch 4...


32it [00:13,  2.43it/s]

training batch 5...


40it [00:16,  2.47it/s]

training batch 6...


48it [00:20,  2.49it/s]

training batch 7...


56it [00:23,  2.56it/s]

training batch 8...


64it [00:25,  2.47it/s]


In [9]:
paths = [ 'Final Project data/Cross/test1',  'Final Project data/Cross/test2',  'Final Project data/Cross/test3']
test(network=network, paths=paths)

0.8125