In [2]:
# imports
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from tqdm import tqdm
from itertools import product
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import torch.nn.utils.rnn as RNN
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 42

data = pd.read_csv('data.csv', delimiter=';')
data.keys()

Index(['RECORDING_SESSION_LABEL', 'trial', 'IA_ID', 'item', 'list', 'IA_LABEL',
       'wordlength', 'condition', 'is_critical', 'is_spill1', 'is_spill2',
       'is_spill3', 'filler', 'LF', 'HF', 'function_word', 'other_filler',
       'composite', 'fixation_duration', 'duration_firstpass',
       'duration_firstfixation', 'fix_count', 'avg_pupil',
       'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_COUNT', 'saccade_length',
       'saccade_duration', 'go_past_time', 'sentenceCondition'],
      dtype='object')

In [3]:
filtered = data.copy()

filtered["sentenceCondition"] = filtered["sentenceCondition"].map(lambda x: x.replace("none", "2"))
filtered["sentenceCondition"] = filtered["sentenceCondition"].map(lambda x: x.replace("control", "0"))
filtered["sentenceCondition"] = filtered["sentenceCondition"].map(lambda x: x.replace("pseudo", "1"))
filtered["sentenceCondition"] = filtered["sentenceCondition"].map(lambda x: x.replace("filler", "3"))

filtered['attention'] = filtered['condition'].copy()

filtered['condition'] = filtered['condition'].map(lambda x: x.replace("control", "0"))
filtered['condition'] = filtered['condition'].map(lambda x: x.replace("pseudo", "1"))
filtered['condition'] = filtered['condition'].map(lambda x: x.replace("filler", "2"))
filtered['condition'] = filtered['condition'].map(lambda x: x.replace("none", "3"))

filtered['attention'] = filtered['attention'].map(lambda x: x.replace("control", "1"))
filtered['attention'] = filtered['attention'].map(lambda x: x.replace("pseudo", "1"))
filtered['attention'] = filtered['attention'].map(lambda x: x.replace("filler", "0"))
filtered['attention'] = filtered['attention'].map(lambda x: x.replace("none", "0"))


filtered["sentenceCondition"] = filtered["sentenceCondition"].astype(int)
filtered['condition'] = filtered['condition'].astype(int)
filtered['attention'] = filtered['attention'].astype(int)

control = filtered.loc[filtered['sentenceCondition'] == 0].copy()
pseudo = filtered.loc[filtered['sentenceCondition'] == 1].copy()
mapped = pd.concat([control, pseudo])


mapped.drop(["IA_ID", "item", "list", "IA_LABEL"], axis=1, inplace=True)
normalized = mapped[['fixation_duration',
       'duration_firstpass', 'duration_firstfixation', 'fix_count',
       'avg_pupil', 'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_COUNT',
       'saccade_length', 'saccade_duration', 'go_past_time']]
normalized = (normalized - normalized.mean()) / normalized.std()
mapped[['fixation_duration',
       'duration_firstpass', 'duration_firstfixation', 'fix_count',
       'avg_pupil', 'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_COUNT',
       'saccade_length', 'saccade_duration', 'go_past_time']] = normalized
sentences = mapped.groupby(['RECORDING_SESSION_LABEL', 'trial'])
print(len(sentences))  

343


In [4]:
label_array = list()
features_array = list()
attention_mask_array = list()
for item in sentences:
    label_array.append(item[1]["condition"])
    attention_mask_array.append(item[1]['attention'])
    features = item[1].drop(['RECORDING_SESSION_LABEL', 'trial', 'sentenceCondition', 'condition'], axis=1).to_numpy()
    features = (features - features.mean()) / features.std()
    #print(features.shape)
    features_array.append(features)

def pad_matrix_to_same_size(lists):
    maxlen = max([len(l) for l in lists])
    return [np.concatenate((np.zeros((maxlen - l.shape[0], l.shape[1])), l), axis=0) for l in lists]

def pad_series_to_same_size(lists):
    maxlen = max([len(l) for l in lists])
    return [np.concatenate((np.zeros((maxlen - len(l))), l), axis=0) for l in lists]

lengths = np.array([len(l) for l in features_array])
padded_features_array = np.array(pad_matrix_to_same_size(features_array))
padded_attention_mask_array = np.array(pad_series_to_same_size(attention_mask_array))
padded_label_array = np.array(pad_series_to_same_size(label_array))

In [5]:
class CustomDataset(Dataset):
    def __init__(self, features, labels, attention_mask):
        self.features = features
        self.labels = labels
        self.attention_mask = attention_mask

    def __getitem__(self, index):
        features = self.features[index]
        label = self.labels[index]
        attention_mask = self.attention_mask[index]
        return features, label, attention_mask

    def __len__(self):
        return len(self.features)

dataset = CustomDataset(features=padded_features_array, labels=padded_label_array, attention_mask=padded_attention_mask_array)

In [6]:
def k_fold_split_data(dataset, batch_size, k=5):
    n = len(dataset)
    fold_size = n // k
    folds = []
    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size if i < k - 1 else n
        folds.append(torch.utils.data.Subset(dataset, range(start, end)))

    dataloaders = []
    for i in range(k):
        validation_dataset = folds[i]
        train_folds = [folds[j] for j in range(k) if j != i]
        train_dataset = torch.utils.data.ConcatDataset(train_folds)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)
        dataloaders.append((train_dataloader, validation_dataloader))

    return dataloaders

In [20]:
def train_test(model, dataloader, optimizer, training="train"):
   
    loss_function = torch.nn.BCEWithLogitsLoss()

    if training == "train":
        model.train()
    elif training == "validation":
        model.eval()
    elif training == "test":
        model.eval()
    else:
        raise ValueError("training argument must be either 'train', 'validation' or 'test'")
        
    cumulative_loss = 0
    prediction_list = []
    label_list = []
    sigmoid = torch.nn.Sigmoid()
    
    for sample in dataloader:
        input, targets, attention_mask = sample[0].float().to(device), sample[1].type(torch.LongTensor).to(device) , sample[2].to(device)
        output = model(input).to(device)
        predictions = output#[attention_mask == 1]

        targets = targets[attention_mask == 1]
        loss_value = loss_function(predictions, targets.unsqueeze(1).float())
        cumulative_loss += loss_value.item()

        if training == "train":
            optimizer.zero_grad()
            loss_value.sum().backward()
            optimizer.step()
        predictions = [round(x) for x in sigmoid(predictions.squeeze(1)).to('cpu').detach().numpy().tolist()]
        
        target_labels = sample[1][attention_mask.to('cpu') == 1]
        prediction_list.extend(predictions)
        label_list.extend(target_labels)

    if training == "test":
        print(confusion_matrix(label_list, prediction_list))
        return label_list, prediction_list
    f1 = f1_score(label_list, prediction_list)
    accuracy = accuracy_score(label_list, prediction_list)
    confusion = confusion_matrix(label_list, prediction_list)

    return cumulative_loss, accuracy, f1, confusion

In [25]:
class TuneableModel(torch.nn.Module):
    def __init__(self, input_size, ouput_size, kernel_size=2, stride=1, padding=0):
        super(TuneableModel, self).__init__()
        self.conv1 = torch.nn.Conv2d(input_size, ouput_size, kernel_size=kernel_size, stride=stride, padding=padding)
        self.output_layer = torch.nn.Linear(ouput_size, 1)
        # self.batchnorm = torch.nn.BatchNorm1d(layer_size)
        self.activation = torch.nn.LeakyReLU()
        # self.linear = torch.nn.Linear(layer_size, layer_size)

    def forward(self, x):
        x = self.conv1(x)
        print(x.shape)
        x = self.activation(x)
        x = self.output_layer(x)
        return x

In [16]:
# Training sample
def evaluate(params):
    dropout, hidden_size, learning_rate, batch_size, n_hidden = params

    max_epochs = 1000
    max_patience = 10
    
    predictions = []
    labels = []
    
    dataloaders = k_fold_split_data(dataset, batch_size, k=10)
    for i, dataloader in tqdm(enumerate(dataloaders)):
        train_dataloader, validation_dataloader = dataloader[0], dataloader[1]
        test_dataloader = dataloader[1]
        PATH = f"model_{i}.pt"
        last_loss = 1000000
        torch.manual_seed(seed)
        input_size = train_dataloader.dataset[0][0].shape[1]
        model = TuneableModel(batch_size, input_size, hidden_size)
        model.to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.99, 0.99), weight_decay=1e-2)

        for epoch in range(max_epochs):
            # training
            train_loss, train_accuracy, train_f1, train_confusion = train_test(model, train_dataloader, optimizer, training="train")
            train_loss, train_accuracy, train_f1 = train_loss, round(train_accuracy, 4), round(train_f1, 2)
            # validation at end of epoch
            validation_loss, validation_accuracy, validation_f1, validation_confusion = train_test(model, validation_dataloader, optimizer, training="validation")
            validation_loss, validation_accuracy, validation_f1 = validation_loss, round(validation_accuracy, 4), round(validation_f1, 2)
            if validation_loss < last_loss:
                last_loss = validation_loss
                current_patience = 0
            else:
                if current_patience == 0:
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': last_loss,
                        }, PATH)
                current_patience += 1
            if current_patience == max_patience:
                break   

        # Testing once patience is reached
        torch.manual_seed(seed)
        model = TuneableModel(input_size, hidden_size)
        model.to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.99, 0.99), weight_decay=1e-2)
        checkpoint = torch.load(PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        prediction_list, label_list = train_test(model, test_dataloader, optimizer, training="test")
        predictions.extend(prediction_list)
        labels.extend(label_list)
        
    return accuracy_score(labels, predictions), f1_score(labels, predictions), confusion_matrix(labels, predictions)


In [26]:
params = (32, 1, 0.01, 32, 1) 
accuracy, f1, confusion = evaluate(params)
print(f"acc: {round(accuracy*100,2)}%\n f1: {round(f1,3)}")
print(confusion)

0it [00:00, ?it/s]

torch.Size([22, 18, 22])





ValueError: Target size (torch.Size([32, 1])) must be the same as input size (torch.Size([22, 18, 1]))

In [None]:
params_nn ={
    'dropout': [x/10 for x in list(range(0, 10, 3))],
    'hidden_size': list(range(0, 101, 25))[1:],
    'learning_rate': [0.01, 0.001, 0.0001, 1e-05],
    'batch_size': [2*2**x for x in range(2, 6)],
    'n_hidden': list(range(1, 5, 1))
}
parameter_expansion = list(product(*params_nn.values()))
print(len(parameter_expansion))

1024


In [None]:
results = {}
for i, p in tqdm(enumerate(parameter_expansion)):
    dropout, hidden_size, learning_rate, batch_size, n_hidden = p
    accuracy, f1, confusion = evaluate(p)
    model_performance = {"dropout": dropout, "hidden_size": hidden_size, "learning_rate": learning_rate, 
              "batch_size": batch_size, "n_hidden": n_hidden, "accuracy": accuracy, "f1": f1}
    results[i] = model_performance
    print("Confusion matrix:\n", confusion)
    print(model_performance)

0it [00:00, ?it/s]



[[26  0]
 [ 2  6]]


1it [00:18, 18.93s/it]
0it [00:18, ?it/s]


KeyboardInterrupt: 

In [None]:
results_dataframe = pd.DataFrame.from_dict(results)