### Hi :)

In [3]:
# imports
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from tqdm import tqdm
from itertools import product
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 42

data = pd.read_csv('data.csv', delimiter=';')
data.keys()
torch.manual_seed(seed)

<torch._C.Generator at 0x1f2767b2bf0>

In [4]:

dropped = data.loc[data['is_critical'] == 1].copy()
dropped.drop(['composite', 'LF', 'HF', "RECORDING_SESSION_LABEL", "trial", "IA_ID", "item", "list", "IA_LABEL", "wordlength", "is_critical", 
              'is_spill1', 'is_spill2', 'is_spill3', 'filler', 'function_word', 'other_filler'], axis=1, inplace=True)
print(len(dropped))
dropped[["condition", "sentenceCondition"]] = dropped[["condition", "sentenceCondition"]].map(lambda x: x.replace("none", "0"))
dropped[["condition", "sentenceCondition"]] = dropped[["condition", "sentenceCondition"]].map(lambda x: x.replace("control", "0"))
dropped[["condition", "sentenceCondition"]] = dropped[["condition", "sentenceCondition"]].map(lambda x: x.replace("pseudo", "1"))
dropped[["condition", "sentenceCondition"]] = dropped[["condition", "sentenceCondition"]].map(lambda x: x.replace("filler", "0"))

normalized = dropped[['fixation_duration',
       'duration_firstpass', 'duration_firstfixation', 'fix_count',
       'avg_pupil', 'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_COUNT',
       'saccade_length', 'saccade_duration', 'go_past_time']]
normalized = (normalized - normalized.mean()) / normalized.std()
dropped[['fixation_duration',
       'duration_firstpass', 'duration_firstfixation', 'fix_count',
       'avg_pupil', 'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_COUNT',
       'saccade_length', 'saccade_duration', 'go_past_time']] = normalized


labels = dropped["condition"].copy()
labels = labels.astype('int')
#dropped[['LF', 'HF']] = dropped[['LF', 'HF']].astype('int')

features = dropped.copy().drop(["condition", "sentenceCondition"], axis=1)

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        features = self.features.iloc[index].to_numpy()
        label = self.labels.iloc[index]
        return features, label

    def __len__(self):
        return len(self.features)

dataset = CustomDataset(features=features, labels=labels)
input_size = len(features.columns)

343


In [5]:
def k_fold_split_data(dataset, batch_size, k=5):
    n = len(dataset)
    fold_size = n // k
    folds = []
    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size if i < k - 1 else n
        folds.append(torch.utils.data.Subset(dataset, range(start, end)))

    dataloaders = []
    for i in range(k):
        validation_dataset = folds[i]
        train_folds = [folds[j] for j in range(k) if j != i]
        train_dataset = torch.utils.data.ConcatDataset(train_folds)

        y = torch.tensor([label for _, label in train_dataset], dtype=torch.long)

        global class_weights
        class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y.numpy())
        class_weights = torch.tensor(class_weights, dtype=torch.float)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)
        dataloaders.append((train_dataloader, validation_dataloader))

    return dataloaders

In [6]:
def train_test(model, dataloader, optimizer, training="train"):
   
    loss_function = torch.nn.BCEWithLogitsLoss()#weight=class_weights.to(device))

    if training == "train":
        model.train()
    elif training == "validation":
        model.eval()
    elif training == "test":
        model.eval()
    else:
        raise ValueError("training argument must be either 'train', 'validation' or 'test'")
        
    total = 0
    correct = 0
    cumulative_loss = 0
    prediction_list = []
    label_list = []
    sigmoid = torch.nn.Sigmoid()
    for sample in dataloader:
   
        data, targets = sample[0].float().to(device), sample[1].type(torch.LongTensor).to(device)
        output = model(data)
        loss_value = loss_function(output, targets.unsqueeze(1).float())
        cumulative_loss += loss_value.item()

        if training == "train":
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
        
        predictions = [round(x) for x in sigmoid(output).to('cpu').detach().squeeze(1).numpy().tolist()]#.argmax(axis=1)
        target_labels = targets.to('cpu').detach().numpy()
        total += len(predictions)
        correct += accuracy_score(target_labels, predictions, normalize=False)
        prediction_list.extend(predictions)
        label_list.extend(target_labels)
    if training == "test":
        return label_list, prediction_list
    f1 = f1_score(label_list, prediction_list)
    accuracy = accuracy_score(label_list, prediction_list)
    confusion = confusion_matrix(label_list, prediction_list)

    return cumulative_loss, accuracy, f1, confusion

In [7]:
class TuneableModel(torch.nn.Module):
    def __init__(self, layer_size, dropout_rate, n_layers):
        super(TuneableModel, self).__init__()
        self.n_layers = n_layers
        self.input_layer = torch.nn.Linear(input_size, layer_size)
        self.linear2 = torch.nn.Linear(layer_size, layer_size)
        self.linear3 = torch.nn.Linear(layer_size, layer_size)
        self.linear4 = torch.nn.Linear(layer_size, layer_size)
        self.linear5 = torch.nn.Linear(layer_size, layer_size)
        self.linear6 = torch.nn.Linear(layer_size, layer_size)
        self.linear7 = torch.nn.Linear(layer_size, layer_size)
        self.linear8 = torch.nn.Linear(layer_size, layer_size)
        self.linear9 = torch.nn.Linear(layer_size, layer_size)
        self.linear10 = torch.nn.Linear(layer_size, layer_size)
        self.output_layer = torch.nn.Linear(layer_size, 1)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.activation = torch.nn.LeakyReLU()
        self.batchnorm = torch.nn.BatchNorm1d(layer_size)

    def forward(self, x):
        x = self.input_layer(x)
        #x = self.batchnorm(x)
        x = self.activation(x)
        x = self.dropout(x)
        if self.n_layers > 1:
            x = self.linear2(x)
            x = self.activation(x)
            x = self.dropout(x)
            if self.n_layers > 2:
                x = self.linear3(x)
                x = self.activation(x)
                x = self.dropout(x)
                if self.n_layers > 3:
                    x = self.linear4(x)
                    x = self.activation(x)
                    x = self.dropout(x)
                    if self.n_layers > 4:
                        x = self.linear5(x)
                        x = self.activation(x)
                        x = self.dropout(x)
                        if self.n_layers > 5:
                            x = self.linear6(x)
                            x = self.activation(x)
                            x = self.dropout(x)
                            if self.n_layers > 6:
                                x = self.linear7(x)
                                x = self.activation(x)
                                x = self.dropout(x)
                                if self.n_layers > 7:
                                    x = self.linear8(x)
                                    x = self.activation(x)
                                    x = self.dropout(x)
                                    if self.n_layers > 8:
                                        x = self.linear9(x)
                                        x = self.activation(x)
                                        x = self.dropout(x)
                                        if self.n_layers > 9:
                                            x = self.linear10(x)
                                            x = self.activation(x)
                                            x = self.dropout(x)
        x = self.output_layer(x)
        #x = self.activation(x)
        return x

In [18]:
# Training sample
def evaluate(params):
    dropout, hidden_size, learning_rate, batch_size, n_hidden = params

    max_epochs = 1000
    max_patience = 10

    best_epochs = []
    predictions = []
    labels = []
    torch.manual_seed(seed)
    model = TuneableModel(hidden_size, dropout, n_hidden)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.99, 0.99), weight_decay=1e-2)

    dataloaders = k_fold_split_data(dataset, batch_size, k=10)
    for i, dataloader in tqdm(enumerate(dataloaders)):
        last_loss = 1000000
        best_epoch = 0
        PATH = f"model_{i}.pt"
        train_dataloader, validation_dataloader = dataloader[0], dataloader[1]
        test_dataloader = dataloader[1]
        for epoch in range(max_epochs):
            # training
            train_loss, train_accuracy, train_f1, train_confusion = train_test(model, train_dataloader, optimizer, training="train")
            train_loss, train_accuracy, train_f1 = round(train_loss, 2), round(train_accuracy, 4), round(train_f1, 2)
            # validation at end of epoch
            validation_loss, validation_accuracy, validation_f1, validation_confusion = train_test(model, validation_dataloader, optimizer, training="validation")
            validation_loss, validation_accuracy, validation_f1 = round(validation_loss, 2), round(validation_accuracy, 4), round(validation_f1, 2)
            if validation_loss < last_loss:
                last_loss = validation_loss
                best_epoch = epoch
                current_patience = 0
            else:
                if current_patience == 0:
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': last_loss,
                        }, PATH)
                current_patience += 1
            if current_patience == max_patience:
                break   
            # if epoch % 100 == 0 and epoch != 0:
            #     print(f"Epoch {epoch}: Train loss: {train_loss}, Train accuracy: {train_accuracy}, Train f1: {train_f1}")
            #     print(f"Epoch {epoch}: Validation loss: {validation_loss}, Validation accuracy: {validation_accuracy}, Validation f1: {validation_f1}")

        # Testing once patience is reached
        torch.manual_seed(seed)
        model = TuneableModel(hidden_size, dropout, n_hidden)
        model.to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.99, 0.99), weight_decay=1e-4)
        checkpoint = torch.load(PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        prediction_list, label_list = train_test(model, test_dataloader, optimizer, training="test")
        predictions.extend(prediction_list)
        labels.extend(label_list)
        best_epochs.append(best_epoch)
    print("Average training epochs for best model:", round(np.mean(best_epochs), 1))
    print("Best epochs:\n", best_epochs)
    return accuracy_score(labels, predictions), f1_score(labels, predictions), confusion_matrix(labels, predictions)
    # print(f"Average accuracy: {round(np.mean(accuracies), 2)}%")
    # print(f"Average f1: {round(np.mean(f1s), 2)}")


In [17]:
"""
Patience = 6
Average training epochs for best model: 1.8
Best epochs:
    [4, 7, 3, 3, 1, 0, 0, 0, 0, 0]
    
# 10-fold accuracy 97.67% f1 0.953 

Cumulative 10-fold confusion:
    [[254   4]
    [  4  81]]

@ params(
    dropout: 0.0
    layer size: 500
    lr: 0.001
    batch_size: 32
    n_layers: 5)
@ AdamW(
    betas=(0.99, 0.99), 
    weight_decay=1e-2), 
"""

params = (0.0, 500, 0.001, 32, 5) 
accuracy, f1, confusion = evaluate(params)
print(f"acc: {round(accuracy*100,2)}%\n f1: {round(f1,3)}")
print(confusion)

10it [00:10,  1.04s/it]

Average training epochs for best model: 1.8
Best epochs:
 [4, 7, 3, 3, 1, 0, 0, 0, 0, 0]
acc: 97.67%
 f1: 0.953
[[254   4]
 [  4  81]]





In [10]:
# params_nn ={
#     'dropout': [0.5],
#     'hidden_size': list(range(500, 501, 100)),
#     'learning_rate':[0.01, 0.001, 0.0001, 0.00001],
#     'batch_size':[8, 16, 32, 64, 128],
#     'n_hidden': list(range(1, 4, 1))
# }
# parameter_expansion = list(product(*params_nn.values()))
# print(len(parameter_expansion))

In [11]:
# results = {}
# for i, p in tqdm(enumerate(parameter_expansion)):
#     dropout, hidden_size, learning_rate, batch_size, n_hidden = p
#     accuracy, f1 = evaluate(p)
#     model_performance = {"dropout": dropout, "hidden_size": hidden_size, "learning_rate": learning_rate, 
#               "batch_size": batch_size, "n_hidden": n_hidden, "accuracy": accuracy, "f1": f1}
#     results[i] = model_performance
#     print(model_performance)


In [12]:
# results_dataframe = pd.DataFrame.from_dict(results, orient='index')
# # save results to file
# results_dataframe.to_csv("preliminary_results.csv", index=False)