### Hi :)

In [1]:
# imports
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from tqdm import tqdm
from itertools import product
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 42

In [2]:
data = pd.read_csv('data.csv', delimiter=';')


In [3]:
data.keys()

Index(['RECORDING_SESSION_LABEL', 'trial', 'IA_ID', 'item', 'list', 'IA_LABEL',
       'wordlength', 'condition', 'is_critical', 'is_spill1', 'is_spill2',
       'is_spill3', 'filler', 'LF', 'HF', 'function_word', 'other_filler',
       'composite', 'fixation_duration', 'duration_firstpass',
       'duration_firstfixation', 'fix_count', 'avg_pupil',
       'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_COUNT', 'saccade_length',
       'saccade_duration', 'go_past_time', 'sentenceCondition'],
      dtype='object')

In [4]:


filtered = data.loc[data['is_critical'] == 1]

dropped = filtered.copy()
dropped.drop(["RECORDING_SESSION_LABEL", "trial", "IA_ID", "item", "list", "IA_LABEL", "wordlength", "is_critical", 
              'is_spill1', 'is_spill2', 'is_spill3', 'filler', 'LF', 'HF', 'function_word', 'other_filler', "composite"], axis=1, inplace=True)
dropped[["condition", "sentenceCondition"]] = dropped[["condition", "sentenceCondition"]].map(lambda x: x.replace("none", "0"))
dropped[["condition", "sentenceCondition"]] = dropped[["condition", "sentenceCondition"]].map(lambda x: x.replace("control", "0"))
dropped[["condition", "sentenceCondition"]] = dropped[["condition", "sentenceCondition"]].map(lambda x: x.replace("pseudo", "1"))
dropped[["condition", "sentenceCondition"]] = dropped[["condition", "sentenceCondition"]].map(lambda x: x.replace("filler", "0"))

labels = dropped["condition"].copy()
labels = labels.astype('int')
features = dropped.copy().drop(["condition", "sentenceCondition"], axis=1)
features = features.astype('float')
features = (features - features.mean()) / features.std()

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        features = self.features.iloc[index].to_numpy()
        label = self.labels.iloc[index]
        return features, label

    def __len__(self):
        return len(self.features)

dataset = CustomDataset(features=features, labels=labels)
input_size = len(features.columns)
print(input_size)

10


In [5]:
print(features.keys())

Index(['fixation_duration', 'duration_firstpass', 'duration_firstfixation',
       'fix_count', 'avg_pupil', 'IA_REGRESSION_IN_COUNT',
       'IA_REGRESSION_OUT_COUNT', 'saccade_length', 'saccade_duration',
       'go_past_time'],
      dtype='object')


In [6]:
def split_data(dataset, batch_size):
    
    generator = torch.Generator().manual_seed(42)
    train_dataset, validation_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.8, 0.1, 0.1], generator=generator)
    #train_dataset, validation_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.8, 0.1, 0.1])

    global class_weights
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y.numpy())
    class_weights = torch.tensor(class_weights, dtype=torch.float)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    return train_dataloader, validation_dataloader, test_dataloader

In [7]:
def train_test(model, dataloader, optimizer, training="train"):
   
    loss_function = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))

    if training == "train":
        model.train()
    elif training == "validation":
        model.eval()
    elif training == "test":
        model.eval()
    else:
        raise ValueError("training argument must be either 'train', 'validation' or 'test'")
        
    total = 0
    correct = 0
    cumulative_loss = 0
    prediction_list = []
    label_list = []
    for sample in dataloader:
        data, targets = sample[0].float().to(device), sample[1].type(torch.LongTensor).to(device)
        output = model(data)
        loss_value = loss_function(output, targets)
        cumulative_loss += loss_value.item()

        if training == "train":
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
            
        predictions = output.to('cpu').detach().numpy().argmax(axis=1)
        target_labels = targets.to('cpu').detach().numpy()
        total += len(predictions)
        correct += accuracy_score(target_labels, predictions, normalize=False)
        prediction_list.extend(predictions)
        label_list.extend(target_labels)  
    f1 = f1_score(label_list, prediction_list)
    accuracy = accuracy_score(label_list, prediction_list)

    return cumulative_loss, accuracy, f1

In [8]:
class TuneableModel(torch.nn.Module):
    def __init__(self, layer_size, dropout_rate, n_layers):
        super(TuneableModel, self).__init__()
        self.n_layers = n_layers
        self.input_layer = torch.nn.Linear(input_size, layer_size)
        self.linear1 = torch.nn.Linear(layer_size, layer_size)
        self.linear2 = torch.nn.Linear(layer_size, layer_size)
        self.linear3 = torch.nn.Linear(layer_size, layer_size)
        self.linear4 = torch.nn.Linear(layer_size, layer_size)
        self.output_layer = torch.nn.Linear(layer_size, 2)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.activation = torch.nn.Sigmoid()
        self.batchnorm = torch.nn.BatchNorm1d(layer_size)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.batchnorm(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.linear1(x)
        x = self.activation(x)
        x = self.dropout(x)
        if self.n_layers > 1:
            x = self.linear2(x)
            x = self.activation(x)
            x = self.dropout(x)
            if self.n_layers > 2:
                x = self.linear3(x)
                x = self.activation(x)
                x = self.dropout(x)
                if self.n_layers > 3:
                    x = self.linear4(x)
                    x = self.activation(x)
                    x = self.dropout(x)
        x = self.output_layer(x)
        return x

In [9]:
# Training sample
def evaluate(params):
    dropout, hidden_size, learning_rate, batch_size, n_hidden = params

    number_of_folds = 5
    max_epochs = 1000
    max_patience = 20

    accuracies = []
    f1s = []
    
    PATH = "model.pt"

    for i in range(number_of_folds):

        train_dataloader, validation_dataloader, test_dataloader = split_data(dataset, batch_size)
        last_loss = 1000000
        torch.manual_seed(seed)
        model = TuneableModel(hidden_size, dropout, n_hidden)
        model.to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

        for epoch in range(max_epochs):
            # training
            train_loss, train_accuracy, train_f1 = train_test(model, train_dataloader, optimizer, training="train")
            train_loss, train_accuracy, train_f1 = round(train_loss, 2), round(train_accuracy, 4), round(train_f1, 2)
            # validation at end of epoch
            validation_loss, validation_accuracy, validation_f1 = train_test(model, validation_dataloader, optimizer, training="validation")
            validation_loss, validation_accuracy, validation_f1 = round(validation_loss, 2), round(validation_accuracy, 4), round(validation_f1, 2)
            if validation_loss < last_loss:
                last_loss = validation_loss
                current_patience = 0
            else:
                if current_patience == 0:
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': last_loss,
                        }, PATH)
                current_patience += 1
            if current_patience == max_patience:
                #print(f"Early stopping at epoch {epoch}")
                break   
            #if epoch % 100 == 0 and epoch != 0:
                #print(f"Epoch {epoch}\nvalidation loss: {round(validation_loss, 2)}\nvalidation accuracy: {validation_accuracy*100}%\nvalidation f1: {validation_f1}\n")

        # Testing once patience is reached
        torch.manual_seed(seed)
        model = TuneableModel(hidden_size, dropout, n_hidden)
        model.to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
        checkpoint = torch.load(PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        test_loss, test_accuracy, test_f1 = train_test(model, test_dataloader, optimizer, training="test")
        test_loss, test_accuracy, test_f1 = round(test_loss, 2), round(test_accuracy, 4), round(test_f1, 2)
        print(f"Model {i} at epoch {checkpoint['epoch']} test results: accuracy: {test_accuracy*100}% f1: {test_f1}")
        accuracies.append(test_accuracy)
        f1s.append(test_f1)

    return round(np.mean(accuracies), 2), round(np.mean(f1s), 2)
    # print(f"Average accuracy: {round(np.mean(accuracies), 2)}%")
    # print(f"Average f1: {round(np.mean(f1s), 2)}")


In [11]:
best_params = (0.5, 100, 0.0002, 32, 3)
accuracy, f1 = evaluate(best_params)
print("Accuracy and f1 for best parameters: ", accuracy, f1)
print("Best parameters: ", best_params)

  from .autonotebook import tqdm as notebook_tqdm


Model 0 at epoch 137 test results: accuracy: 94.12% f1: 0.89


KeyboardInterrupt: 

In [None]:
# params_nn ={
#     'dropout': [0.5],
#     'hidden_size': list(range(500, 501, 100)),
#     'learning_rate':[0.01, 0.001, 0.0001, 0.00001],
#     'batch_size':[8, 16, 32, 64, 128],
#     'n_hidden': list(range(1, 4, 1))
# }
# parameter_expansion = list(product(*params_nn.values()))
# print(len(parameter_expansion))

60


In [None]:
# results = {}
# for i, p in tqdm(enumerate(parameter_expansion)):
#     dropout, hidden_size, learning_rate, batch_size, n_hidden = p
#     accuracy, f1 = evaluate(p)
#     model_performance = {"dropout": dropout, "hidden_size": hidden_size, "learning_rate": learning_rate, 
#               "batch_size": batch_size, "n_hidden": n_hidden, "accuracy": accuracy, "f1": f1}
#     results[i] = model_performance
#     print(model_performance)


In [None]:
# results_dataframe = pd.DataFrame.from_dict(results, orient='index')
# # save results to file
# results_dataframe.to_csv("preliminary_results.csv", index=False)