In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../..')))
from seq2seq import *
import pandas as pd
import numpy as np
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import Counter
from transformer_hyp import hyperparametersselection

In [2]:
def set_seeds(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU generator plus the current and all CUDA devices), then
    configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the notebook's import cell does not bring `random` in.
    import random

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # The CUDA seeding calls are safe without a GPU (PyTorch ignores them).
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and disable its benchmark-based
    # algorithm selection, which can pick different kernels between runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
def convert_string_list(element):
    """Split a ``'; '``-separated string of codes into a list of codes.

    No brackets or quotes are stripped: the string is split on the exact
    separator ``'; '`` and the pieces are returned unchanged.

    Args:
        element: String such as ``'A01AA; B02BB'``.

    Returns:
        A list with one string per code, in input order. A string that does
        not contain the separator yields a one-element list.

    Raises:
        TypeError: If ``element`` is not a string (for example the NaN float
            pandas produces for an empty CSV cell).
    """
    if not isinstance(element, str):
        raise TypeError(
            f"expected a '; '-separated string, got {type(element).__name__}"
        )
    return element.split('; ')

In [4]:
def multiplicate_rows(df):
    """Expand each compound into one row per 5-character ATC code.

    The 'ATC Codes' cell of every row is split with ``convert_string_list``.
    For each resulting code that is exactly 5 characters long, a copy of the
    row is emitted with 'ATC Codes' replaced by that single code. Codes of
    any other length are dropped, so a compound with no 5-character code
    disappears from the result.

    Args:
        df: DataFrame with an 'ATC Codes' column holding '; '-separated codes.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and one code per row. If no
        row survives the filter, an empty DataFrame that still carries the
        columns of ``df`` is returned.
    """
    expanded_rows = []

    for _, row in df.iterrows():
        # A cell with a single code splits into a one-element list, so the
        # same loop handles single- and multi-code compounds alike.
        for code in convert_string_list(row['ATC Codes']):
            if len(code) != 5:
                continue
            single_code_row = row.copy()
            single_code_row['ATC Codes'] = code
            expanded_rows.append(single_code_row)

    if not expanded_rows:
        # pd.DataFrame([]) would have no columns at all, which makes later
        # column lookups on the result fail with KeyError.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.DataFrame(expanded_rows).reset_index(drop=True)

# Create vocabularies
# Tokenize the data
def source(df):
    """Tokenize the 'Neutralized SMILES' column character by character.

    Args:
        df: DataFrame with a 'Neutralized SMILES' column of strings.

    Returns:
        A list with one entry per row; each entry is the list of single
        characters of that row's SMILES string.
    """
    return [list(smiles) for smiles in df['Neutralized SMILES']]
def target(df):
    """Tokenize every code of the 'ATC Codes' column character by character.

    Each cell is split with ``convert_string_list``; every resulting code
    contributes one entry to the output, so a cell holding several codes
    yields several entries.

    Args:
        df: DataFrame with an 'ATC Codes' column of strings.

    Returns:
        A flat list of lists, each inner list holding the single characters
        of one code, in row order and then code order.
    """
    return [
        list(atc_code)
        for cell in df['ATC Codes']
        for atc_code in convert_string_list(cell)
    ]

In [5]:
# One pre-computed train/val/test split exists per seed; a model is trained and
# evaluated for each of them and the per-seed metrics are aggregated at the end.
seeds = [42, 123, 47899, 2025, 1, 20, 99, 1020, 345, 78]
columns = [
    'Seed',
    'Precision', 'Recall', 'F1',
    'Precision_level3', 'Recall_level3', 'F1_level3',
    'Precision_level2', 'Recall_level2', 'F1_level2',
    'Precision level 1', 'Precision level 2', 'Precision level 3', 'Precision level 4',
    'Recall level 1', 'Recall level 2', 'Recall level 3', 'Recall level 4',
    '#Compounds that have at least one match'
]

# Single source of truth for placement: tensors, model and checkpoint loading
# all use this device, so the cell also runs on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _strip_special_tokens(prediction):
    """Remove the '<START>' and '<END>' markers from one decoded prediction."""
    return prediction.replace('<START>', '').replace('<END>', '')


def _top_full_codes(decoded_beams, code_length=5, limit=3):
    """Per compound, keep the first `limit` predictions of exactly `code_length` characters."""
    cleaned = []
    for beams in decoded_beams:
        kept = []
        for beam in beams:
            code = _strip_special_tokens(beam)
            if len(code) == code_length:
                kept.append(code)
            if len(kept) == limit:
                break
        cleaned.append(kept)
    return cleaned


def _top_prefixes(decoded_beams, prefix_length, limit=3):
    """Per compound, keep the first `limit` distinct prefixes of `prefix_length` characters."""
    cleaned = []
    for beams in decoded_beams:
        kept = []
        for beam in beams:
            prefix = _strip_special_tokens(beam)[:prefix_length]
            # Predictions shorter than the prefix length are skipped.
            if len(prefix) == prefix_length and prefix not in kept:
                kept.append(prefix)
        cleaned.append(kept[:limit])
    return cleaned


def _average(values):
    """Arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


# Rows are collected here and turned into a DataFrame once, after the loop.
# Concatenating onto an initially empty DataFrame inside the loop triggers a
# pandas FutureWarning about empty entries and result dtypes.
metric_rows = []

for seed in seeds:
    set_seeds(seed)

    test_path = f'../Datasets/Rep_test_set{seed}.csv'
    train_set = pd.read_csv(f'../Datasets/Rep_train_set{seed}.csv')
    test_set = pd.read_csv(test_path)
    val_set = pd.read_csv(f'../Datasets/Rep_val_set{seed}.csv')

    # One row per (compound, 5-character code) pair
    new_train_set = multiplicate_rows(train_set)
    new_val_set = multiplicate_rows(val_set)
    new_test_set = multiplicate_rows(test_set)

    source_train = source(new_train_set)
    source_test = source(new_test_set)
    # Test set without duplicated compounds
    source_test2 = source(test_set)
    source_val = source(new_val_set)
    # Val set without duplicated compounds
    source_val2 = source(val_set)

    target_train = target(new_train_set)
    target_test = target(new_test_set)
    target_val = target(new_val_set)

    # An Index object represents a mapping from the vocabulary to integers (indices) to feed into the models.
    # Both vocabularies are built from the training split only.
    source_index = index.Index(source_train)
    target_index = index.Index(target_train)

    # Create tensors directly on the selected device
    X_train = source_index.text2tensor(source_train).to(device)
    y_train = target_index.text2tensor(target_train).to(device)
    X_val = source_index.text2tensor(source_val).to(device)
    X_val2 = source_index.text2tensor(source_val2).to(device)
    y_val = target_index.text2tensor(target_val).to(device)
    X_test = source_index.text2tensor(source_test).to(device)
    X_test2 = source_index.text2tensor(source_test2).to(device)
    y_test = target_index.text2tensor(target_test).to(device)

    # Reuse a cached hyperparameter search for this seed when its result file exists
    results_path = f"sortedtransformer_results{seed}.csv"
    if os.path.exists(results_path):
        best_hyperparameters = pd.read_csv(results_path).loc[0]
    else:
        best_hyperparameters = hyperparametersselection(seed, source_index, target_index, X_train, X_val, X_val2, y_train, y_val)

    # Selecting one row of a CSV-backed frame can upcast integer columns to
    # float, so size-like hyperparameters are cast back explicitly.
    # NOTE(review): assumes these entries are numeric scalars - confirm
    # against the return value of hyperparametersselection.
    model = models.Transformer(
        source_index,
        target_index,
        max_sequence_length = 800,
        embedding_dimension = int(best_hyperparameters['embedding_dim']),
        feedforward_dimension = int(best_hyperparameters['feedforward_dim']),
        encoder_layers = int(best_hyperparameters['enc_layers']),
        decoder_layers = int(best_hyperparameters['dec_layers']),
        attention_heads = int(best_hyperparameters['attention_heads']),
        activation = "relu",
        dropout = float(best_hyperparameters['dropout']))
    model.to(device)
    q = model.fit(X_train,
            y_train,
            X_val,
            y_val,
            batch_size = 32,
            epochs = 150,
            learning_rate = float(best_hyperparameters['learning_rate']),
            weight_decay = float(best_hyperparameters['weight_decay']),
            progress_bar = 0,
            save_path = None)
    # NOTE(review): fit() is called with save_path=None, yet the weights are
    # read from "best_model.pth". Confirm that this project's fit() writes that
    # file during the call above; otherwise a checkpoint left by an earlier
    # seed or by the hyperparameter search is what gets evaluated here.
    model.load_state_dict(torch.load("best_model.pth", weights_only=True, map_location=device))
    loss, error_rate = model.evaluate(X_test, y_test, batch_size = 32)

    predictions, log_probabilities = search_algorithms.beam_search(
        model,
        X_test2, # Make predictions with test set
        predictions = 6, # max length of the predicted sequence
        beam_width = 10,
        batch_size = 32,
        progress_bar = 0
    )
    output_beam = [target_index.tensor2text(p) for p in predictions]

    # Up to three candidates per compound at each granularity: full
    # 5-character codes, distinct 4-character prefixes and distinct
    # 3-character prefixes.
    predictions_clean = _top_full_codes(output_beam)
    predictions_clean_level3 = _top_prefixes(output_beam, 4)
    predictions_clean_level2 = _top_prefixes(output_beam, 3)

    precision_1, precision_2, precision_3, precision_4 = defined_metrics.precision(predictions_clean, test_path, 'ATC Codes')
    recall_1, recall_2, recall_3, recall_4, counter_compound_match = defined_metrics.recall(predictions_clean, test_path, 'ATC Codes')
    precisions, recalls, f1s = defined_metrics.complete_metrics(predictions_clean, test_path, 'ATC Codes', 3)
    precisions_level3, recalls_level3, f1s_level3 = defined_metrics.complete_metrics_level3(predictions_clean_level3, test_path, 'ATC Codes', 3)
    precisions_level2, recalls_level2, f1s_level2 = defined_metrics.complete_metrics_level2(predictions_clean_level2, test_path, 'ATC Codes', 3)

    precisions_average = _average(precisions)
    recalls_average = _average(recalls)
    f1s_average = _average(f1s)

    precisions_average_level3 = _average(precisions_level3)
    recalls_average_level3 = _average(recalls_level3)
    f1s_average_level3 = _average(f1s_level3)

    precisions_average_level2 = _average(precisions_level2)
    recalls_average_level2 = _average(recalls_level2)
    f1s_average_level2 = _average(f1s_level2)

    metrics = {
        'Precision': precisions_average,
        'Recall': recalls_average,
        'F1': f1s_average,
        'Precision_level3': precisions_average_level3,
        'Recall_level3': recalls_average_level3,
        'F1_level3': f1s_average_level3,
        'Precision_level2': precisions_average_level2,
        'Recall_level2': recalls_average_level2,
        'F1_level2': f1s_average_level2,
        'Precision level 1': precision_1,
        'Precision level 2': precision_2,
        'Precision level 3': precision_3,
        'Precision level 4': precision_4,
        'Recall level 1': recall_1,
        'Recall level 2': recall_2,
        'Recall level 3': recall_3,
        'Recall level 4': recall_4,
        '#Compounds that have at least one match': counter_compound_match
    }

    # Build the row
    row = {
        'Seed': seed,
        **metrics
    }
    metric_rows.append(row)

    torch.cuda.empty_cache()

metrics_df = pd.DataFrame(metric_rows, columns=columns)
metrics_df.to_csv("transformer_metrics.csv", index=False)

# The seed is an identifier, not a metric, so it is left out of the summary.
metrics_summary = metrics_df.drop(columns=['Seed'])
print("Mean:", metrics_summary.mean(numeric_only=True))
print("Std:", metrics_summary.std(numeric_only=True))



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 46 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 128
Feedforward dimension: 256
Encoder layers: 4
Decoder layers: 4
Attention heads: 2
Activation: relu
Dropout: 0.2
Trainable parameters: 1,442,594

Training started
X_train.shape: torch.Size([3024, 702])
Y_train.shape: torch.Size([3024, 7])
X_dev.shape: torch.Size([538, 295])
Y_dev.shape: torch.Size([538, 7])
Epochs: 150
Learning rate: 0.0001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.3765 |     60.163 |   1.6398 |     46.685 |     0.2
    2 |   1.6195 |     46.131 |   1.4512 |     46.623 |     0.3
    3 |   1.4788 |     45.982 |   1.3806 |     45.477 |     0.5
    4 |   1.4168 |     45.260 |   1.3593 |     46.035 |     0.6
    5 |   1.3745 |     44.681 |  

  metrics_df = pd.concat([metrics_df, pd.DataFrame([row])], ignore_index=True)


Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 128
Feedforward dimension: 128
Encoder layers: 2
Decoder layers: 3
Attention heads: 2
Activation: relu
Dropout: 0.0
Trainable parameters: 814,242

Training started
X_train.shape: torch.Size([3028, 649])
Y_train.shape: torch.Size([3028, 7])
X_dev.shape: torch.Size([534, 702])
Y_dev.shape: torch.Size([534, 7])
Epochs: 150
Learning rate: 0.0001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.0334 |     52.642 |   1.4939 |     44.881 |     0.1
    2 |   1.4188 |     44.479 |   1.3570 |     42.884 |     0.2
    3 |   1.3251 |     43.142 |   1.2977 |     42.322 |     0.2
    4 |   1.2705 |     41.810 |   1.2539 |     40.543 |     0.3
    5 |   1.2255 |     40.858 |   1



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 44 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 256
Encoder layers: 2
Decoder layers: 4
Attention heads: 2
Activation: relu
Dropout: 0.1
Trainable parameters: 425,634

Training started
X_train.shape: torch.Size([3021, 702])
Y_train.shape: torch.Size([3021, 7])
X_dev.shape: torch.Size([541, 250])
Y_dev.shape: torch.Size([541, 7])
Epochs: 150
Learning rate: 0.0001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.5259 |     62.049 |   1.8240 |     47.320 |     0.1
    2 |   1.7517 |     46.762 |   1.5627 |     45.163 |     0.2
    3 |   1.5718 |     45.917 |   1.4774 |     45.194 |     0.3
    4 |   1.4906 |     45.675 |   1.4224 |     44.331 |     0.4
    5 |   1.4364 |     44.842 |   1.



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 43 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 128
Encoder layers: 2
Decoder layers: 3
Attention heads: 2
Activation: relu
Dropout: 0.0
Trainable parameters: 276,258

Training started
X_train.shape: torch.Size([3042, 702])
Y_train.shape: torch.Size([3042, 7])
X_dev.shape: torch.Size([520, 467])
Y_dev.shape: torch.Size([520, 7])
Epochs: 150
Learning rate: 0.001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.6402 |     48.181 |   1.3738 |     45.000 |     0.1
    2 |   1.2752 |     43.464 |   1.2082 |     41.923 |     0.1
    3 |   1.1793 |     40.812 |   1.1491 |     40.224 |     0.2
    4 |   1.1425 |     39.875 |   1.1168 |     39.423 |     0.3
    5 |   1.0946 |     37.957 |   1.0



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 46 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 256
Encoder layers: 4
Decoder layers: 3
Attention heads: 4
Activation: relu
Dropout: 0.2
Trainable parameters: 458,978

Training started
X_train.shape: torch.Size([3027, 702])
Y_train.shape: torch.Size([3027, 7])
X_dev.shape: torch.Size([535, 258])
Y_dev.shape: torch.Size([535, 7])
Epochs: 150
Learning rate: 0.0001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.7107 |     67.994 |   2.0475 |     48.941 |     0.2
    2 |   1.9222 |     47.379 |   1.6386 |     45.763 |     0.3
    3 |   1.6492 |     46.074 |   1.5268 |     45.763 |     0.5
    4 |   1.5526 |     46.008 |   1.4677 |     45.670 |     0.7
    5 |   1.4882 |     45.937 |   1.



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 128
Feedforward dimension: 128
Encoder layers: 4
Decoder layers: 2
Attention heads: 2
Activation: relu
Dropout: 0.2
Trainable parameters: 847,522

Training started
X_train.shape: torch.Size([3025, 702])
Y_train.shape: torch.Size([3025, 7])
X_dev.shape: torch.Size([537, 467])
Y_dev.shape: torch.Size([537, 7])
Epochs: 150
Learning rate: 0.0001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.4433 |     60.804 |   1.6248 |     46.555 |     0.1
    2 |   1.6361 |     46.551 |   1.4181 |     46.307 |     0.3
    3 |   1.4745 |     45.895 |   1.3612 |     45.593 |     0.5
    4 |   1.4046 |     44.860 |   1.3345 |     45.717 |     0.6
    5 |   1.3651 |     44.320 |   



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 128
Encoder layers: 2
Decoder layers: 2
Attention heads: 4
Activation: relu
Dropout: 0.1
Trainable parameters: 226,146

Training started
X_train.shape: torch.Size([3038, 702])
Y_train.shape: torch.Size([3038, 7])
X_dev.shape: torch.Size([524, 221])
Y_dev.shape: torch.Size([524, 7])
Epochs: 150
Learning rate: 0.0001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.7829 |     67.506 |   2.1118 |     49.173 |     0.1
    2 |   1.9416 |     47.548 |   1.6609 |     45.420 |     0.2
    3 |   1.6554 |     46.023 |   1.5258 |     45.324 |     0.3
    4 |   1.5446 |     45.726 |   1.4566 |     45.134 |     0.3
    5 |   1.4791 |     45.397 |   1.



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 46 items>
Target index: <Seq2Seq Index with 33 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 128
Encoder layers: 4
Decoder layers: 3
Attention heads: 2
Activation: relu
Dropout: 0.1
Trainable parameters: 343,265

Training started
X_train.shape: torch.Size([3025, 702])
Y_train.shape: torch.Size([3025, 7])
X_dev.shape: torch.Size([537, 337])
Y_dev.shape: torch.Size([537, 7])
Epochs: 150
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.8138 |     50.766 |   1.3650 |     46.772 |     0.1
    2 |   1.3592 |     45.758 |   1.3010 |     45.531 |     0.2
    3 |   1.3019 |     44.893 |   1.2459 |     45.903 |     0.3
    4 |   1.2574 |     43.983 |   1.2275 |     44.351 |     0.4
    5 |   1.2313 |     43.642 |   1.



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 33 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 256
Encoder layers: 2
Decoder layers: 3
Attention heads: 2
Activation: relu
Dropout: 0.1
Trainable parameters: 358,817

Training started
X_train.shape: torch.Size([3033, 702])
Y_train.shape: torch.Size([3033, 7])
X_dev.shape: torch.Size([529, 267])
Y_dev.shape: torch.Size([529, 7])
Epochs: 150
Learning rate: 0.001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.7195 |     49.407 |   1.3499 |     43.415 |     0.1
    2 |   1.3197 |     44.258 |   1.2441 |     41.178 |     0.1
    3 |   1.2383 |     42.598 |   1.2120 |     39.698 |     0.2
    4 |   1.1915 |     41.499 |   1.1839 |     40.044 |     0.3
    5 |   1.1556 |     39.927 |   1.1



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 46 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 2
Attention heads: 2
Activation: relu
Dropout: 0.0
Trainable parameters: 193,186

Training started
X_train.shape: torch.Size([3017, 702])
Y_train.shape: torch.Size([3017, 7])
X_dev.shape: torch.Size([545, 323])
Y_dev.shape: torch.Size([545, 7])
Epochs: 150
Learning rate: 0.0001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.6873 |     62.269 |   2.0957 |     46.239 |     0.1
    2 |   1.8402 |     46.210 |   1.6796 |     45.963 |     0.1
    3 |   1.5992 |     45.884 |   1.5375 |     44.801 |     0.2
    4 |   1.4967 |     45.139 |   1.4608 |     44.709 |     0.2
    5 |   1.4336 |     44.316 |   1.4