In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
from seq2seq import *
import pandas as pd
import numpy as np
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import Counter

In [2]:
# Convert a string that simulates a list to a real list
def convert_string_list(element):
    # Delete [] of the string
    element = element[0:len(element)]
    # Create a list that contains each code as e.g. 'A'
    ATC_list = list(element.split('; '))
    for index, code in enumerate(ATC_list):
        # Delete '' of the code
        ATC_list[index] = code[0:len(code)]
    return ATC_list

In [5]:
def set_seeds(seed):
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def multiplicate_rows(df):
    # Duplicate each compound the number of ATC codes associated to it, copying its SMILES in new rows
    new_rows = []
    
    for _, row in df.iterrows():
        atc_codes = row['ATC Codes']
        atc_codes_list = convert_string_list(atc_codes)
        
        if len(atc_codes_list) > 1:
            for code in atc_codes_list:
                if len(code) == 5:
                    new_row = row.copy()
                    new_row['ATC Codes'] = code
                    new_rows.append(new_row)
        else:
            if len(atc_codes_list[0]) == 5:
                new_rows.append(row)
    
    new_set = pd.DataFrame(new_rows)
    new_set = new_set.reset_index(drop=True)

    return new_set
# Create vocabularies
# Tokenize the data
def source(df):
    source = []
    for compound in df['Neutralized SMILES']:
        # A list containing each SMILES character separated
        source.append(list(compound))
    return source
def target(df):
    target = []
    for codes in df['ATC Codes']:  
        code = convert_string_list(codes) 
        # A list of lists, each one containing each ATC code character separated 
        for c in code:
            list_c = list(c)
            target.append(list_c)
    return target

In [6]:
seeds = [42, 123, 47899, 2025, 1, 20, 99, 1020, 345, 78] 
columns = [
    'Seed', 
    'Precision', 'Recall', 'F1',
    'Precision_level3', 'Recall_level3', 'F1_level3',
    'Precision_level2', 'Recall_level2', 'F1_level2',
    'Precision level 1', 'Precision level 2', 'Precision level 3', 'Precision level 4',
    'Recall level 1', 'Recall level 2', 'Recall level 3', 'Recall level 4',
    '#Compounds that have at least one match'
]

metrics_df = pd.DataFrame(columns=columns)

for seed in seeds:
    set_seeds(seed)

    train_set = pd.read_csv(f'Datasets/train_set{seed}.csv')
    test_set = pd.read_csv(f'Datasets/test_set{seed}.csv')
    val_set = pd.read_csv(f'Datasets/val_set{seed}.csv')
    
    new_train_set = multiplicate_rows(train_set)
    new_val_set = multiplicate_rows(val_set)
    new_test_set = multiplicate_rows(test_set)
    
    source_train = source(new_train_set)
    source_test = source(new_test_set)
    # Test set without duplicated compounds
    source_test2 = source(test_set)
    source_val = source(new_val_set)
    # Val set without duplicated compounds
    source_val2 = source(val_set)

    target_train = target(new_train_set)
    target_test = target(new_test_set)
    target_val = target(new_val_set)

    # An Index object represents a mapping from the vocabulary to integers (indices) to feed into the models
    source_index = index.Index(source_train)
    target_index = index.Index(target_train)

    # Create tensors
    X_train = source_index.text2tensor(source_train)
    y_train = target_index.text2tensor(target_train)
    X_val = source_index.text2tensor(source_val)
    X_val2 = source_index.text2tensor(source_val2)
    y_val = target_index.text2tensor(target_val)     
    X_test = source_index.text2tensor(source_test)
    X_test2 = source_index.text2tensor(source_test2)
    y_test = target_index.text2tensor(target_test)

    if torch.cuda.is_available():
        X_train = X_train.to("cuda")
        y_train = y_train.to("cuda")
        X_val = X_val.to("cuda")
        X_val2 = X_val2.to("cuda")
        y_val = y_val.to("cuda")
        X_test = X_test.to("cuda")
        X_test2 = X_test2.to("cuda")
        y_test = y_test.to("cuda")

    model = models.Transformer(source_index, target_index,
         max_sequence_length = 800,
         embedding_dimension = 64,
         feedforward_dimension = 64,
         encoder_layers = 2,
         decoder_layers = 4,
         attention_heads = 4,
         activation = "relu",
         dropout = 0.0)   
    model.to("cuda")
    model.fit(X_train, y_train,
            X_dev = X_val, 
            Y_dev = y_val, 
            batch_size = 32, 
            epochs = 500, 
            learning_rate = 0.001, 
            weight_decay = 0.0001,
            progress_bar = 0, 
            save_path = None) 
    model.load_state_dict(torch.load("best_model.pth", weights_only=True))

    loss, error_rate = model.evaluate(X_test, y_test, batch_size = 32) 

    predictions, log_probabilities = search_algorithms.beam_search(
        model, 
        X_test2, # Make predictions with test set 
        predictions = 6, # max length of the predicted sequence
        beam_width = 10,
        batch_size = 32, 
        progress_bar = 0
    )
    output_beam = [target_index.tensor2text(p) for p in predictions]
    predictions_clean = []
    for preds in output_beam:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            if len(clean_pred) == 5:
                interm.append(clean_pred)
            if len(interm) == 3:
                break
        if len(interm) < 3:
            print("The model predicted less than 3 ATC codes of level 2 for a compound")
        predictions_clean.append(interm)
    predictions_clean_level3 = []
    for preds in output_beam:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            pred_3 = clean_pred[0:4]
            if len(pred_3) == 4 and pred_3 not in interm:
                interm.append(pred_3)
        if len(interm[0:3]) < 3:
            print("The model predicted less than 3 ATC codes of level 3 for a compound")
        predictions_clean_level3.append(interm[0:3])
    predictions_clean_level2 = []
    for preds in output_beam:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            pred_2 = clean_pred[0:3]
            if len(pred_2) == 3 and pred_2 not in interm:
                interm.append(pred_2)
        if len(interm[0:3]) < 3:
            print("The model predicted less than 3 ATC codes of level 2 for a compound")
        predictions_clean_level2.append(interm[0:3])
    precision_1, precision_2, precision_3, precision_4 = defined_metrics.precision(predictions_clean, f'Datasets/test_set{seed}.csv', 'ATC Codes')
    recall_1, recall_2, recall_3, recall_4, counter_compound_match = defined_metrics.recall(predictions_clean, f'Datasets/test_set{seed}.csv', 'ATC Codes')
    precisions, recalls, f1s = defined_metrics.complete_metrics(predictions_clean, f'Datasets/test_set{seed}.csv', 'ATC Codes', 3)
    precisions_level3, recalls_level3, f1s_level3 = defined_metrics.complete_metrics_level3(predictions_clean_level3, f'Datasets/test_set{seed}.csv', 'ATC Codes', 3)
    precisions_level2, recalls_level2, f1s_level2 = defined_metrics.complete_metrics_level2(predictions_clean_level2, f'Datasets/test_set{seed}.csv', 'ATC Codes', 3)
    precisions_average = sum(precisions)/len(precisions)
    recalls_average = sum(recalls)/len(recalls)
    f1s_average = sum(f1s)/len(f1s)

    precisions_average_level3 = sum(precisions_level3)/len(precisions_level3)
    recalls_average_level3 = sum(recalls_level3)/len(recalls_level3)
    f1s_average_level3 = sum(f1s_level3)/len(f1s_level3)

    precisions_average_level2 = sum(precisions_level2)/len(precisions_level2)
    recalls_average_level2 = sum(recalls_level2)/len(recalls_level2)
    f1s_average_level2 = sum(f1s_level2)/len(f1s_level2)
    
        
    metrics = {
        'Precision': precisions_average, 
        'Recall': recalls_average,
        'F1': f1s_average,
        'Precision_level3': precisions_average_level3, 
        'Recall_level3': recalls_average_level3,
        'F1_level3': f1s_average_level3,
        'Precision_level2': precisions_average_level2, 
        'Recall_level2': recalls_average_level2,
        'F1_level2': f1s_average_level2,
        'Precision level 1': precision_1,
        'Precision level 2': precision_2,
        'Precision level 3': precision_3,
        'Precision level 4': precision_4,
        'Recall level 1': recall_1,
        'Recall level 2': recall_2,
        'Recall level 3': recall_3,
        'Recall level 4': recall_4,
        '#Compounds that have at least one match': counter_compound_match,
    }
    
    # Build the row
    row = {
        'Seed': seed,
        **metrics
    }
    
    metrics_df = pd.concat([metrics_df, pd.DataFrame([row])], ignore_index=True)

metrics_df.to_csv("transformer_metrics.csv", index=False)
print("Mean:", metrics_df.mean(numeric_only=True))
print("Std:", metrics_df.std(numeric_only=True))



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 43 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 276,962

Training started
X_train.shape: torch.Size([3057, 702])
Y_train.shape: torch.Size([3057, 7])
X_dev.shape: torch.Size([546, 337])
Y_dev.shape: torch.Size([546, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.6731 |     48.375 |   1.3960 |     46.398 |     0.1
    2 |   1.2753 |     43.310 |   1.2301 |     41.972 |     0.2
    3 |   1.1858 |     40.699 |   1.1682 |     40.446 |     0.3
    4 |   1.1118 |     38.660 |   1.1379 |     39.683 |     0.4
    5 |   1.0521 |     36.196 |   1.0

  metrics_df = pd.concat([metrics_df, pd.DataFrame([row])], ignore_index=True)


Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 277,090

Training started
X_train.shape: torch.Size([3048, 649])
Y_train.shape: torch.Size([3048, 7])
X_dev.shape: torch.Size([541, 337])
Y_dev.shape: torch.Size([541, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.6909 |     49.021 |   1.3490 |     44.640 |     0.1
    2 |   1.2867 |     43.138 |   1.2458 |     42.360 |     0.2
    3 |   1.1975 |     41.175 |   1.1743 |     41.189 |     0.3
    4 |   1.1258 |     39.157 |   1.1707 |     39.710 |     0.4
    5 |   1.0782 |     37.522 |   1.0



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 44 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 277,026

Training started
X_train.shape: torch.Size([3064, 649])
Y_train.shape: torch.Size([3064, 7])
X_dev.shape: torch.Size([541, 702])
Y_dev.shape: torch.Size([541, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.6557 |     48.896 |   1.3230 |     44.670 |     0.1
    2 |   1.2764 |     43.315 |   1.2361 |     42.606 |     0.2
    3 |   1.1922 |     41.460 |   1.1755 |     39.988 |     0.3
    4 |   1.1281 |     39.056 |   1.1226 |     38.725 |     0.4
    5 |   1.0705 |     37.195 |   1.0



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 43 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 276,962

Training started
X_train.shape: torch.Size([3072, 702])
Y_train.shape: torch.Size([3072, 7])
X_dev.shape: torch.Size([566, 649])
Y_dev.shape: torch.Size([566, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.6535 |     47.716 |   1.3639 |     44.700 |     0.1
    2 |   1.2709 |     43.056 |   1.2462 |     42.874 |     0.2
    3 |   1.1869 |     41.054 |   1.1774 |     40.724 |     0.3
    4 |   1.1187 |     38.129 |   1.1440 |     38.722 |     0.4
    5 |   1.0647 |     36.637 |   1.0



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 277,090

Training started
X_train.shape: torch.Size([3034, 702])
Y_train.shape: torch.Size([3034, 7])
X_dev.shape: torch.Size([542, 350])
Y_dev.shape: torch.Size([542, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.6647 |     48.088 |   1.3416 |     45.972 |     0.1
    2 |   1.2800 |     43.034 |   1.2551 |     44.280 |     0.2
    3 |   1.1869 |     40.870 |   1.1944 |     42.589 |     0.3
    4 |   1.1195 |     39.008 |   1.2081 |     41.667 |     0.4
    5 |   1.0675 |     36.877 |   1.1



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 277,090

Training started
X_train.shape: torch.Size([3059, 350])
Y_train.shape: torch.Size([3059, 7])
X_dev.shape: torch.Size([553, 649])
Y_dev.shape: torch.Size([553, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.7053 |     48.709 |   1.3526 |     44.334 |     0.1
    2 |   1.2723 |     42.786 |   1.2706 |     43.369 |     0.1
    3 |   1.1738 |     40.247 |   1.1996 |     41.561 |     0.2
    4 |   1.1051 |     37.823 |   1.1478 |     39.964 |     0.2
    5 |   1.0487 |     36.139 |   1.1



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 277,090

Training started
X_train.shape: torch.Size([3063, 649])
Y_train.shape: torch.Size([3063, 7])
X_dev.shape: torch.Size([558, 702])
Y_dev.shape: torch.Size([558, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.7457 |     50.272 |   1.3225 |     43.429 |     0.1
    2 |   1.2758 |     43.421 |   1.2171 |     41.338 |     0.2
    3 |   1.1899 |     41.512 |   1.1540 |     40.054 |     0.3
    4 |   1.1275 |     39.275 |   1.1418 |     40.711 |     0.4
    5 |   1.0733 |     37.839 |   1.0



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 44 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 277,026

Training started
X_train.shape: torch.Size([3052, 702])
Y_train.shape: torch.Size([3052, 7])
X_dev.shape: torch.Size([533, 266])
Y_dev.shape: torch.Size([533, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.7505 |     49.645 |   1.3126 |     43.496 |     0.1
    2 |   1.2733 |     43.021 |   1.2244 |     42.652 |     0.2
    3 |   1.1841 |     40.673 |   1.1890 |     41.432 |     0.3
    4 |   1.1223 |     38.658 |   1.1038 |     37.774 |     0.4
    5 |   1.0632 |     36.708 |   1.0



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 43 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 276,962

Training started
X_train.shape: torch.Size([3085, 702])
Y_train.shape: torch.Size([3085, 7])
X_dev.shape: torch.Size([537, 649])
Y_dev.shape: torch.Size([537, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.7061 |     48.709 |   1.3598 |     45.624 |     0.1
    2 |   1.2894 |     44.084 |   1.2499 |     43.824 |     0.2
    3 |   1.2037 |     42.134 |   1.1698 |     40.223 |     0.3
    4 |   1.1463 |     39.800 |   1.1520 |     39.634 |     0.4
    5 |   1.1013 |     38.763 |   1.1



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 46 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 277,154

Training started
X_train.shape: torch.Size([3057, 702])
Y_train.shape: torch.Size([3057, 7])
X_dev.shape: torch.Size([521, 307])
Y_dev.shape: torch.Size([521, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.5921 |     47.023 |   1.3232 |     42.546 |     0.1
    2 |   1.2616 |     42.471 |   1.2241 |     41.107 |     0.2
    3 |   1.1709 |     39.712 |   1.1447 |     40.243 |     0.3
    4 |   1.1135 |     38.109 |   1.1279 |     38.868 |     0.4
    5 |   1.0644 |     36.828 |   1.0