In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
from seq2seq import *
import pandas as pd
import numpy as np
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import Counter

In [2]:
# Convert a string that simulates a list to a real list
def convert_string_list(element):
    # Delete [] of the string
    element = element[0:len(element)]
    # Create a list that contains each code as e.g. 'A'
    ATC_list = list(element.split('; '))
    for index, code in enumerate(ATC_list):
        # Delete '' of the code
        ATC_list[index] = code[0:len(code)]
    return ATC_list

In [3]:
def set_seeds(seed):
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def multiplicate_rows(df):
    # Duplicate each compound the number of ATC codes associated to it, copying its SMILES in new rows
    new_rows = []
    
    for _, row in df.iterrows():
        atc_codes = row['ATC Codes']
        atc_codes_list = convert_string_list(atc_codes)
        
        if len(atc_codes_list) > 1:
            for code in atc_codes_list:
                if len(code) == 5:
                    new_row = row.copy()
                    new_row['ATC Codes'] = code
                    new_rows.append(new_row)
        else:
            if len(atc_codes_list[0]) == 5:
                new_rows.append(row)
    
    new_set = pd.DataFrame(new_rows)
    new_set = new_set.reset_index(drop=True)

    return new_set
# Create vocabularies
# Tokenize the data
def source(df):
    source = []
    for compound in df['Neutralized SMILES']:
        # A list containing each SMILES character separated
        source.append(list(compound))
    return source
def target(df):
    target = []
    for codes in df['ATC Codes']:  
        code = convert_string_list(codes) 
        # A list of lists, each one containing each ATC code character separated 
        for c in code:
            list_c = list(c)
            target.append(list_c)
    return target

In [4]:
seeds = [42, 123, 47899, 2025, 1, 20, 99, 1020, 345, 78] 
columns = [
    'Seed', 
    'Precision', 'Recall', 'F1',
    'Precision_level3', 'Recall_level3', 'F1_level3',
    'Precision_level2', 'Recall_level2', 'F1_level2',
    'Precision level 1', 'Precision level 2', 'Precision level 3', 'Precision level 4',
    'Recall level 1', 'Recall level 2', 'Recall level 3', 'Recall level 4',
    '#Compounds that have at least one match'
]

metrics_df = pd.DataFrame(columns=columns)

for seed in seeds:
    set_seeds(seed)

    train_set = pd.read_csv(f'Datasets/train_set{seed}.csv')
    test_set = pd.read_csv(f'Datasets/test_set{seed}.csv')
    val_set = pd.read_csv(f'Datasets/val_set{seed}.csv')
    
    new_train_set = multiplicate_rows(train_set)
    new_val_set = multiplicate_rows(val_set)
    new_test_set = multiplicate_rows(test_set)
    
    source_train = source(new_train_set)
    source_test = source(new_test_set)
    # Test set without duplicated compounds
    source_test2 = source(test_set)
    source_val = source(new_val_set)
    # Val set without duplicated compounds
    source_val2 = source(val_set)

    target_train = target(new_train_set)
    target_test = target(new_test_set)
    target_val = target(new_val_set)

    # An Index object represents a mapping from the vocabulary to integers (indices) to feed into the models
    source_index = index.Index(source_train)
    target_index = index.Index(target_train)

    # Create tensors
    X_train = source_index.text2tensor(source_train)
    y_train = target_index.text2tensor(target_train)
    X_val = source_index.text2tensor(source_val)
    X_val2 = source_index.text2tensor(source_val2)
    y_val = target_index.text2tensor(target_val)     
    X_test = source_index.text2tensor(source_test)
    X_test2 = source_index.text2tensor(source_test2)
    y_test = target_index.text2tensor(target_test)

    if torch.cuda.is_available():
        X_train = X_train.to("cuda")
        y_train = y_train.to("cuda")
        X_val = X_val.to("cuda")
        X_val2 = X_val2.to("cuda")
        y_val = y_val.to("cuda")
        X_test = X_test.to("cuda")
        X_test2 = X_test2.to("cuda")
        y_test = y_test.to("cuda")

    model = BiLSTM(source_index, target_index,
         encoder_embedding_dimension = 128,
         decoder_embedding_dimension = 128,
         encoder_hidden_units = 128, 
         encoder_layers = 4,
         decoder_hidden_units = 128,
         decoder_layers = 2,
         dropout = 0.2)   
    model.to("cuda")
    model.fit(X_train, y_train,
        X_dev = X_val, 
        Y_dev = y_val, 
        batch_size = 32, 
        epochs = 150, 
        learning_rate = 0.001, 
        weight_decay = 1e-05,
        progress_bar = 0, 
        save_path = None)
    model.load_state_dict(torch.load("best_model.pth", weights_only=True))

    loss, error_rate = model.evaluate(X_test, y_test, batch_size = 32) 

    predictions, log_probabilities = search_algorithms.beam_search(
        model, 
        X_test2, # Make predictions with test set 
        predictions = 6, # max length of the predicted sequence
        beam_width = 10,
        batch_size = 32, 
        progress_bar = 0
    )
    output_beam = [target_index.tensor2text(p) for p in predictions]
    predictions_clean = []
    for preds in output_beam:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            if len(clean_pred) == 5:
                interm.append(clean_pred)
            if len(interm) == 3:
                break
        if len(interm) < 3:
            print("The model predicted less than 3 ATC codes of level 2 for a compound")
        predictions_clean.append(interm)
    predictions_clean_level3 = []
    for preds in output_beam:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            pred_3 = clean_pred[0:4]
            if len(pred_3) == 4 and pred_3 not in interm:
                interm.append(pred_3)
        if len(interm[0:3]) < 3:
            print("The model predicted less than 3 ATC codes of level 2 for a compound")
        predictions_clean_level3.append(interm[0:3])
    predictions_clean_level2 = []
    for preds in output_beam:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            pred_2 = clean_pred[0:3]
            if len(pred_2) == 3 and pred_2 not in interm:
                interm.append(pred_2)
        if len(interm[0:3]) < 3:
            print("The model predicted less than 3 ATC codes of level 2 for a compound")
        predictions_clean_level2.append(interm[0:3])
    precision_1, precision_2, precision_3, precision_4 = defined_metrics.precision(predictions_clean, f'Datasets/test_set{seed}.csv', 'ATC Codes')
    recall_1, recall_2, recall_3, recall_4, counter_compound_match = defined_metrics.recall(predictions_clean, f'Datasets/test_set{seed}.csv', 'ATC Codes')
    precisions, recalls, f1s = defined_metrics.complete_metrics(predictions_clean, f'Datasets/test_set{seed}.csv', 'ATC Codes', 3)
    precisions_level3, recalls_level3, f1s_level3 = defined_metrics.complete_metrics_level3(predictions_clean_level3, f'Datasets/test_set{seed}.csv', 'ATC Codes', 3)
    precisions_level2, recalls_level2, f1s_level2 = defined_metrics.complete_metrics_level2(predictions_clean_level2, f'Datasets/test_set{seed}.csv', 'ATC Codes', 3)
    precisions_average = sum(precisions)/len(precisions)
    recalls_average = sum(recalls)/len(recalls)
    f1s_average = sum(f1s)/len(f1s)

    precisions_average_level3 = sum(precisions_level3)/len(precisions_level3)
    recalls_average_level3 = sum(recalls_level3)/len(recalls_level3)
    f1s_average_level3 = sum(f1s_level3)/len(f1s_level3)

    precisions_average_level2 = sum(precisions_level2)/len(precisions_level2)
    recalls_average_level2 = sum(recalls_level2)/len(recalls_level2)
    f1s_average_level2 = sum(f1s_level2)/len(f1s_level2)
    
        
    metrics = {
        'Precision': precisions_average, 
        'Recall': recalls_average,
        'F1': f1s_average,
        'Precision_level3': precisions_average_level3, 
        'Recall_level3': recalls_average_level3,
        'F1_level3': f1s_average_level3,
        'Precision_level2': precisions_average_level2, 
        'Recall_level2': recalls_average_level2,
        'F1_level2': f1s_average_level2,
        'Precision level 1': precision_1,
        'Precision level 2': precision_2,
        'Precision level 3': precision_3,
        'Precision level 4': precision_4,
        'Recall level 1': recall_1,
        'Recall level 2': recall_2,
        'Recall level 3': recall_3,
        'Recall level 4': recall_4,
        '#Compounds that have at least one match': counter_compound_match
    }
    
    row = {
        'Seed': seed,
        **metrics
    }
    
    metrics_df = pd.concat([metrics_df, pd.DataFrame([row])], ignore_index=True)

metrics_df.to_csv("bi-lstm_metrics.csv", index=False)
print("Mean:", metrics_df.mean(numeric_only=True))
print("Std:", metrics_df.std(numeric_only=True))

Model: Seq2Seq Bi-LSTM
Source index: <Seq2Seq Index with 43 items>
Target index: <Seq2Seq Index with 34 items>
Encoder embedding dimension: 128
Decoder embedding dimension: 128
Encoder hidden units: 128
Encoder layers: 4
Decoder hidden units: 128
Decoder layers: 2
Dropout: 0.2
Trainable parameters: 1,990,562

Training started
X_train.shape: torch.Size([3057, 702])
Y_train.shape: torch.Size([3057, 7])
X_dev.shape: torch.Size([546, 337])
Y_dev.shape: torch.Size([546, 7])
Epochs: 150
Learning rate: 0.001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.0914 |     57.262 |   1.5417 |     47.070 |     0.1
    2 |   1.4366 |     46.080 |   1.4045 |     47.070 |     0.2
    3 |   1.3608 |     45.687 |   1.3490 |     45.849 |     0.3
    4 |   1.3174 |     45.028 |   1.3193 |     45.299 |     0.3
    5 |   1.2872 |     44.619 |  

  metrics_df = pd.concat([metrics_df, pd.DataFrame([row])], ignore_index=True)


Model: Seq2Seq Bi-LSTM
Source index: <Seq2Seq Index with 45 items>
Target index: <Seq2Seq Index with 34 items>
Encoder embedding dimension: 128
Decoder embedding dimension: 128
Encoder hidden units: 128
Encoder layers: 4
Decoder hidden units: 128
Decoder layers: 2
Dropout: 0.2
Trainable parameters: 1,990,818

Training started
X_train.shape: torch.Size([3048, 649])
Y_train.shape: torch.Size([3048, 7])
X_dev.shape: torch.Size([541, 337])
Y_dev.shape: torch.Size([541, 7])
Epochs: 150
Learning rate: 0.001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   2.0342 |     55.698 |   1.5194 |     46.796 |     0.1
    2 |   1.4221 |     45.976 |   1.3828 |     46.519 |     0.2
    3 |   1.3432 |     45.467 |   1.3308 |     45.225 |     0.2
    4 |   1.2998 |     44.488 |   1.2975 |     44.578 |     0.3
    5 |   1.2689 |     44.155 |  