In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
from seq2seq import *
import pandas as pd
import numpy as np
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import Counter

In [2]:
# Convert a '; '-separated string of ATC codes into a real list
def convert_string_list(element):
    """Split a string of ATC codes into a list with one entry per code.

    Args:
        element: String holding one or more codes separated by '; '
            (a semicolon followed by a space), e.g. 'A01AA; B02BB'.

    Returns:
        list: The codes in their original order. A string without the
        separator yields a one-element list; an empty string yields [''].
    """
    # The earlier version sliced the string and every code with
    # [0:len(...)], which returns the value unchanged: no brackets or quotes
    # were ever removed. Splitting on the separator is the whole behaviour.
    return element.split('; ')

In [3]:
# Load the compound table; the next cell reads its 'Neutralized SMILES' and
# 'ATC Codes' columns.
df = pd.read_csv('../../Data/splittedATC.csv')

In [4]:
# SMILES column and ATC-code column pulled out of the table as X and y.
X, y = df['Neutralized SMILES'], df['ATC Codes']

In [5]:
def set_seeds(seed):
    """Seed NumPy and PyTorch and set the cuDNN determinism flags.

    Args:
        seed: Seed value passed to every seeding function below.
    """
    # cuDNN flags: deterministic mode on, benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Same seed for NumPy, the PyTorch default generator and the CUDA
    # generators (current device, then all devices).
    seeders = (
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def create_partitions(df, seed):
    """Split the compounds into train, validation and test sets.

    Compounds with several ATC codes and compounds with a single ATC code
    are split separately and then merged, so every partition receives its
    share of both groups.

    Args:
        df: DataFrame with an 'ATC Codes' column of '; '-separated codes.
            It is left unmodified.
        seed: Random state used for every split and every shuffle.

    Returns:
        tuple: (train_set, val_set, test_set) DataFrames, each shuffled and
        carrying an extra boolean 'multiple_ATC' column.
    """
    # Flag compounds that have more than one ATC code. assign() builds a new
    # frame, so the caller's DataFrame does not silently gain a column.
    flagged = df.assign(
        multiple_ATC=df['ATC Codes'].apply(lambda x: len(convert_string_list(x)) > 1)
    )

    # Divide the dataset depending on the multiple_ATC column
    group_more_than_one = flagged[flagged['multiple_ATC']].reset_index(drop=True)
    group_one = flagged[~flagged['multiple_ATC']].reset_index(drop=True)

    # 20% of each group is held out for test; 15% of the remaining 80% is
    # then held out for validation.
    train_more, test_more = train_test_split(group_more_than_one, test_size=0.2, random_state=seed)
    train_one, test_one = train_test_split(group_one, test_size=0.2, random_state=seed)
    train_more, val_more = train_test_split(train_more, test_size=0.15, random_state=seed)
    train_one, val_one = train_test_split(train_one, test_size=0.15, random_state=seed)

    # Merge the two groups again and shuffle so they are interleaved
    train_set = shuffle(pd.concat([train_more, train_one]), random_state=seed)
    test_set = shuffle(pd.concat([test_more, test_one]), random_state=seed)
    val_set = shuffle(pd.concat([val_more, val_one]), random_state=seed)
    return train_set, val_set, test_set

def multiplicate_rows(df):
    """Expand the compounds so that every row carries exactly one ATC code.

    A compound with several codes is repeated once per code, with all other
    columns copied. Codes whose length is not exactly 5 characters are
    dropped, so a compound without any 5-character code disappears.

    Args:
        df: DataFrame with an 'ATC Codes' column of '; '-separated codes.

    Returns:
        DataFrame with the columns of ``df``, one row per kept
        (compound, code) pair and a fresh 0..n-1 index.
    """
    new_rows = []

    for _, row in df.iterrows():
        # Single-code and multi-code compounds go through the same path: a
        # single code simply yields a one-element list.
        for code in convert_string_list(row['ATC Codes']):
            if len(code) != 5:
                continue
            new_row = row.copy()
            new_row['ATC Codes'] = code
            new_rows.append(new_row)

    # Passing the columns explicitly keeps the schema when no row qualifies;
    # pd.DataFrame([]) alone would return a frame without any column.
    new_set = pd.DataFrame(new_rows, columns=df.columns)
    return new_set.reset_index(drop=True)
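# Tokenization helpers: split SMILES strings and ATC codes into characters.
# (The vocabularies themselves are built later from these tokens with index.Index.)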
def source(df):
    """Tokenize every SMILES string into its individual characters.

    Args:
        df: Table with a 'Neutralized SMILES' column of strings.

    Returns:
        list: One list of single-character tokens per compound, in row order.
    """
    return [list(smiles) for smiles in df['Neutralized SMILES']]
def target(df):
    """Tokenize every ATC code into its individual characters.

    Args:
        df: Table with an 'ATC Codes' column of '; '-separated codes.

    Returns:
        list: One list of single-character tokens per code. A row holding
        several codes contributes several consecutive entries.
    """
    return [
        list(atc_code)
        for entry in df['ATC Codes']
        for atc_code in convert_string_list(entry)
    ]

In [6]:
set_seeds(78)

# Use the GPU when one is available, otherwise stay on the CPU. The tensors
# used to be moved conditionally while the model was always sent to "cuda",
# which fails on a machine without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Partitions read back from disk (presumably written by an earlier run of
# create_partitions -- confirm).
train_set = pd.read_csv('train_set.csv')
test_set = pd.read_csv('test_set.csv')
val_set = pd.read_csv('val_set.csv')

# One row per (compound, ATC code) pair
new_train_set = multiplicate_rows(train_set)
new_val_set = multiplicate_rows(val_set)
new_test_set = multiplicate_rows(test_set)

source_train = source(new_train_set)
source_test = source(new_test_set)
# Test set without duplicated compounds
source_test2 = source(test_set)
source_val = source(new_val_set)
# Val set without duplicated compounds
source_val2 = source(val_set)

target_train = target(new_train_set)
target_test = target(new_test_set)
target_val = target(new_val_set)

# An Index object represents a mapping from the vocabulary to integers (indices) to feed into the models.
# Both indices are built from the training partition only.
source_index = index.Index(source_train)
target_index = index.Index(target_train)

# Create tensors directly on the selected device
X_train = source_index.text2tensor(source_train).to(device)
y_train = target_index.text2tensor(target_train).to(device)
X_val = source_index.text2tensor(source_val).to(device)
X_val2 = source_index.text2tensor(source_val2).to(device)
y_val = target_index.text2tensor(target_val).to(device)
X_test = source_index.text2tensor(source_test).to(device)
X_test2 = source_index.text2tensor(source_test2).to(device)
y_test = target_index.text2tensor(target_test).to(device)

model = models.Transformer(source_index, target_index,
     max_sequence_length = 800,
     embedding_dimension = 64,
     feedforward_dimension = 64,
     encoder_layers = 2,
     decoder_layers = 4,
     attention_heads = 4,
     activation = "relu",
     dropout = 0.0)
model.to(device)
model.fit(X_train, y_train,
        X_dev = X_val,
        Y_dev = y_val,
        batch_size = 32,
        epochs = 500,
        learning_rate = 0.001,
        weight_decay = 0.0001,
        progress_bar = 0,
        save_path = None)
# NOTE(review): save_path is None, yet the weights are read from
# "best_model.pth" -- presumably fit() writes its best checkpoint there by
# default; confirm, otherwise a stale file from an earlier run is loaded.
model.load_state_dict(torch.load("best_model.pth", map_location=device, weights_only=True))
# The context manager closes the file even if pickling fails.
with open('modelTransformer.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
loss, error_rate = model.evaluate(X_test, y_test, batch_size = 32)

predictions, log_probabilities = search_algorithms.beam_search(
    model,
    X_test2, # Make predictions with test set
    predictions = 6, # max length of the predicted sequence
    beam_width = 10,
    batch_size = 32,
    progress_bar = 0
)
output_beam = [target_index.tensor2text(p) for p in predictions]

# Strip the start/end markers and keep only the 5-character predictions.
predictions_clean = []
for preds in output_beam:
    interm = []
    for pred in preds:
        clean_pred = pred.replace('<START>', '').replace('<END>', '')
        if len(clean_pred) == 5:
            interm.append(clean_pred)
    # 10 matches the beam_width requested above.
    if len(interm) < 10:
        print("The model predicted less than 10 ATC codes of level 4 for a compound")
    predictions_clean.append(interm)



Model: Seq2Seq Transformer
Source index: <Seq2Seq Index with 46 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 64
Feedforward dimension: 64
Encoder layers: 2
Decoder layers: 4
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 277,154

Training started
X_train.shape: torch.Size([3057, 702])
Y_train.shape: torch.Size([3057, 7])
X_dev.shape: torch.Size([521, 307])
Y_dev.shape: torch.Size([521, 7])
Epochs: 500
Learning rate: 0.001
Weight decay: 0.0001
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.5919 |     47.018 |   1.3230 |     43.122 |     0.1
    2 |   1.2618 |     42.367 |   1.2248 |     41.779 |     0.2
    3 |   1.1717 |     39.794 |   1.1497 |     40.083 |     0.3
    4 |   1.1132 |     38.044 |   1.1290 |     38.836 |     0.4
    5 |   1.0653 |     36.681 |   1.0

In [7]:
def _mean(values):
    """Return the arithmetic mean of a non-empty sequence."""
    return sum(values) / len(values)


# One complete_metrics() call per value 1..10 of its last argument
# (presumably the number of top predictions taken into account -- confirm
# against defined_metrics). Each call yields (precisions, recalls, f1s).
metrics_per_k = [
    defined_metrics.complete_metrics(predictions_clean, 'test_set.csv', 'ATC Codes', k)
    for k in range(1, 11)
]

# Keep the individual per-k names available for the cells below.
(
    (precisions1, recalls1, f1s1),
    (precisions2, recalls2, f1s2),
    (precisions3, recalls3, f1s3),
    (precisions4, recalls4, f1s4),
    (precisions5, recalls5, f1s5),
    (precisions6, recalls6, f1s6),
    (precisions7, recalls7, f1s7),
    (precisions8, recalls8, f1s8),
    (precisions9, recalls9, f1s9),
    (precisions10, recalls10, f1s10),
) = metrics_per_k

# Average of every metric list, again one name per k.
(
    precisions_average1, precisions_average2, precisions_average3,
    precisions_average4, precisions_average5, precisions_average6,
    precisions_average7, precisions_average8, precisions_average9,
    precisions_average10,
) = [_mean(k_precisions) for k_precisions, _k_recalls, _k_f1s in metrics_per_k]

(
    recalls_average1, recalls_average2, recalls_average3,
    recalls_average4, recalls_average5, recalls_average6,
    recalls_average7, recalls_average8, recalls_average9,
    recalls_average10,
) = [_mean(k_recalls) for _k_precisions, k_recalls, _k_f1s in metrics_per_k]

(
    f1s_average1, f1s_average2, f1s_average3,
    f1s_average4, f1s_average5, f1s_average6,
    f1s_average7, f1s_average8, f1s_average9,
    f1s_average10,
) = [_mean(k_f1s) for _k_precisions, _k_recalls, k_f1s in metrics_per_k]

In [8]:
# Collect the per-k averages into lists ordered by k = 1..10.
precisions_average_k = [
    precisions_average1, precisions_average2, precisions_average3,
    precisions_average4, precisions_average5, precisions_average6,
    precisions_average7, precisions_average8, precisions_average9,
    precisions_average10,
]
recalls_average_k = [
    recalls_average1, recalls_average2, recalls_average3,
    recalls_average4, recalls_average5, recalls_average6,
    recalls_average7, recalls_average8, recalls_average9,
    recalls_average10,
]
f1s_average_k = [
    f1s_average1, f1s_average2, f1s_average3,
    f1s_average4, f1s_average5, f1s_average6,
    f1s_average7, f1s_average8, f1s_average9,
    f1s_average10,
]

In [9]:
# Append this model's averaged metrics (one row per k, in list order) to the
# results file. NOTE: the file is read, extended and written back, so
# re-running this cell appends the same rows again.
df_results = pd.read_csv("df_metrics.csv")
new_rows = pd.DataFrame({
    "model": "Transformer",
    "precision": precisions_average_k,
    "recall": recalls_average_k,
    "f1": f1s_average_k,
})
df_results = pd.concat([df_results, new_rows], ignore_index=True)
df_results.to_csv("df_metrics.csv", index=False)