In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
from seq2seq import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.utils import shuffle
from collections import Counter
import pickle
import torch
from sklearn.preprocessing import StandardScaler
from torch import Tensor

In [2]:
def set_seeds(seed):
    """Seed every random number generator this notebook can rely on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), then asks cuDNN for deterministic kernels
    and disables its auto-tuner.

    Args:
        seed (int): Seed applied to every generator.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # The CUDA seeding calls are silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
def convert_string_list(element):
    """Split a '; '-separated string of ATC codes into a list of codes.

    No other clean-up is performed: brackets, quotes or stray whitespace
    present in ``element`` are kept as part of the codes.

    Args:
        element (str): Codes joined by '; ', e.g. 'A01AA; B02BB'.

    Returns:
        list[str]: The individual codes in their original order. A string
        without separator (including '') yields a one-element list.
    """
    return element.split('; ')

In [4]:
def multiplicate_rows(df):
    """Expand ``df`` so that every row carries exactly one 5-character ATC code.

    The 'ATC Codes' cell of each row is split with ``convert_string_list``.
    The row is repeated once per resulting code, with every other column
    copied unchanged, and only rows whose code is exactly 5 characters long
    are kept. Rows with a missing 'ATC Codes' value are dropped.

    Args:
        df (pd.DataFrame): Table with an 'ATC Codes' column of '; '-separated
            codes. It is not modified.

    Returns:
        pd.DataFrame: Expanded table with the same columns as ``df`` (even
        when no row qualifies) and a fresh 0..n-1 index. Rows keep the order
        of ``df``, and codes keep their order within each cell.
    """
    atc_column = 'ATC Codes'
    code_length = 5

    # reset_index returns a new frame, so the caller's data is left untouched
    # and explode() never has to cope with duplicated index labels.
    expanded = df.reset_index(drop=True)
    expanded[atc_column] = expanded[atc_column].map(convert_string_list, na_action='ignore')
    expanded = expanded.explode(atc_column)

    # Missing values survive explode() as NaN; they are not strings and are dropped here.
    keep = expanded[atc_column].map(
        lambda code: isinstance(code, str) and len(code) == code_length
    ).astype(bool)

    return expanded.loc[keep.to_numpy()].reset_index(drop=True)

def extract_descriptors(df, first_column=2, last_column=-5):
    """Build a float32 tensor of imputed molecular descriptors from ``df``.

    Descriptor columns are selected by position as
    ``df.iloc[:, first_column:last_column]``. Infinite values are treated as
    missing, and every missing value is replaced by the median of its column,
    computed over the rows of ``df`` itself. A column without a single finite
    value is filled with 0.0.

    Args:
        df (pd.DataFrame): Table whose selected columns are numeric. It is
            not modified.
        first_column (int): Position of the first descriptor column.
        last_column (int): Position just past the last descriptor column.

    Returns:
        torch.Tensor: float32 tensor of shape
        (len(df), number of selected columns) without NaN or infinite values.

    Raises:
        ValueError: If a selected column holds text that cannot be read as a number.
    """
    # Force a float copy: integer columns cannot store NaN, and the copy
    # guarantees the caller's DataFrame is never written to.
    desc_array = np.array(df.iloc[:, first_column:last_column].values, dtype=np.float64)

    # Treat +/-inf as missing so that it is imputed like NaN.
    desc_array[np.isinf(desc_array)] = np.nan
    missing = np.isnan(desc_array)

    # Column-wise medians; columns with nothing but missing values keep 0.0.
    medians = np.zeros(desc_array.shape[1], dtype=np.float64)
    has_values = ~missing.all(axis=0)
    if has_values.any():
        medians[has_values] = np.nanmedian(desc_array[:, has_values], axis=0)

    # `medians` broadcasts across rows: each gap receives its own column's median.
    desc_array = np.where(missing, medians, desc_array)
    return torch.tensor(desc_array, dtype=torch.float32)
    
# Create vocabularies
# Tokenize the data
def source(df):
    """Tokenize every SMILES string of ``df`` into single characters.

    Args:
        df (pd.DataFrame): Table with a 'Neutralized SMILES' column of strings.

    Returns:
        list[list[str]]: One list of characters per row, in row order.
    """
    return [list(smiles) for smiles in df['Neutralized SMILES']]
def target(df):
    """Tokenize every ATC code of ``df`` into single characters.

    Each 'ATC Codes' cell is split with ``convert_string_list``. Every
    resulting code contributes one entry, so a cell holding several codes
    yields several entries.

    Args:
        df (pd.DataFrame): Table with an 'ATC Codes' column of strings.

    Returns:
        list[list[str]]: One list of characters per code, in row order and
        then in code order within a row.
    """
    return [
        list(code)
        for codes in df['ATC Codes']
        for code in convert_string_list(codes)
    ]

In [5]:
# Seed the random number generators before anything else runs.
set_seeds(78)

# Raw splits as stored on disk.
train_set, test_set, val_set = (
    pd.read_csv(csv_path)
    for csv_path in ('train_set.csv', 'test_set.csv', 'val_set.csv')
)

# Expanded splits: one row per retained ATC code (see multiplicate_rows).
new_train_set, new_val_set, new_test_set = (
    multiplicate_rows(split) for split in (train_set, val_set, test_set)
)

# Descriptors of the expanded splits, plus the raw val/test splits (suffix 2).
(
    train_descriptors,
    test_descriptors,
    test_descriptors2,
    val_descriptors,
    val_descriptors2,
) = (
    extract_descriptors(split)
    for split in (new_train_set, new_test_set, test_set, new_val_set, val_set)
)

# Standardize the descriptors. The scaler statistics come from the training
# split only and are then applied unchanged to every other split.
scaler = StandardScaler().fit(train_descriptors.numpy())
(
    train_descriptors,
    val_descriptors,
    test_descriptors,
    val_descriptors2,
    test_descriptors2,
) = (
    torch.tensor(scaler.transform(unscaled.numpy()), dtype=torch.float32)
    for unscaled in (
        train_descriptors,
        val_descriptors,
        test_descriptors,
        val_descriptors2,
        test_descriptors2,
    )
)

# Character-level SMILES sequences. The *2 variants come from the raw
# val/test splits, i.e. without duplicated compounds.
source_train, source_test, source_test2, source_val, source_val2 = (
    source(split)
    for split in (new_train_set, new_test_set, test_set, new_val_set, val_set)
)

# Character-level ATC code sequences of the expanded splits.
target_train, target_test, target_val = (
    target(split) for split in (new_train_set, new_test_set, new_val_set)
)

# An Index object represents a mapping from the vocabulary to integers (indices) to feed into the models.
# Both vocabularies are built from the training split only.
source_index = index.Index(source_train)
target_index = index.Index(target_train)

# Encode every split as tensors with the training vocabularies.
X_train, y_train = (
    source_index.text2tensor(source_train),
    target_index.text2tensor(target_train),
)
X_val, X_val2, y_val = (
    source_index.text2tensor(source_val),
    source_index.text2tensor(source_val2),
    target_index.text2tensor(target_val),
)
X_test, X_test2, y_test = (
    source_index.text2tensor(source_test),
    source_index.text2tensor(source_test2),
    target_index.text2tensor(target_test),
)

# Single source of truth for the compute device. The model used to be moved to
# "cuda" unconditionally while the tensors were only moved when CUDA was
# available, which crashed on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Moving to "cpu" is a no-op for tensors that already live there.
X_train = X_train.to(device)
y_train = y_train.to(device)
train_descriptors = train_descriptors.to(device)
test_descriptors = test_descriptors.to(device)
test_descriptors2 = test_descriptors2.to(device)
val_descriptors = val_descriptors.to(device)
val_descriptors2 = val_descriptors2.to(device)
X_val = X_val.to(device)
X_val2 = X_val2.to(device)
y_val = y_val.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)
X_test2 = X_test2.to(device)

model = multimodal_models.MultimodalTransformer(
            source_index,
            target_index,
            max_sequence_length = 800,
            embedding_dimension = 128,
            # One input feature per descriptor column.
            descriptors_dimension=train_descriptors.shape[1],
            feedforward_dimension = 256,
            encoder_layers = 4,
            decoder_layers = 3,
            attention_heads = 4,
            activation = "relu",
            dropout = 0.0)
# Keep the model on the same device as its inputs.
model.to(device)
# Train on the expanded training split, validating on the expanded validation split.
q = model.fit(X_train,
        train_descriptors,
        y_train,
        X_val,
        val_descriptors,
        y_val,
        batch_size = 32,
        epochs = 150,
        learning_rate = 0.0001,
        weight_decay = 1e-05,
        progress_bar = 0,
        save_path = None)

# NOTE(review): fit() is called with save_path=None, yet the checkpoint is read
# from a fixed file name. Presumably fit() writes "best_multimodalmodel.pth"
# on its own. Confirm this, otherwise a stale checkpoint from an earlier run
# is loaded here.
# map_location lets a checkpoint saved on a GPU be read on a CPU-only machine.
model.load_state_dict(torch.load(
    "best_multimodalmodel.pth",
    map_location="cuda" if torch.cuda.is_available() else "cpu",
    weights_only=True))

# Context manager so the pickle file is flushed and closed deterministically.
with open('modelMultimodalTransformer.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

loss, error_rate = model.evaluate(X_test, test_descriptors, y_test, batch_size = 32)

BEAM_WIDTH = 10          # candidates requested per compound
ATC_LEVEL4_LENGTH = 5    # number of characters of a level-4 ATC code

# Beam search on the raw test split (one row per compound).
predictions, log_probabilities = search_algorithms.multimodal_beam_search(
    model,
    X_test2,
    test_descriptors2,
    predictions = 6, # max length of the predicted sequence
    beam_width = BEAM_WIDTH,
    batch_size = 32,
    progress_bar = 0
)
output_beam = [target_index.tensor2text(prediction) for prediction in predictions]

# Strip the sequence markers and keep only candidates that have the length of
# a level-4 code, preserving the order returned by the beam search.
predictions_clean = []
for candidates in output_beam:
    stripped = (
        candidate.replace('<START>', '').replace('<END>', '')
        for candidate in candidates
    )
    level4_codes = [code for code in stripped if len(code) == ATC_LEVEL4_LENGTH]
    if len(level4_codes) < BEAM_WIDTH:
        print("The model predicted less than 10 ATC codes of level 4 for a compound")
    predictions_clean.append(level4_codes)



Model: Seq2Seq Multimodal Transformer
Source index: <Seq2Seq Index with 46 items>
Target index: <Seq2Seq Index with 34 items>
Max sequence length: 800
Embedding dimension: 128
Descriptors dimension: 1136
Feedforward dimension: 256
Encoder layers: 4
Decoder layers: 3
Attention heads: 4
Activation: relu
Dropout: 0.0
Trainable parameters: 1,405,730

Training started
X_train.shape: torch.Size([3057, 702])
Y_train.shape: torch.Size([3057, 7])
X_dev.shape: torch.Size([521, 307])
Y_dev.shape: torch.Size([521, 7])
Epochs: 150
Learning rate: 0.0001
Weight decay: 1e-05
Epoch | Train                 | Development           | Minutes
      | Loss     | Error Rate | Loss     | Error Rate |
---------------------------------------------------------------
    1 |   1.9756 |     51.674 |   1.4534 |     43.250 |     0.2
    2 |   1.3371 |     41.260 |   1.2817 |     39.507 |     0.4
    3 |   1.1841 |     36.223 |   1.1819 |     37.332 |     0.6
    4 |   1.0768 |     33.426 |   1.1037 |     35.285 |   

In [6]:
def _mean(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


# One (precisions, recalls, f1s) triple per value 1..10 of the last argument
# of complete_metrics (presumably the top-k cut-off; confirm in defined_metrics).
metrics_at_k = [
    defined_metrics.complete_metrics(predictions_clean, 'test_set.csv', 'ATC Codes', k)
    for k in range(1, 11)
]

# Matching (precision, recall, f1) averages, in the same order.
averages_at_k = [
    tuple(_mean(scores) for scores in metric_triple)
    for metric_triple in metrics_at_k
]

# Bind the per-k names that the following cells read.
(
    (precisions1, recalls1, f1s1),
    (precisions2, recalls2, f1s2),
    (precisions3, recalls3, f1s3),
    (precisions4, recalls4, f1s4),
    (precisions5, recalls5, f1s5),
    (precisions6, recalls6, f1s6),
    (precisions7, recalls7, f1s7),
    (precisions8, recalls8, f1s8),
    (precisions9, recalls9, f1s9),
    (precisions10, recalls10, f1s10),
) = metrics_at_k

(
    (precisions_average1, recalls_average1, f1s_average1),
    (precisions_average2, recalls_average2, f1s_average2),
    (precisions_average3, recalls_average3, f1s_average3),
    (precisions_average4, recalls_average4, f1s_average4),
    (precisions_average5, recalls_average5, f1s_average5),
    (precisions_average6, recalls_average6, f1s_average6),
    (precisions_average7, recalls_average7, f1s_average7),
    (precisions_average8, recalls_average8, f1s_average8),
    (precisions_average9, recalls_average9, f1s_average9),
    (precisions_average10, recalls_average10, f1s_average10),
) = averages_at_k

In [7]:
# Average precision / recall / F1 ordered by k = 1..10 (position i holds k = i + 1).
precisions_average_k = [
    precisions_average1, precisions_average2, precisions_average3,
    precisions_average4, precisions_average5, precisions_average6,
    precisions_average7, precisions_average8, precisions_average9,
    precisions_average10,
]
recalls_average_k = [
    recalls_average1, recalls_average2, recalls_average3,
    recalls_average4, recalls_average5, recalls_average6,
    recalls_average7, recalls_average8, recalls_average9,
    recalls_average10,
]
f1s_average_k = [
    f1s_average1, f1s_average2, f1s_average3,
    f1s_average4, f1s_average5, f1s_average6,
    f1s_average7, f1s_average8, f1s_average9,
    f1s_average10,
]

In [8]:
RESULTS_PATH = "df_metrics.csv"
MODEL_NAME = "Multimodal Transformer"

# One row per entry of the *_average_k lists, in list order.
new_rows = pd.DataFrame({
    "model": MODEL_NAME,
    "precision": precisions_average_k,
    "recall": recalls_average_k,
    "f1": f1s_average_k,
})

if os.path.exists(RESULTS_PATH):
    previous_results = pd.read_csv(RESULTS_PATH)
    # Drop rows written for this model by an earlier run, so that re-running
    # the notebook replaces them instead of appending duplicates.
    if "model" in previous_results.columns:
        previous_results = previous_results[previous_results["model"] != MODEL_NAME]
    df_results = pd.concat([previous_results, new_rows], ignore_index=True)
else:
    # First run: there is no results file to extend yet.
    df_results = new_rows

df_results.to_csv(RESULTS_PATH, index= False)