In [1]:
import pickle
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
from seq2seq import *
import pandas as pd
import numpy as np
import torch
import random
import itertools
from sklearn.preprocessing import StandardScaler

In [2]:
def set_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random`` module, NumPy and PyTorch (CPU and all
    CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    # The hyperparameter search draws its configurations with random.sample,
    # so the stdlib generator has to be seeded as well; otherwise a different
    # set of configurations is evaluated on every run.
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Prefer run-to-run reproducibility over cuDNN's benchmark-based
    # algorithm selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
def convert_string_list(element):
    """Split a '; '-separated string of ATC codes into a list of codes.

    The string is split on the exact separator ``'; '``. No brackets, quotes
    or whitespace are stripped from the resulting items.

    Args:
        element: String such as ``'A01AB; B02CD'``.

    Returns:
        list[str]: The codes in their original order. A string that does not
        contain the separator yields a one-element list.
    """
    return element.split('; ')

In [4]:
# Load the train / test / validation splits (paths are relative to the
# notebook's working directory).
train_set = pd.read_csv('../../Data/train_set.csv')
test_set = pd.read_csv('../../Data/test_set.csv')
val_set = pd.read_csv('../../Data/val_set.csv')
# Seed the generators handled by set_seeds before any model is built.
set_seeds(78)

In [5]:
def multiplicate_rows(df):
    """Expand a dataset so that every row carries exactly one 5-character ATC code.

    Each row's ``'ATC Codes'`` string is split with ``convert_string_list``.
    For every resulting code that is exactly 5 characters long, a copy of the
    row is emitted with ``'ATC Codes'`` replaced by that single code. Codes of
    any other length are dropped, so a row without any 5-character code
    disappears from the result.

    Args:
        df: DataFrame with an ``'ATC Codes'`` column of '; '-separated strings.

    Returns:
        pandas.DataFrame: The expanded rows with a fresh 0..n-1 index and the
        same columns as ``df`` (also when no row qualifies).
    """
    new_rows = []

    for _, row in df.iterrows():
        for code in convert_string_list(row['ATC Codes']):
            if len(code) != 5:
                continue
            new_row = row.copy()
            new_row['ATC Codes'] = code
            new_rows.append(new_row)

    # Passing the input columns keeps the schema when new_rows is empty;
    # without it the result would have no columns and a later
    # df['ATC Codes'] lookup would raise KeyError.
    new_set = pd.DataFrame(new_rows, columns=df.columns)
    return new_set.reset_index(drop=True)
    
# Create vocabularies
# Tokenize the data
def source(df):
    """Tokenize the SMILES column character by character.

    Args:
        df: DataFrame with a ``'Neutralized SMILES'`` column of strings.

    Returns:
        list[list[str]]: One list of single characters per row, in row order.
    """
    return [list(smiles) for smiles in df['Neutralized SMILES']]
def target(df):
    """Tokenize the ATC codes column character by character.

    Every entry of ``'ATC Codes'`` is split with ``convert_string_list`` and
    each resulting code contributes its own list of characters, so a row that
    holds several codes yields several entries.

    Args:
        df: DataFrame with an ``'ATC Codes'`` column of '; '-separated strings.

    Returns:
        list[list[str]]: One list of single characters per code, in row order.
    """
    return [
        list(atc_code)
        for codes in df['ATC Codes']
        for atc_code in convert_string_list(codes)
    ]

In [6]:
# Expand every split so that each row holds a single 5-character ATC code.
new_train_set = multiplicate_rows(train_set)
new_val_set = multiplicate_rows(val_set)
new_test_set = multiplicate_rows(test_set)

# Persist the expanded test/validation splits; random_search later passes
# "onecodeperdrug_val_set.csv" to defined_metrics.precisionatn.
new_test_set.to_csv("onecodeperdrug_test_set.csv", index = False)
new_val_set.to_csv("onecodeperdrug_val_set.csv", index = False)

# Character-level tokens of the SMILES strings (model inputs).
source_train = source(new_train_set)
source_test = source(new_test_set)
# Test set without duplicated compounds
source_test2 = source(test_set)
source_val = source(new_val_set)
# Val set without duplicated compounds
source_val2 = source(val_set)

# Character-level tokens of the ATC codes (model targets), one entry per code.
target_train = target(new_train_set)
target_test = target(new_test_set)
target_val = target(new_val_set)

# An Index object represents a mapping from the vocabulary to integers (indices) to feed into the models
# Both vocabularies are built from the training split only.
source_index = index.Index(source_train)
target_index = index.Index(target_train)

# Create tensors
# The "2" variants come from the original (non-expanded) val/test sets.
X_train = source_index.text2tensor(source_train)
y_train = target_index.text2tensor(target_train)
X_val = source_index.text2tensor(source_val)
X_val2 = source_index.text2tensor(source_val2)
y_val = target_index.text2tensor(target_val)     
X_test = source_index.text2tensor(source_test)
X_test2 = source_index.text2tensor(source_test2)
y_test = target_index.text2tensor(target_test)

# Move every tensor to the GPU when one is available; otherwise they stay
# where text2tensor created them.
if torch.cuda.is_available():
    (X_train, y_train,
     X_val, X_val2, y_val,
     X_test, y_test, X_test2) = [
        tensor.to("cuda")
        for tensor in (X_train, y_train,
                       X_val, X_val2, y_val,
                       X_test, y_test, X_test2)
    ]

In [7]:
# Search space for the random search: each key maps to the candidate values
# of one hyperparameter, and random_search draws one value per key.
hyperparameters_grid = { 
    # Transformer architecture
    'embedding_dim': [32, 64, 128],
    'feedforward_dim': [64, 128, 256],
    'enc_layers': [2, 3, 4],
    'dec_layers': [2, 3, 4],
    'attention_heads': [2, 4],
    'dropout': [0.0, 0.1, 0.2],
    # Optimisation settings (passed to model.fit as weight_decay / learning_rate)
    'weight_decays': [10**-4, 10**-5],
    'learning_rates': [10**-3, 10**-4]
}


In [8]:
# Randomly sample from dictionary
# Draws one value per hyperparameter with the same expression random_search
# uses, then prints the sampled embedding dimension as a quick sanity check.
random_params = {k: random.sample(v, 1)[0] for k, v in hyperparameters_grid.items()}
print(random_params['embedding_dim'])

32


In [9]:
def _keep_five_char_codes(beam_outputs):
    """Strip the <START>/<END> markers from decoded beams and keep only 5-character codes."""
    cleaned = []
    for preds in beam_outputs:
        codes = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            if len(clean_pred) == 5:
                codes.append(clean_pred)
        cleaned.append(codes)
    return cleaned


def random_search(max_evals):
    """Run a random hyperparameter search for the Transformer model.

    For each of ``max_evals`` iterations a not-yet-tested configuration is
    drawn from ``hyperparameters_grid``, and a ``models.Transformer`` is
    trained on ``X_train``/``y_train`` with ``X_val``/``y_val`` as validation
    data. Beam-search predictions are then scored on the validation split with
    ``defined_metrics.precisionatn`` / ``precision`` / ``recall``.

    Side effects:
        * Everything printed while the search runs is written to ``log.txt``.
        * ``transformer_results.csv`` is rewritten after every iteration, so
          partial results survive an interrupted run.

    Args:
        max_evals: Number of distinct configurations to evaluate.

    Returns:
        pandas.DataFrame: One row per evaluated configuration. Every cell is
        stored as a string.

    Raises:
        ValueError: If ``max_evals`` exceeds the number of distinct
            configurations in ``hyperparameters_grid`` (the sampling loop
            could never find enough untested configurations).
    """
    import contextlib

    n_combinations = 1
    for values in hyperparameters_grid.values():
        n_combinations *= len(values)
    if max_evals > n_combinations:
        raise ValueError(
            f"max_evals ({max_evals}) exceeds the number of distinct "
            f"hyperparameter combinations ({n_combinations})"
        )

    # The data tensors are only moved to CUDA when it is available, so the
    # model has to follow the same rule.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tested_params = set()
    df_tests = pd.DataFrame(
        columns = ['#epochs', 'embedding_dim', 'feedforward_dim', 'enc_layers', 'dec_layers',
                   'attention_heads', 'dropout', 'weight_decay', 'learning_rate',
                   'Precisionatn nivel1', 'Precisionatn nivel2', 'Precisionatn nivel3', 'Precisionatn nivel4',
                   'Precision nivel1', 'Precision nivel2', 'Precision nivel3', 'Precision nivel4',
                   'Recall nivel1', 'Recall nivel2', 'Recall nivel3', 'Recall nivel4',
                   'Drugs that have at least one match'],
        index = list(range(max_evals)),
    )

    # redirect_stdout puts back whatever stream was active before (the
    # notebook's own stream when run in Jupyter), also when an iteration
    # raises, and the with-block closes the log file.
    with open('log.txt', 'w') as log_file, contextlib.redirect_stdout(log_file):
        for i in range(max_evals):
            # Re-draw until a configuration that has not been tested yet comes up.
            while True:
                random_params = {k: random.sample(v, 1)[0] for k, v in hyperparameters_grid.items()}
                params_tuple = tuple(random_params.values())
                if params_tuple not in tested_params:
                    tested_params.add(params_tuple)
                    break

            model = models.Transformer(
                source_index,
                target_index,
                max_sequence_length = 800,
                embedding_dimension = random_params['embedding_dim'],
                feedforward_dimension = random_params['feedforward_dim'],
                encoder_layers = random_params['enc_layers'],
                decoder_layers = random_params['dec_layers'],
                attention_heads = random_params['attention_heads'],
                activation = "relu",
                dropout = random_params['dropout'],
            )
            model.to(device)
            model.fit(
                X_train,
                y_train,
                X_val,
                y_val,
                batch_size = 32,
                epochs = 500,
                learning_rate = random_params['learning_rates'],
                weight_decay = random_params['weight_decays'],
                progress_bar = 0,
                save_path = None,
            )
            # NOTE(review): fit is called with save_path=None, yet the weights
            # are reloaded from "best_model.pth" -- presumably the early-stopping
            # logic writes that file; confirm it is refreshed on every fit.
            model.load_state_dict(torch.load("best_model.pth", weights_only=True))
            ep = model.early_stopping.best_epoch
            # The returned loss / error rate are not recorded; the call is kept
            # so that anything it prints still ends up in log.txt.
            model.evaluate(X_val, y_val)

            # Beam search on the expanded validation split (one code per row).
            beams_expanded, _ = search_algorithms.beam_search(
                model,
                X_val,
                predictions = 6, # max length of the predicted sequence
                beam_width = 3,
                batch_size = 32,
                progress_bar = 0,
            )
            output_beam = [target_index.tensor2text(p) for p in beams_expanded]

            # Beam search on the original validation split (one row per compound).
            beams_original, _ = search_algorithms.beam_search(
                model,
                X_val2,
                predictions = 6, # max length of the predicted sequence
                beam_width = 3,
                batch_size = 32,
                progress_bar = 0,
            )
            output_beam2 = [target_index.tensor2text(p) for p in beams_original]

            predictions_onecodeperdrug = _keep_five_char_codes(output_beam)
            predictions = _keep_five_char_codes(output_beam2)

            precisionatn_1, precisionatn_2, precisionatn_3, precisionatn_4 = defined_metrics.precisionatn(predictions_onecodeperdrug, "onecodeperdrug_val_set.csv", 'ATC Codes')
            precision_1, precision_2, precision_3, precision_4 = defined_metrics.precision(predictions, "../../Data/val_set.csv", 'ATC Codes')
            recall_1, recall_2, recall_3, recall_4, comp = defined_metrics.recall(predictions, "../../Data/val_set.csv", 'ATC Codes')

            row_values = [
                ep,
                random_params['embedding_dim'],
                random_params['feedforward_dim'],
                random_params['enc_layers'],
                random_params['dec_layers'],
                random_params['attention_heads'],
                random_params['dropout'],
                random_params['weight_decays'],
                random_params['learning_rates'],
                precisionatn_1, precisionatn_2, precisionatn_3, precisionatn_4,
                precision_1, precision_2, precision_3, precision_4,
                recall_1, recall_2, recall_3, recall_4,
                comp,
            ]
            # Cells are stored as formatted strings, exactly as before.
            df_tests.iloc[i, :] = [f"{value}" for value in row_values]
            df_tests.to_csv("transformer_results.csv", index = False)

    return df_tests

In [10]:
# Evaluate 200 randomly sampled hyperparameter configurations.
df_tests = random_search(200)



In [11]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# from matplotlib.lines import Line2D

# # Load data
# df_res = pd.read_csv("results_transformer.csv")
# df_res = df_res[["embedding_dim", "enc_layers", "dec_layers", "attention_heads", "dropout", 
#                  "weight_decay", "learning_rate", "Precision nivel1", "Recall nivel1"]]

# size_map = {32: 100, 64: 200, 128: 300}
# df_res['embedding dimension'] = df_res['embedding_dim'].map(size_map)
# # Normalize the point sizes
# # df_res['embedding dimension'] = (df_res['embedding_dim'] - df_res['embedding_dim'].min()) / (df_res['embedding_dim'].max() - df_res['embedding_dim'].min()) * 100 + 50

# df_res['encoder and decoder layers'] = df_res.apply(
#     lambda row: f'enc_{int(row["enc_layers"])} - dec_{int(row["dec_layers"])}', axis=1)
# df_res['weight_decay and learning_rate'] = df_res.apply(
#     lambda row: f'WD_{row["weight_decay"]} - LR_{row["learning_rate"]}', axis=1)

# # Map 'dropout' to marker shapes
# dropout_map = {0.0: 'o', 0.1: '^', 0.2: 's'}
# df_res['dropout value'] = df_res['dropout'].map(dropout_map)

# df_res['edgewidth'] = df_res['attention_heads'].map({2: 1.0, 4: 2.0})
# color_palette = sns.color_palette("Pastel1", n_colors=df_res['encoder and decoder layers'].nunique())

# # Color map for 'weight_decay and learning_rate'
# unique_wd_lr = df_res['weight_decay and learning_rate'].unique()
# color_palette2 = sns.color_palette("Set1", n_colors=len(unique_wd_lr))
# wd_lr_color_map = dict(zip(unique_wd_lr, color_palette2))

# # Color map for 'encoder and decoder layers'
# unique_enc_dec = df_res['encoder and decoder layers'].unique()
# enc_dec_color_map = dict(zip(unique_enc_dec, color_palette))

# plt.figure(figsize=(12, 12))

# # Draw the points with Matplotlib
# for i, row in df_res.iterrows():
#     plt.scatter(
#         x=row['Precision nivel1'],
#         y=row['Recall nivel1'],
#         s=row['embedding dimension'],  # Tamaño de los puntos
#         color=enc_dec_color_map[row['encoder and decoder layers']],  # Color del punto
#         marker=row['dropout value'],  # Forma del punto
#         edgecolor=wd_lr_color_map[row['weight_decay and learning_rate']],  # Borde basado en mapa precalculado
#         linewidth=row['edgewidth'],  # Grosor del borde
#         alpha=0.7
#     )
    
# legend_elements = [
#     Line2D([0], [0], color='none', label='Dropout'),
#     Line2D([0], [0], marker='o', color='w', markerfacecolor='black', markersize=10, label='Dropout 0.0'),
#     Line2D([0], [0], marker='^', color='w', markerfacecolor='black', markersize=10, label='Dropout 0.1'),
#     Line2D([0], [0], marker='s', color='w', markerfacecolor='black', markersize=10, label='Dropout 0.2'),
#     Line2D([0], [0], color='none', label='Attention heads'),
#     Line2D([0], [0], color='black', linewidth=1.0, label='2 attention heads'),
#     Line2D([0], [0], color='black', linewidth=2.0, label='4 attention heads'),
#     Line2D([0], [0], color='none', label='Encoder - Decoder layers'),
#     *[Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=label) for label, color in enc_dec_color_map.items()],
#     Line2D([0], [0], color='none', label='Weight decay - Learning rate'),
#     *[Line2D([0], [0], marker='o', color='w', markeredgecolor=color, markerfacecolor='none', markersize=10, label=label) for label, color in wd_lr_color_map.items()]
# ]

# size_mapping = {
#     100: "32",
#     200: "64",
#     300: "128"
# }

# size_legend = [
#     Line2D([0], [0], marker='o', color='w', markerfacecolor='black', markersize=size/18, label=f'{size_mapping.get(int(size))}') 
#     for size in df_res['embedding dimension'].unique() 
# ]

# plt.legend(title='Hiperparámetros', handles=[Line2D([0], [0], color='none', label='Embedding Dimension')] + size_legend + legend_elements, loc='upper left', bbox_to_anchor=(1.05, 1))

# plt.title("Combinación de hiperparámetros", fontsize=16)
# plt.xlabel("Precision nivel 1", fontsize=14)
# plt.ylabel("Recall nivel 1", fontsize=14)
# plt.tight_layout()
# plt.savefig('multimodaltransformer_hyperparams.png', bbox_inches='tight')
# plt.show()


In [12]:
# random_search stores every cell as a string, so a plain sort would be
# lexicographic (e.g. "1e-05" after "0.9"). Sort on the numeric value instead;
# entries that cannot be parsed become NaN and are placed last.
df_tests.sort_values(
    by = "Precision nivel1",
    key = lambda column: pd.to_numeric(column, errors = "coerce"),
)

Unnamed: 0,#epochs,embedding_dim,feedforward_dim,enc_layers,dec_layers,attention_heads,dropout,weight_decay,learning_rate,Precisionatn nivel1,...,Precisionatn nivel4,Precision nivel1,Precision nivel2,Precision nivel3,Precision nivel4,Recall nivel1,Recall nivel2,Recall nivel3,Recall nivel4,Drugs that have at least one match
70,12,128,256,4,2,4,0.0,0.0001,0.001,8.823529411764707,...,28.57142857142857,0.09819121447028424,0.7894736842105263,0.3,0.3409090909090909,0.09409991386735572,0.7719298245614036,0.7333333333333333,0.3181818181818182,"[38, 30, 22, 8]"
161,23,128,256,4,3,4,0.1,0.0001,0.001,24.080882352941178,...,54.166666666666664,0.13781223083548674,0.4764150943396227,0.37719298245614036,0.45454545454545453,0.24707149009474594,0.5157232704402516,0.35964912280701755,0.5909090909090909,"[106, 57, 22, 13]"
9,28,64,256,4,2,2,0.2,1e-05,0.001,27.941176470588236,...,14.705882352941178,0.1645133505598623,0.3142857142857143,0.6590909090909091,0.1206896551724138,0.3338501291989664,0.30714285714285716,0.6590909090909091,0.11494252873563217,"[140, 44, 29, 5]"
176,21,128,256,3,4,2,0.2,0.0001,0.001,20.772058823529413,...,55.319148936170215,0.1670973298880275,0.629251700680272,0.5,0.4349593495934959,0.23802756244616713,0.6768707482993198,0.5735294117647058,0.573170731707317,"[98, 68, 41, 25]"
29,51,32,128,2,3,4,0.1,1e-05,0.0001,19.852941176470587,...,53.48837209302325,0.16968130921619298,0.6217228464419475,0.4385964912280703,0.4722222222222222,0.2140396210163652,0.6217228464419475,0.631578947368421,0.5277777777777778,"[89, 57, 36, 22]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,13,128,128,2,3,2,0.0,1e-05,0.001,46.50735294117647,...,73.77049180327869,0.3634797588285962,0.5260416666666666,0.7741046831955924,0.5760517799352751,0.46642262417456204,0.6038049768518519,0.8059425423061786,0.7475728155339806,"[192, 121, 103, 82]"
13,16,32,256,2,2,4,0.0,0.0001,0.001,48.345588235294116,...,61.904761904761905,0.36692506459948315,0.5718954248366014,0.7230392156862746,0.5015015015015016,0.4978466838931955,0.6435185185185185,0.7677345938375351,0.6066066066066066,"[204, 136, 111, 71]"
148,34,128,128,2,4,2,0.2,0.0001,0.001,53.125,...,60.810810810810814,0.37984496124031003,0.6219135802469138,0.7644444444444444,0.4756410256410257,0.5195664656904967,0.6756044238683127,0.8025714285714287,0.6076923076923076,"[216, 150, 130, 83]"
47,18,64,64,2,4,4,0.0,0.0001,0.001,54.96323529411765,...,71.0691823899371,0.37984496124031014,0.6374807987711215,0.7311827956989247,0.6256410256410257,0.526457077232271,0.6874039938556067,0.7903225806451611,0.7289743589743589,"[217, 155, 130, 100]"


In [13]:
# random_search stores every cell as a string, so the sort key converts the
# column to numbers; a plain sort would order the values lexicographically.
# Entries that cannot be parsed become NaN and are placed last.
df_tests.sort_values(
    by = "Precision nivel1",
    key = lambda column: pd.to_numeric(column, errors = "coerce"),
).to_csv("transformer_sortedresults.csv", index = False)