In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split 
from collections import Counter

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
from seq2seq import *

In [2]:
import torch
# Environment check: first line is whether torch can see a CUDA device,
# second line is the CUDA version string reported by torch.version.
print(torch.cuda.is_available(), torch.version.cuda, sep='\n')

True
11.8


In [3]:
def convert_string_list(element):
    """Split a '; '-separated string of ATC codes into a list of codes.

    Args:
        element: String holding one or more codes separated by '; '
            (a semicolon followed by a single space), e.g. 'A01AA; B02BB'.

    Returns:
        list[str]: The codes in their original order. A string that does not
        contain the separator yields a one-element list.

    Note:
        Nothing else is stripped: brackets, quotes or extra whitespace that
        are present in ``element`` stay in the returned items.
    """
    return element.split('; ')

def set_seeds(seed):
    """Seed the global random number generators so runs are repeatable.

    Seeds NumPy's legacy global generator (the one behind ``np.random.*``
    calls such as ``np.random.choice``) and Python's built-in ``random``
    module.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Function-scope import keeps the helper self-contained.
    import random

    random.seed(seed)
    np.random.seed(seed)

def create_partitions(df, seed):
    """Split compounds into train/validation/test sets, keeping multi-code and
    single-code compounds balanced across the three sets.

    Compounds are grouped by whether their 'ATC Codes' entry holds more than
    one code. Each group is split on its own: 20 % goes to test, then 15 % of
    what is left goes to validation. The matching pieces of both groups are
    concatenated and shuffled.

    Args:
        df: DataFrame with an 'ATC Codes' column of '; '-separated codes.
            It is not modified; the helper column is added to a copy.
        seed: Random state passed to every split and shuffle.

    Returns:
        tuple: ``(train_set, val_set, test_set)`` DataFrames. Each one carries
        an extra boolean column 'multiple_ATC' (True when the compound has
        more than one code).
    """
    # True for compounds with more than one ATC code associated
    has_multiple = df['ATC Codes'].apply(lambda codes: len(convert_string_list(codes)) > 1)
    # assign() returns a new frame, so the caller's DataFrame keeps its columns
    flagged = df.assign(multiple_ATC=has_multiple)

    group_more_than_one = flagged[flagged['multiple_ATC']].reset_index(drop=True)
    group_one = flagged[~flagged['multiple_ATC']].reset_index(drop=True)

    # Carve out the test subset of each group first ...
    train_more, test_more = train_test_split(group_more_than_one, test_size=0.2, random_state=seed)
    train_one, test_one = train_test_split(group_one, test_size=0.2, random_state=seed)
    # ... then take the validation subset from the remaining training rows
    train_more, val_more = train_test_split(train_more, test_size=0.15, random_state=seed)
    train_one, val_one = train_test_split(train_one, test_size=0.15, random_state=seed)

    # Merge both groups and shuffle so they are interleaved
    train_set = shuffle(pd.concat([train_more, train_one]), random_state=seed)
    val_set = shuffle(pd.concat([val_more, val_one]), random_state=seed)
    test_set = shuffle(pd.concat([test_more, test_one]), random_state=seed)
    return train_set, val_set, test_set

def multiplicate_rows(df):
    """Expand each compound into one row per 5-character ATC code.

    The 'ATC Codes' entry of every row is split with ``convert_string_list``;
    the row is repeated once per resulting code, with 'ATC Codes' replaced by
    that single code and every other column copied. Codes whose length is not
    exactly 5 are dropped, so a compound with no 5-character code disappears.

    Args:
        df: DataFrame with an 'ATC Codes' column of '; '-separated codes.
            It is not modified.

    Returns:
        pandas.DataFrame: The expanded rows, in input order, with a fresh
        0..n-1 index and the same columns as ``df`` (also when no row
        survives the length filter).
    """
    # A clean positional index keeps explode() and the mask below unambiguous
    base = df.reset_index(drop=True)
    code_lists = base['ATC Codes'].apply(convert_string_list)

    # One row per code; the other columns are repeated alongside
    expanded = base.assign(**{'ATC Codes': code_lists}).explode('ATC Codes')

    # Positional mask, because the exploded index contains repeated labels
    keep = (expanded['ATC Codes'].map(len) == 5).to_numpy()
    return expanded[keep].reset_index(drop=True)

In [4]:
# Random baseline: for every test compound, draw candidate ATC codes by
# sampling each code level independently from its frequency in the training set.

N_SAMPLED_CODES = 20  # candidate codes drawn per test compound
MAX_PREDICTIONS = 10  # candidates kept per compound after filtering
ATC_CODE_LENGTH = 5   # characters in a code: 1 + 2 + 1 + 1 for levels 1 to 4
# Character span of each of the four levels inside a 5-character code
ATC_LEVEL_SLICES = (slice(0, 1), slice(1, 3), slice(3, 4), slice(4, 5))


def _empirical_probabilities(symbols):
    """Return ``{symbol: relative frequency}`` in first-seen order."""
    total = len(symbols)
    return {symbol: count / total for symbol, count in Counter(symbols).items()}


def _sample_symbols(probabilities, shape):
    """Draw an array of ``shape`` symbols following ``probabilities``.

    Uses NumPy's global generator, so the result depends on the seed set
    with ``set_seeds`` and on how many draws were made before this call.
    """
    return np.random.choice(
        list(probabilities.keys()), size=shape, p=list(probabilities.values())
    )


set_seeds(78)

train_set = pd.read_csv('train_set.csv')
test_set = pd.read_csv('test_set.csv')
val_set = pd.read_csv('val_set.csv')

# One row per (compound, 5-character code)
new_train_set = multiplicate_rows(train_set)
new_val_set = multiplicate_rows(val_set)
new_test_set = multiplicate_rows(test_set)

X_train = new_train_set['Neutralized SMILES']
y_train = new_train_set['ATC Codes']
X_test = new_test_set['Neutralized SMILES']
X_test2 = test_set['Neutralized SMILES']  # one entry per compound (not expanded)
y_test = new_test_set['ATC Codes']

# Symbols observed at each level of the training codes
atc_nivel1, atc_nivel2, atc_nivel3, atc_nivel4 = (
    [code[level] for code in y_train] for level in ATC_LEVEL_SLICES
)
_level_symbols = (atc_nivel1, atc_nivel2, atc_nivel3, atc_nivel4)

y_train_nivel1, y_train_nivel2, y_train_nivel3, y_train_nivel4 = (
    pd.DataFrame(symbols) for symbols in _level_symbols
)

# Frequency of each symbol, per level
conteo1, conteo2, conteo3, conteo4 = (Counter(symbols) for symbols in _level_symbols)
total_elementos1, total_elementos2, total_elementos3, total_elementos4 = (
    len(symbols) for symbols in _level_symbols
)

# Probability of each symbol, per level
probabilidades1, probabilidades2, probabilidades3, probabilidades4 = (
    _empirical_probabilities(symbols) for symbols in _level_symbols
)

# Draw the levels in order 1 -> 4: the order fixes which random numbers each
# level consumes, so it must not change if results are to stay reproducible.
_sample_shape = (len(X_test2), N_SAMPLED_CODES)
nivel1, nivel2, nivel3, nivel4 = (
    _sample_symbols(probabilities, _sample_shape)
    for probabilities in (probabilidades1, probabilidades2, probabilidades3, probabilidades4)
)

# Glue the four sampled levels together, position by position
predictions = [
    [c1 + c2 + c3 + c4 for c1, c2, c3, c4 in zip(row1, row2, row3, row4)]
    for row1, row2, row3, row4 in zip(nivel1, nivel2, nivel3, nivel4)
]

# Same cleaning as before: strip the '<START>'/'<END>' markers, keep only
# 5-character codes and at most the first MAX_PREDICTIONS of them.
# The sampled codes are at most 5 characters long, so the markers cannot
# actually occur here; the step is kept so the output format is unchanged.
predictions_clean = []
for preds in predictions:
    stripped = (pred.replace('<START>', '').replace('<END>', '') for pred in preds)
    valid = [pred for pred in stripped if len(pred) == ATC_CODE_LENGTH]
    predictions_clean.append(valid[:MAX_PREDICTIONS])

In [5]:
# Evaluate the random baseline with the last argument of complete_metrics
# running from 1 to 10 (presumably the top-k cutoff; confirm in defined_metrics).
metrics_by_k = [
    defined_metrics.complete_metrics(predictions_clean, 'test_set.csv', 'ATC Codes', k)
    for k in range(1, 11)
]

# Numbered names kept because the next cell reads them individually
(
    (precisions1, recalls1, f1s1),
    (precisions2, recalls2, f1s2),
    (precisions3, recalls3, f1s3),
    (precisions4, recalls4, f1s4),
    (precisions5, recalls5, f1s5),
    (precisions6, recalls6, f1s6),
    (precisions7, recalls7, f1s7),
    (precisions8, recalls8, f1s8),
    (precisions9, recalls9, f1s9),
    (precisions10, recalls10, f1s10),
) = metrics_by_k


def _mean(values):
    """Arithmetic mean; raises ZeroDivisionError when ``values`` is empty."""
    return sum(values) / len(values)


# Regroup as one sequence per metric, each ordered by k = 1..10
precisions_by_k, recalls_by_k, f1s_by_k = zip(*metrics_by_k)

(
    precisions_average1, precisions_average2, precisions_average3,
    precisions_average4, precisions_average5, precisions_average6,
    precisions_average7, precisions_average8, precisions_average9,
    precisions_average10,
) = (_mean(values) for values in precisions_by_k)

(
    recalls_average1, recalls_average2, recalls_average3,
    recalls_average4, recalls_average5, recalls_average6,
    recalls_average7, recalls_average8, recalls_average9,
    recalls_average10,
) = (_mean(values) for values in recalls_by_k)

(
    f1s_average1, f1s_average2, f1s_average3,
    f1s_average4, f1s_average5, f1s_average6,
    f1s_average7, f1s_average8, f1s_average9,
    f1s_average10,
) = (_mean(values) for values in f1s_by_k)

In [6]:
# Gather the per-cutoff averages into one list per metric, ordered 1..10
precisions_average_k = [
    precisions_average1, precisions_average2, precisions_average3,
    precisions_average4, precisions_average5, precisions_average6,
    precisions_average7, precisions_average8, precisions_average9,
    precisions_average10,
]
recalls_average_k = [
    recalls_average1, recalls_average2, recalls_average3,
    recalls_average4, recalls_average5, recalls_average6,
    recalls_average7, recalls_average8, recalls_average9,
    recalls_average10,
]
f1s_average_k = [
    f1s_average1, f1s_average2, f1s_average3,
    f1s_average4, f1s_average5, f1s_average6,
    f1s_average7, f1s_average8, f1s_average9,
    f1s_average10,
]

In [7]:
# Append this model's averaged metrics (one row per cutoff, in list order)
# to the shared results file.
RANDOM_MODEL_NAME = "Random Classifier"
METRICS_CSV = "df_metrics.csv"

df_results = pd.read_csv(METRICS_CSV)

# Re-running this cell used to append the same ten rows again. Rows written
# for this model by an earlier run are dropped first, so the file ends up
# with exactly one set of rows for it.
if "model" in df_results.columns:
    df_results = df_results[df_results["model"] != RANDOM_MODEL_NAME]

new_rows = pd.DataFrame(
    {
        "model": RANDOM_MODEL_NAME,
        "precision": precisions_average_k,
        "recall": recalls_average_k,
        "f1": f1s_average_k,
    }
)
df_results = pd.concat([df_results, new_rows], ignore_index=True)
df_results.to_csv(METRICS_CSV, index=False)