In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split 
from collections import Counter

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../..')))
from seq2seq import *

In [2]:
# Convert a string that simulates a list to a real list
def convert_string_list(element):
    # Delete [] of the string
    element = element[0:len(element)]
    # Create a list that contains each code as e.g. 'A'
    ATC_list = list(element.split('; '))
    for index, code in enumerate(ATC_list):
        # Delete '' of the code
        ATC_list[index] = code[0:len(code)] 
    return ATC_list

def set_seeds(seed):
    np.random.seed(seed)

def multiplicate_rows(df):
    # Duplicate each compound the number of ATC codes associated to it, copying its SMILES in new rows
    new_rows = []
    
    for _, row in df.iterrows():
        atc_codes = row['ATC Codes']
        atc_codes_list = convert_string_list(atc_codes)
        
        if len(atc_codes_list) > 1:
            for code in atc_codes_list:
                if len(code) == 5:
                    new_row = row.copy()
                    new_row['ATC Codes'] = code
                    new_rows.append(new_row)
        else:
            if len(atc_codes_list[0]) == 5:
                new_rows.append(row)
    
    new_set = pd.DataFrame(new_rows)
    new_set = new_set.reset_index(drop=True)

    return new_set

In [None]:
seeds = [42, 123, 47899, 2025, 1, 20, 99, 1020, 345, 78] 
columns = [
    'Seed', 
    'Precision', 'Recall', 'F1',
    'Precision_level3', 'Recall_level3', 'F1_level3',
    'Precision_level2', 'Recall_level2', 'F1_level2',
    'Precision level 1', 'Precision level 2', 'Precision level 3', 'Precision level 4',
    'Recall level 1', 'Recall level 2', 'Recall level 3', 'Recall level 4',
    '#Compounds that have at least one match'
]

metrics_df = pd.DataFrame(columns=columns)

for seed in seeds:
    set_seeds(seed)
    
    train_set = pd.read_csv(f'../Datasets/Rep_complete_train_set{seed}.csv')
    test_set = pd.read_csv(f'../Datasets/Rep_test_set{seed}.csv')
    
    new_train_set = multiplicate_rows(train_set)
    new_test_set = multiplicate_rows(test_set)

    X_train = new_train_set['Neutralized SMILES']
    y_train = new_train_set['ATC Codes']
    X_test = new_test_set['Neutralized SMILES']
    X_test2 = test_set['Neutralized SMILES']
    y_test = new_test_set['ATC Codes']
    
    atc_nivel1 = []
    for y in y_train:
        atc_nivel1.append(y[0])
    y_train_nivel1 = pd.DataFrame(atc_nivel1)
    y_train_nivel1 = y_train_nivel1.reset_index(drop=True)

    atc_nivel2 = []
    for y in y_train:
        atc_nivel2.append(y[1:3])
    y_train_nivel2 = pd.DataFrame(atc_nivel2)
    y_train_nivel2 = y_train_nivel2.reset_index(drop=True)


    atc_nivel3 = []
    for y in y_train:
        atc_nivel3.append(y[3:4])
    y_train_nivel3 = pd.DataFrame(atc_nivel3)
    y_train_nivel3 = y_train_nivel3.reset_index(drop=True)

    atc_nivel4 = []
    for y in y_train:
        atc_nivel4.append(y[4:5])
    y_train_nivel4 = pd.DataFrame(atc_nivel4)
    y_train_nivel4 = y_train_nivel4.reset_index(drop=True)

    # Contar la frecuencia de cada elemento
    conteo1 = Counter(atc_nivel1)
    # Calcular la probabilidad de cada elemento
    total_elementos1 = len(atc_nivel1)
    probabilidades1 = {elemento: frecuencia / total_elementos1 for elemento, frecuencia in conteo1.items()}
    nivel1 = np.random.choice(list(probabilidades1.keys()), size=(len(X_test2),20), p=list(probabilidades1.values()))

    # Contar la frecuencia de cada elemento
    conteo2 = Counter(atc_nivel2)
    
    # Calcular la probabilidad de cada elemento
    total_elementos2 = len(atc_nivel2)
    probabilidades2 = {elemento: frecuencia / total_elementos2 for elemento, frecuencia in conteo2.items()}
    nivel2 = np.random.choice(list(probabilidades2.keys()), size=(len(X_test2),20), p=list(probabilidades2.values()))
        
    # Contar la frecuencia de cada elemento
    conteo3 = Counter(atc_nivel3)
    
    # Calcular la probabilidad de cada elemento
    total_elementos3 = len(atc_nivel3)
    probabilidades3 = {elemento: frecuencia / total_elementos3 for elemento, frecuencia in conteo3.items()}
    nivel3 = np.random.choice(list(probabilidades3.keys()), size=(len(X_test2),20), p=list(probabilidades3.values()))

    # Contar la frecuencia de cada elemento
    conteo4 = Counter(atc_nivel4)
    
    # Calcular la probabilidad de cada elemento
    total_elementos4 = len(atc_nivel4)
    probabilidades4 = {elemento: frecuencia / total_elementos4 for elemento, frecuencia in conteo4.items()}
    nivel4 = np.random.choice(list(probabilidades4.keys()), size=(len(X_test2),20), p=list(probabilidades4.values()))

    predictions = []
    for i, atc1 in enumerate(nivel1):
        codes = []
        for j, code1 in enumerate(atc1):
            codes.append(code1 + nivel2[i][j] + nivel3[i][j] + nivel4[i][j])
        predictions.append(codes)
    predictions_clean = []
    counter4_lessthan3 = 0
    for preds in predictions:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            if len(clean_pred) == 5:
                interm.append(clean_pred)
        if len(interm) >= 3:
            predictions_clean.append(interm[0:3])
        else:
            counter4_lessthan3 += 1
            predictions_clean.append(interm)
    print(f"The model predicted less than 3 ATC codes of level 4 for {counter4_lessthan3} compounds")                 
    predictions_clean_level3 = []
    counter3_lessthan3 = 0
    for preds in predictions:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            pred_3 = clean_pred[0:4]
            if len(pred_3) == 4 and pred_3 not in interm:
                interm.append(pred_3)
        if len(interm) >= 3:
            predictions_clean_level3.append(interm[0:3])
        else:
            counter3_lessthan3 += 1
            predictions_clean_level3.append(interm)
    print(f"The model predicted less than 3 ATC codes of level 3 for {counter3_lessthan3} compounds")       
    predictions_clean_level2 = []
    counter2_lessthan3 = 0
    for preds in predictions:
        interm = []
        for pred in preds:
            clean_pred = pred.replace('<START>', '').replace('<END>', '')
            pred_2 = clean_pred[0:3]
            if len(pred_2) == 3 and pred_2 not in interm:
                interm.append(pred_2)
        if len(interm) >= 3:
            predictions_clean_level2.append(interm[0:3])
        else:
            counter2_lessthan3 += 1
            predictions_clean_level2.append(interm)
    print(f"The model predicted less than 3 ATC codes of level 2 for {counter2_lessthan3} compounds")       
    precision_1, precision_2, precision_3, precision_4 = defined_metrics.precision(predictions_clean, f'../Datasets/Rep_test_set{seed}.csv', 'ATC Codes')
    recall_1, recall_2, recall_3, recall_4, counter_compound_match = defined_metrics.recall(predictions_clean, f'../Datasets/Rep_test_set{seed}.csv', 'ATC Codes')
    precisions, recalls, f1s = defined_metrics.complete_metrics(predictions_clean, f'../Datasets/Rep_test_set{seed}.csv', 'ATC Codes', 3)
    precisions_level3, recalls_level3, f1s_level3 = defined_metrics.complete_metrics_level3(predictions_clean_level3, f'../Datasets/Rep_test_set{seed}.csv', 'ATC Codes', 3)
    precisions_level2, recalls_level2, f1s_level2 = defined_metrics.complete_metrics_level2(predictions_clean_level2, f'../Datasets/Rep_test_set{seed}.csv', 'ATC Codes', 3)
    precisions_average = sum(precisions)/len(precisions)
    recalls_average = sum(recalls)/len(recalls)
    f1s_average = sum(f1s)/len(f1s)

    precisions_average_level3 = sum(precisions_level3)/len(precisions_level3)
    recalls_average_level3 = sum(recalls_level3)/len(recalls_level3)
    f1s_average_level3 = sum(f1s_level3)/len(f1s_level3)

    precisions_average_level2 = sum(precisions_level2)/len(precisions_level2)
    recalls_average_level2 = sum(recalls_level2)/len(recalls_level2)
    f1s_average_level2 = sum(f1s_level2)/len(f1s_level2)
    
    precisions_average_level3 = sum(precisions_level3)/len(precisions_level3)
    recalls_average_level3 = sum(recalls_level3)/len(recalls_level3)
    f1s_average_level3 = sum(f1s_level3)/len(f1s_level3)
        
    metrics = {
        'Precision': precisions_average, 
        'Recall': recalls_average,
        'F1': f1s_average,
        'Precision_level3': precisions_average_level3, 
        'Recall_level3': recalls_average_level3,
        'F1_level3': f1s_average_level3,
        'Precision_level2': precisions_average_level2, 
        'Recall_level2': recalls_average_level2,
        'F1_level2': f1s_average_level2,
        'Precision level 1': precision_1,
        'Precision level 2': precision_2,
        'Precision level 3': precision_3,
        'Precision level 4': precision_4,
        'Recall level 1': recall_1,
        'Recall level 2': recall_2,
        'Recall level 3': recall_3,
        'Recall level 4': recall_4,
        '#Compounds that have at least one match': counter_compound_match
    }
    
    
    # Build the row
    row = {
        'Seed': seed,
        **metrics
    }
    
    metrics_df = pd.concat([metrics_df, pd.DataFrame([row])], ignore_index=True)

metrics_df.to_csv("random_metrics.csv", index=False)
print("Mean:", metrics_df.mean(numeric_only=True))
print("Std:", metrics_df.std(numeric_only=True))

The model predicted less than 3 ATC codes of level 4 for 0 compounds
The model predicted less than 3 ATC codes of level 3 for 0 compounds
The model predicted less than 3 ATC codes of level 2 for 0 compounds


  metrics_df = pd.concat([metrics_df, pd.DataFrame([row])], ignore_index=True)


The model predicted less than 3 ATC codes of level 4 for 0 compounds
The model predicted less than 3 ATC codes of level 3 for 0 compounds
The model predicted less than 3 ATC codes of level 2 for 0 compounds
The model predicted less than 3 ATC codes of level 4 for 0 compounds
The model predicted less than 3 ATC codes of level 3 for 0 compounds
The model predicted less than 3 ATC codes of level 2 for 0 compounds
The model predicted less than 3 ATC codes of level 4 for 0 compounds
The model predicted less than 3 ATC codes of level 3 for 0 compounds
The model predicted less than 3 ATC codes of level 2 for 0 compounds
The model predicted less than 3 ATC codes of level 4 for 0 compounds
The model predicted less than 3 ATC codes of level 3 for 0 compounds
The model predicted less than 3 ATC codes of level 2 for 0 compounds
The model predicted less than 3 ATC codes of level 4 for 0 compounds
The model predicted less than 3 ATC codes of level 3 for 0 compounds
The model predicted less than 3 AT