In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
from seq2seq import *

import numpy as np
import pandas as pd
import random

from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split 

from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
def set_seeds(seed):
    """Seed the global random number generators used by this notebook.

    Both Python's ``random`` module and NumPy's global generator receive the
    same seed value.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    np.random.seed(seed)
# Convert a string that simulates a list to a real list
def convert_string_list(element):
    """Parse the text form of a list of quoted codes into a list of strings.

    The input is expected to look like ``"['A', 'B']"``: one enclosing pair
    of brackets, items separated by ``', '`` and each item wrapped in a single
    quote character on both sides.

    Args:
        element: String representation of the list.

    Returns:
        list: The codes without brackets or quotes. An empty list text
        (``"[]"``) yields ``[]``.
    """
    # Delete [] of the string
    inner = element[1:-1]
    # Nothing between the brackets means no codes. Without this guard the
    # split below would return one empty-string "code", which later breaks
    # callers that index into the code (e.g. ``code[0]``).
    if not inner.strip():
        return []
    # Split on the ', ' separator and delete the '' around each code
    return [code[1:-1] for code in inner.split(', ')]

In [3]:
# Fix one seed for the whole notebook: it seeds the `random` and NumPy global
# generators here and is reused below as the `random_state` of every forest.
seed = 78
set_seeds(seed)

In [4]:
# Search space for the random search below: each key is a keyword argument of
# RandomForestClassifier and each list holds the candidate values for it.
# Key order matters, because combinations are built from `.values()`.
hyperparameters_grid = dict(
    n_estimators=[50, 100, 500],
    max_depth=[None, 20, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=[None, 'balanced'],
)

In [5]:
def train_test_1(seed):
    """Load the train and validation splits for the ATC level 1 model.

    The validation file plays the role of the test set. Every label/metadata
    column except ``ATC_level1`` is removed, so the remaining non-target
    columns are the features.

    Args:
        seed: Not used by this function.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the ``y`` values
        are the raw ``ATC_level1`` columns (still list-like strings).
    """
    target = 'ATC_level1'
    unused_columns = [
        'Neutralized SMILES', 'ATC Codes', 'ATC_level2',
        'ATC_level3', 'ATC_level4', 'multiple_ATC',
    ]

    def load_split(path):
        # Read one split and strip the columns this level does not need
        frame = pd.read_csv(path)
        return frame.drop(columns=unused_columns).reset_index(drop=True)

    train_set = load_split('../../Data/train_set.csv')
    test_set = load_split('../../Data/val_set.csv')

    # Divide in X and y
    return (
        train_set.drop(columns=target),
        train_set[target],
        test_set.drop(columns=target),
        test_set[target],
    )
def train_test_2(seed):
    """Load the train and validation splits for the ATC level 2 model.

    Training rows are expanded in two passes: first one row per level 1 code
    of the compound, then one row per level 2 code that starts with that
    row's level 1 letter. The training target is the level 2 code with the
    leading level 1 letter removed. The validation split (used as test set)
    is not expanded.

    Args:
        seed: Not used by this function.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. ``X_train`` keeps a
        single-code ``ATC_level1`` column; ``X_test`` and ``y_test`` keep the
        raw list-like strings read from the file.
    """
    unused_columns = [
        'Neutralized SMILES', 'ATC Codes', 'ATC_level3',
        'ATC_level4', 'multiple_ATC',
    ]

    def load_split(path):
        # Read one split and strip the columns this level does not need
        frame = pd.read_csv(path)
        return frame.drop(columns=unused_columns).reset_index(drop=True)

    def with_value(row, column, value):
        # Copy of ``row`` in which a single field has been replaced
        updated = row.copy()
        updated[column] = value
        return updated

    train_set = load_split('../../Data/train_set.csv')
    test_set = load_split('../../Data/val_set.csv')

    # Replicate compounds that have more than 1 ATC level 1 code
    per_level1 = pd.DataFrame([
        with_value(row, 'ATC_level1', code)
        for _, row in train_set.iterrows()
        for code in convert_string_list(row['ATC_level1'])
    ]).reset_index(drop=True)

    # Keep the level 2 codes that belong to the row's level 1 letter and
    # strip that letter from the target
    per_level2 = pd.DataFrame([
        with_value(row, 'ATC_level2', code[1:])
        for _, row in per_level1.iterrows()
        for code in convert_string_list(row['ATC_level2'])
        if code[0] == row['ATC_level1']
    ]).reset_index(drop=True)

    return (
        per_level2.drop(columns='ATC_level2'),
        per_level2['ATC_level2'],
        test_set.drop(columns='ATC_level2'),
        test_set['ATC_level2'],
    )
def train_test_3(seed):
    """Load the train and validation splits for the ATC level 3 model.

    Training rows are expanded in two passes: first one row per level 2 code
    of the compound, then one row per level 3 code whose first three
    characters equal that row's level 2 code. The training target is the
    level 3 code with those three leading characters removed. The validation
    split (used as test set) is not expanded.

    Args:
        seed: Not used by this function.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. ``X_train`` keeps a
        single-code ``ATC_level2`` column; ``X_test`` and ``y_test`` keep the
        raw list-like strings read from the file.
    """
    unused_columns = [
        'Neutralized SMILES', 'ATC Codes', 'ATC_level1',
        'ATC_level4', 'multiple_ATC',
    ]

    def load_split(path):
        # Read one split and strip the columns this level does not need
        frame = pd.read_csv(path)
        return frame.drop(columns=unused_columns).reset_index(drop=True)

    def with_value(row, column, value):
        # Copy of ``row`` in which a single field has been replaced
        updated = row.copy()
        updated[column] = value
        return updated

    train_set = load_split('../../Data/train_set.csv')
    test_set = load_split('../../Data/val_set.csv')

    # Replicate compounds that have more than 1 ATC level 2 code
    per_level2 = pd.DataFrame([
        with_value(row, 'ATC_level2', code)
        for _, row in train_set.iterrows()
        for code in convert_string_list(row['ATC_level2'])
    ]).reset_index(drop=True)

    # Keep the level 3 codes that extend the row's level 2 code and strip
    # that three-character prefix from the target
    per_level3 = pd.DataFrame([
        with_value(row, 'ATC_level3', code[3:])
        for _, row in per_level2.iterrows()
        for code in convert_string_list(row['ATC_level3'])
        if code[0:3] == row['ATC_level2']
    ]).reset_index(drop=True)

    return (
        per_level3.drop(columns='ATC_level3'),
        per_level3['ATC_level3'],
        test_set.drop(columns='ATC_level3'),
        test_set['ATC_level3'],
    )
def train_test_4(seed):
    """Load the train and validation splits for the ATC level 4 model.

    Training rows are expanded in two passes: first one row per level 3 code
    of the compound, then one row per level 4 code whose first four
    characters equal that row's level 3 code. Codes with nothing after those
    four characters are skipped. The training target is the remainder of the
    level 4 code after the four-character prefix. The validation split (used
    as test set) is not expanded.

    Args:
        seed: Not used by this function.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. ``X_train`` keeps a
        single-code ``ATC_level3`` column; ``X_test`` and ``y_test`` keep the
        raw list-like strings read from the file.
    """
    unused_columns = [
        'Neutralized SMILES', 'ATC Codes', 'ATC_level1',
        'ATC_level2', 'multiple_ATC',
    ]

    def load_split(path):
        # Read one split and strip the columns this level does not need
        frame = pd.read_csv(path)
        return frame.drop(columns=unused_columns).reset_index(drop=True)

    def with_value(row, column, value):
        # Copy of ``row`` in which a single field has been replaced
        updated = row.copy()
        updated[column] = value
        return updated

    train_set = load_split('../../Data/train_set.csv')
    test_set = load_split('../../Data/val_set.csv')

    # Replicate compounds that have more than 1 ATC level 3 code
    per_level3 = pd.DataFrame([
        with_value(row, 'ATC_level3', code)
        for _, row in train_set.iterrows()
        for code in convert_string_list(row['ATC_level3'])
    ]).reset_index(drop=True)

    # Keep the level 4 codes that extend the row's level 3 code, drop the
    # ones with an empty level 4 part, and strip the four-character prefix
    per_level4 = pd.DataFrame([
        with_value(row, 'ATC_level4', code[4:])
        for _, row in per_level3.iterrows()
        for code in convert_string_list(row['ATC_level4'])
        if code[0:4] == row['ATC_level3'] and code[4:] != ''
    ]).reset_index(drop=True)

    return (
        per_level4.drop(columns='ATC_level4'),
        per_level4['ATC_level4'],
        test_set.drop(columns='ATC_level4'),
        test_set['ATC_level4'],
    )

In [6]:
def test_set_level1(X_test):
    X_test = np.asarray(X_test).astype(np.float32)
    X_test[pd.isna(X_test)] = np.nanmedian(X_test)
    X_test = scaler1.transform(X_test)
    return X_test
def test_set_level2(X_test1, pred_df_level1):
    X_test1.drop(labels=['ATC_level1'], axis="columns", inplace=True)
    X_test1['ATC_level1'] = pred_df_level1['pred_1']
    ATC_level1 = X_test1['ATC_level1']
    categorical_atc = atc_level1_labels_encoder2.transform(ATC_level1)
    df_level1 = pd.DataFrame(categorical_atc, columns=atc_level1_labels2)
    X_test1 = pd.concat([X_test1, df_level1], axis = 1)
    X_test1.drop(labels=['ATC_level1'], axis="columns", inplace=True)

    X_test1 = np.asarray(X_test1).astype(np.float32)
    X_test1[pd.isna(X_test1)] = np.nanmedian(X_test1)
    X_test1 = scaler2.transform(X_test1)

    return X_test1
def test_set_level3(X_test1, pred_df_level1, pred_df_level2):
    predicted_codes = []
    
    for index, pred1 in enumerate(pred_df_level1['pred_1']):
        pred2 = str(pred_df_level2.at[pred_df_level2.index[index], 'pred_2']).zfill(2)
        prediction = pred1 + '' + pred2
        predicted_codes.append(prediction)
        
    X_test1['ATC_level2'] = predicted_codes
    
    ATC_level22 = X_test1['ATC_level2']
    ATC_level1_3 = ATC_level22.copy()
    ATC_level2_3 = ATC_level22.copy()
    for index, atc in enumerate(ATC_level22):
        ATC_level1_3[index] = []
        ATC_level1_3[index].append(atc[0:1])
        ATC_level2_3[index] = []
        ATC_level2_3[index].append(atc[1:3])

    X_test1 = X_test1.drop(labels=['ATC_level2'], axis="columns")
    
    categorical_atc1_3 = atc_level1_labels_encoder3.transform(ATC_level1_3)
    categorical_atc2_3 = atc_level2_labels_encoder3.transform(ATC_level2_3)
    df_level1_3 = pd.DataFrame(categorical_atc1_3, columns=atc_level1_labels3)
    df_level2_3 = pd.DataFrame(categorical_atc2_3, columns=atc_level2_labels3)
    X_test1 = pd.concat([X_test1, df_level1_3, df_level2_3], axis = 1)                         
    
    X_test1 = np.asarray(X_test1).astype(np.float32)
    X_test1[pd.isna(X_test1)] = np.nanmedian(X_test1)
    X_test1 = scaler3.transform(X_test1)

    return X_test1
def test_set_level4(X_test1, pred_df_level1, pred_df_level2, pred_df_level3):
    predicted_codes = []
    
    for index, pred1 in enumerate(pred_df_level1['pred_1']):
        pred2 = str(pred_df_level2.at[pred_df_level2.index[index], 'pred_2']).zfill(2)
        pred3 = pred_df_level3.at[pred_df_level3.index[index], 'pred_3']
        prediction = pred1 + '' + pred2 + '' + pred3
        predicted_codes.append(prediction)
        
    X_test1['ATC_level3'] = predicted_codes
    
    ATC_level33 = X_test1['ATC_level3']
    ATC_level1_4 = ATC_level33.copy()
    ATC_level2_4 = ATC_level33.copy()
    ATC_level3_4 = ATC_level33.copy()
    for index, atc in enumerate(ATC_level33):
        ATC_level1_4[index] = []
        ATC_level1_4[index].append(atc[0:1])
        ATC_level2_4[index] = []
        ATC_level2_4[index].append(atc[1:3])
        ATC_level3_4[index] = []
        ATC_level3_4[index].append(atc[3:4])

    X_test1 = X_test1.drop(labels=['ATC_level3'], axis="columns")

    categorical_atc1_4 = atc_level1_labels_encoder4.transform(ATC_level1_4)
    categorical_atc2_4 = atc_level2_labels_encoder4.transform(ATC_level2_4)
    categorical_atc3_4 = atc_level3_labels_encoder4.transform(ATC_level3_4)
    df_level1_4 = pd.DataFrame(categorical_atc1_4, columns=atc_level1_labels4)
    df_level2_4 = pd.DataFrame(categorical_atc2_4, columns=atc_level2_labels4)
    df_level3_4 = pd.DataFrame(categorical_atc3_4, columns=atc_level3_labels4)
    X_test1 = pd.concat([X_test1, df_level1_4, df_level2_4, df_level3_4], axis = 1)                         
   
    X_test1 = np.asarray(X_test1).astype(np.float32)
    X_test1[pd.isna(X_test1)] = np.nanmedian(X_test1)
    X_test1 = scaler4.transform(X_test1)

    return X_test1

In [7]:
def generate_predictions(model1, X_test1, model2, X_test2, model3, X_test3, model4, X_test4, previous_predictions = None, index = None):
    """Sample predictions for all four ATC levels, or resample a single row.

    The levels are processed in order; the features of each level are built
    from the predictions of the levels above it. Within a level, one class is
    drawn per row with ``random.choices``, weighted by column 1 of each
    output's ``predict_proba`` array.

    Relies on the module-level encoders ``mlb``, ``y_labels_encoder2``,
    ``y_labels_encoder3`` and ``y_labels_encoder4`` and on the
    ``test_set_level1`` .. ``test_set_level4`` feature builders.

    Args:
        model1, model2, model3, model4: Fitted classifiers, one per level.
        X_test1, X_test2, X_test3, X_test4: Test features, one per level.
        previous_predictions: Optional DataFrame with columns ``pred_1`` ..
            ``pred_4`` from an earlier call. When given, those predictions
            are kept and only row ``index`` is resampled.
        index: Row to resample when ``previous_predictions`` is given. With
            ``None`` the earlier predictions are returned unchanged.

    Returns:
        tuple: Four one-column DataFrames (``pred_1``, ``pred_2``,
        ``pred_3``, ``pred_4``).
    """

    def sample_prediction(model, X_test, encoder, previous_predictions = None, index = None):
        """Draw one class per row (or for one row) weighted by probability."""
        if previous_predictions is None:
            # First pass: sample a prediction for every row
            y_prob = model.predict_proba(X_test)
            y_prob_matrix = np.array(y_prob)[:, :, 1].T
            return [
                random.choices(encoder.classes_, weights=row, k=1)[0] for row in y_prob_matrix
            ]

        # Retry: keep the earlier predictions and change only the given row
        predictions = previous_predictions.copy()
        if index is not None:
            # Only that row's probabilities are used, so only that row is
            # scored instead of the whole test set
            y_prob = model.predict_proba(np.asarray(X_test)[[index]])
            row = np.array(y_prob)[:, :, 1].T[0]
            predictions[index] = random.choices(encoder.classes_, weights=row, k=1)[0]
        return predictions

    def earlier(column):
        # Predictions of one level from the previous pass, if there was one
        return previous_predictions[column] if previous_predictions is not None else None

    # Level 1
    X_test1 = test_set_level1(X_test1)
    pred_1 = sample_prediction(model1, X_test1, mlb, earlier('pred_1'), index)
    pred_df_level1 = pd.DataFrame(pred_1, columns=['pred_1'])

    # Level 2
    X_test2 = test_set_level2(X_test2, pred_df_level1)
    pred_2 = sample_prediction(model2, X_test2, y_labels_encoder2, earlier('pred_2'), index)
    pred_df_level2 = pd.DataFrame(pred_2, columns=['pred_2'])

    # Level 3
    X_test3 = test_set_level3(X_test3, pred_df_level1, pred_df_level2)
    pred_3 = sample_prediction(model3, X_test3, y_labels_encoder3, earlier('pred_3'), index)
    pred_df_level3 = pd.DataFrame(pred_3, columns=['pred_3'])

    # Level 4
    X_test4 = test_set_level4(X_test4, pred_df_level1, pred_df_level2, pred_df_level3)
    pred_4 = sample_prediction(model4, X_test4, y_labels_encoder4, earlier('pred_4'), index)
    pred_df_level4 = pd.DataFrame(pred_4, columns=['pred_4'])

    return pred_df_level1, pred_df_level2, pred_df_level3, pred_df_level4

In [8]:
# Sample several distinct full ATC codes per test compound
def random_predictions(model1, X_test1, model2, X_test2, model3, X_test3, model4, X_test4, n_rounds = 3, max_attempts = 30):
    """Collect up to ``n_rounds`` distinct sampled ATC codes per test row.

    Each round samples one prediction per level for every row and joins them
    into a full code (level 1 + level 2 zero-padded to two characters +
    level 3 + level 4). If the code of a row was already collected, that row
    is resampled until a new code appears or ``max_attempts`` resamplings
    have been made; in the latter case nothing is added for that row in that
    round.

    Args:
        model1, model2, model3, model4: Fitted classifiers, one per level.
        X_test1, X_test2, X_test3, X_test4: Test features, one per level.
        n_rounds: Number of sampling rounds, i.e. the maximum number of
            codes collected per row. Defaults to 3.
        max_attempts: Maximum number of resamplings per row and round.
            Defaults to 30.

    Returns:
        list: One list of distinct code strings per row of ``X_test1``.
    """
    final_predictions = [[] for _ in range(len(X_test1))]
    for _ in range(n_rounds):
        pred_df_level1, pred_df_level2, pred_df_level3, pred_df_level4 = generate_predictions(model1, X_test1, model2, X_test2, model3, X_test3, model4, X_test4)
        for index in range(len(pred_df_level1)):
            attempts = 0
            while attempts < max_attempts:
                prediction = (
                    pred_df_level1.at[index, 'pred_1']
                    + str(pred_df_level2.at[index, "pred_2"]).zfill(2)
                    + pred_df_level3.at[index, "pred_3"]
                    + pred_df_level4.at[index, "pred_4"]
                )

                if prediction not in final_predictions[index]:
                    final_predictions[index].append(prediction)
                    break  # A new code was found for this row

                previous_predictions = pd.concat([pred_df_level1, pred_df_level2, pred_df_level3, pred_df_level4], axis = 1)
                # Redo the full four-level classification for this row only
                pred_df_level1, pred_df_level2, pred_df_level3, pred_df_level4 = generate_predictions(model1, X_test1, model2, X_test2, model3, X_test3, model4, X_test4, previous_predictions, index)
                attempts += 1
    return final_predictions

In [9]:
import contextlib
import itertools

# Every combination of the grid values; used only to cap the number of runs
combinations = list(itertools.product(*hyperparameters_grid.values()))
# Evaluate at most 200 distinct hyperparameter combinations
max_evals = min(len(combinations), 200)
tested_params = set()
df_tests = pd.DataFrame(columns = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'class_weight', 'Precision nivel1', 'Precision nivel2', 'Precision nivel3', 'Precision nivel4', 'Recall nivel1', 'Recall nivel2', 'Recall nivel3', 'Recall nivel4', 'Drugs that have at least one match'], index = list(range(max_evals)))
# Everything printed during the search goes to log.txt. The context managers
# close the file and put back the stream that was active before (the
# notebook's own stdout), also when the loop raises.
with open('log.txt', 'w') as log_file, contextlib.redirect_stdout(log_file):
    for comb in range(max_evals):
        # Draw random combinations until one that has not been tested appears
        while True:
            random_params = {k: random.sample(v, 1)[0] for k, v in hyperparameters_grid.items()}
            params_tuple = tuple(random_params.values())
            if params_tuple not in tested_params:
                tested_params.add(params_tuple)
                break
        X_train1, y_train1, X_test1, y_test1 = train_test_1(seed)
        X_train2, y_train2, X_test2, y_test2 = train_test_2(seed)
        X_train3, y_train3, X_test3, y_test3 = train_test_3(seed)
        X_train4, y_train4, X_test4, y_test4 = train_test_4(seed)
        # LEVEL1
        # Get all available labels describing the level 1 ATC code
        # (taken from both the training and the validation targets)
        labels1 = set()
        for lista in y_train1:
            lista = convert_string_list(lista)
            for code in lista:
                labels1.add(code)
        for lista in y_test1:
            lista = convert_string_list(lista)
            for code in lista:
                labels1.add(code)

        labels1 = sorted(list(labels1))
        mlb = MultiLabelBinarizer()
        mlb.fit([labels1])
        # Turn every target string into a real list of codes before encoding
        y_new = y_train1.copy()
        for index, lista in enumerate(y_train1):
            y_new[index] = []
            lista = convert_string_list(lista)
            for i, label in enumerate(lista):
                y_new[index].append(lista[i])
        y_categorical1 = mlb.transform(y_new)

        X_train1 = np.asarray(X_train1).astype(np.float32)
        y_categorical1 = np.asarray(y_categorical1).astype(np.float32)
        # Replace NaN values with the median of the whole feature matrix
        X_train1[pd.isna(X_train1)] = np.nanmedian(X_train1)
        # Define an instance of the MinMaxScaler
        scaler1 = MinMaxScaler()
        # Fit the scaler to the data and transform it
        X_train1 = scaler1.fit_transform(X_train1)

        rf1 = RandomForestClassifier(n_estimators=random_params['n_estimators'], max_depth=random_params['max_depth'], min_samples_split=random_params['min_samples_split'], min_samples_leaf=random_params['min_samples_leaf'], class_weight = random_params['class_weight'], random_state=seed)
        # Train the model
        rf1.fit(X_train1, y_categorical1)

        #LEVEL 2
        # Get all level 2 target labels present in the training set
        labels2 = set()
        for code in y_train2:
            labels2.add(code)

        labels2 = sorted(list(labels2))
        y_labels_encoder2 = MultiLabelBinarizer()
        y_labels_encoder2.fit([labels2])
        encoded_y_train2 = y_labels_encoder2.transform(y_train2.values.reshape(-1, 1))
        # Level 1 letters seen in training, used as extra indicator features
        atc_level1_labels2 = set()
        for _, row in X_train2.iterrows():
            atc_level1_labels2.add(row['ATC_level1'])

        atc_level1_labels2 = sorted(list(atc_level1_labels2))
        atc_level1_labels_encoder2 = MultiLabelBinarizer()
        atc_level1_labels_encoder2.fit([atc_level1_labels2])
        ATC_level11 = X_train2['ATC_level1']
        ATC_level1_2 = ATC_level11.copy()
        for index, lista in enumerate(ATC_level11):
            ATC_level1_2[index] = []
            ATC_level1_2[index].append(lista)
        categorical_atc2 = atc_level1_labels_encoder2.transform(ATC_level1_2)
        X_train2.drop(labels=['ATC_level1'], axis="columns", inplace=True)
        df_level1_2 = pd.DataFrame(categorical_atc2, columns=atc_level1_labels2)
        X_train2 = pd.concat([X_train2, df_level1_2], axis = 1)
        X_train2 = np.asarray(X_train2).astype(np.float32)
        encoded_y_train2 = np.asarray(encoded_y_train2).astype(np.float32)
        # Replace NaN values with the median of the whole feature matrix
        X_train2[pd.isna(X_train2)] = np.nanmedian(X_train2)
        # Define an instance of the MinMaxScaler
        scaler2 = MinMaxScaler()
        # Fit the scaler to the data and transform it
        X_train2 = scaler2.fit_transform(X_train2)

        rf2 = RandomForestClassifier(n_estimators=random_params['n_estimators'], max_depth=random_params['max_depth'], min_samples_split=random_params['min_samples_split'], min_samples_leaf=random_params['min_samples_leaf'], class_weight = random_params['class_weight'], random_state=seed)
        # Train the model
        rf2.fit(X_train2, encoded_y_train2)

        #LEVEL 3
        # Get all level 3 target labels present in the training set
        labels3 = set()
        for code in y_train3:
            labels3.add(code)

        labels3 = sorted(list(labels3))
        y_labels_encoder3 = MultiLabelBinarizer()
        y_labels_encoder3.fit([labels3])
        encoded_y_train3 = y_labels_encoder3.transform(y_train3.values.reshape(-1, 1))

        # Level 1 letters and level 2 digits seen in training or validation,
        # used as extra indicator features
        atc_level1_labels3 = set()
        atc_level2_labels3 = set()
        for _, row in X_train3.iterrows():
            atc_level1_labels3.add(row['ATC_level2'][0:1])
            atc_level2_labels3.add(row['ATC_level2'][1:3])
        for _, row in X_test3.iterrows():
            lista = convert_string_list(row['ATC_level2'])
            for code in lista:
                atc_level1_labels3.add(code[0:1])
                atc_level2_labels3.add(code[1:3])

        atc_level1_labels3 = sorted(list(atc_level1_labels3))
        atc_level2_labels3 = sorted(list(atc_level2_labels3))
        atc_level1_labels_encoder3 = MultiLabelBinarizer()
        atc_level1_labels_encoder3.fit([atc_level1_labels3])
        atc_level2_labels_encoder3 = MultiLabelBinarizer()
        atc_level2_labels_encoder3.fit([atc_level2_labels3])
        ATC_level22 = X_train3['ATC_level2']
        ATC_level1_3 = ATC_level22.copy()
        ATC_level2_3 = ATC_level22.copy()
        for index, lista in enumerate(ATC_level22):
            ATC_level1_3[index] = []
            ATC_level1_3[index].append(lista[0:1])
        for index, lista in enumerate(ATC_level22):
            ATC_level2_3[index] = []
            ATC_level2_3[index].append(lista[1:3])
        X_train3.drop(labels=['ATC_level2'], axis="columns", inplace=True)
        categorical_atc1_3 = atc_level1_labels_encoder3.transform(ATC_level1_3)
        categorical_atc2_3 = atc_level2_labels_encoder3.transform(ATC_level2_3)
        df_level1_3 = pd.DataFrame(categorical_atc1_3, columns=atc_level1_labels3)
        df_level2_3 = pd.DataFrame(categorical_atc2_3, columns=atc_level2_labels3)
        X_train3 = pd.concat([X_train3, df_level1_3, df_level2_3], axis = 1)
        X_train3 = np.asarray(X_train3).astype(np.float32)
        encoded_y_train3 = np.asarray(encoded_y_train3).astype(np.float32)
        # Replace NaN values with the median of the whole feature matrix
        X_train3[pd.isna(X_train3)] = np.nanmedian(X_train3)
        # Define an instance of the MinMaxScaler
        scaler3 = MinMaxScaler()
        # Fit the scaler to the data and transform it
        X_train3 = scaler3.fit_transform(X_train3)

        rf3 = RandomForestClassifier(n_estimators=random_params['n_estimators'], max_depth=random_params['max_depth'], min_samples_split=random_params['min_samples_split'], min_samples_leaf=random_params['min_samples_leaf'], class_weight = random_params['class_weight'], random_state=seed)
        # Train the model
        rf3.fit(X_train3, encoded_y_train3)
        #LEVEL 4
        # Get all level 4 target labels present in the training set
        labels4 = set()
        for code in y_train4:
            labels4.add(code)

        labels4 = sorted(list(labels4))
        y_labels_encoder4 = MultiLabelBinarizer()
        y_labels_encoder4.fit([labels4])
        encoded_y_train4 = y_labels_encoder4.transform(y_train4.values.reshape(-1, 1))

        # Level 1 letters, level 2 digits and level 3 letters seen in training
        # or validation, used as extra indicator features
        atc_level1_labels4 = set()
        atc_level2_labels4 = set()
        atc_level3_labels4 = set()
        for _, row in X_train4.iterrows():
            atc_level1_labels4.add(row['ATC_level3'][0:1])
            atc_level2_labels4.add(row['ATC_level3'][1:3])
            atc_level3_labels4.add(row['ATC_level3'][3:4])
        for _, row in X_test4.iterrows():
            lista = convert_string_list(row['ATC_level3'])
            for code in lista:
                atc_level1_labels4.add(code[0:1])
                atc_level2_labels4.add(code[1:3])
                atc_level3_labels4.add(code[3:4])

        atc_level1_labels4 = sorted(list(atc_level1_labels4))
        atc_level2_labels4 = sorted(list(atc_level2_labels4))
        atc_level3_labels4 = sorted(list(atc_level3_labels4))
        atc_level1_labels_encoder4 = MultiLabelBinarizer()
        atc_level1_labels_encoder4.fit([atc_level1_labels4])
        atc_level2_labels_encoder4 = MultiLabelBinarizer()
        atc_level2_labels_encoder4.fit([atc_level2_labels4])
        atc_level3_labels_encoder4 = MultiLabelBinarizer()
        atc_level3_labels_encoder4.fit([atc_level3_labels4])
        ATC_level33 = X_train4['ATC_level3']
        ATC_level1_4 = ATC_level33.copy()
        for index, lista in enumerate(ATC_level33):
            ATC_level1_4[index] = []
            ATC_level1_4[index].append(lista[0:1])
        ATC_level2_4 = ATC_level33.copy()
        for index, lista in enumerate(ATC_level33):
            ATC_level2_4[index] = []
            ATC_level2_4[index].append(lista[1:3])
        ATC_level3_4 = ATC_level33.copy()
        for index, lista in enumerate(ATC_level33):
            ATC_level3_4[index] = []
            ATC_level3_4[index].append(lista[3:4])
        X_train4.drop(labels=['ATC_level3'], axis="columns", inplace=True)
        categorical_atc1_4 = atc_level1_labels_encoder4.transform(ATC_level1_4)
        categorical_atc2_4 = atc_level2_labels_encoder4.transform(ATC_level2_4)
        categorical_atc3_4 = atc_level3_labels_encoder4.transform(ATC_level3_4)
        df_level1_4 = pd.DataFrame(categorical_atc1_4, columns=atc_level1_labels4)
        df_level2_4 = pd.DataFrame(categorical_atc2_4, columns=atc_level2_labels4)
        df_level3_4 = pd.DataFrame(categorical_atc3_4, columns=atc_level3_labels4)
        X_train4 = pd.concat([X_train4, df_level1_4, df_level2_4, df_level3_4], axis = 1)
        X_train4 = np.asarray(X_train4).astype(np.float32)
        encoded_y_train4 = np.asarray(encoded_y_train4).astype(np.float32)
        # Replace NaN values with the median of the whole feature matrix
        X_train4[pd.isna(X_train4)] = np.nanmedian(X_train4)
        # Define an instance of the MinMaxScaler
        scaler4 = MinMaxScaler()
        # Fit the scaler to the data and transform it
        X_train4 = scaler4.fit_transform(X_train4)

        rf4 = RandomForestClassifier(n_estimators=random_params['n_estimators'], max_depth=random_params['max_depth'], min_samples_split=random_params['min_samples_split'], min_samples_leaf=random_params['min_samples_leaf'], class_weight = random_params['class_weight'], random_state=seed)

        rf4.fit(X_train4, encoded_y_train4)

        #TEST
        # The true parent codes are not model inputs at prediction time; the
        # feature builders derive them from the predictions of upper levels
        X_test3.drop(labels=['ATC_level2'], axis="columns", inplace=True)
        X_test4.drop(labels=['ATC_level3'], axis="columns", inplace = True)

        output = random_predictions(rf1, X_test1, rf2, X_test2, rf3, X_test3, rf4, X_test4)
        # Keep only the sampled codes that are exactly five characters long
        predictions = []
        for preds in output:
            interm = []
            for pred in preds:
                clean_pred = pred.replace('<START>', '').replace('<END>', '')
                if len(clean_pred) == 5:
                    interm.append(clean_pred)
            predictions.append(interm)

        precision_1, precision_2, precision_3, precision_4 = defined_metrics.precision(predictions, "../../Data/val_set.csv", 'ATC Codes')
        recall_1, recall_2, recall_3, recall_4, comp = defined_metrics.recall(predictions, "../../Data/val_set.csv", 'ATC Codes')
        # All values are stored as strings; the CSV is rewritten after every
        # run so partial results survive an interruption
        df_tests.iloc[comb, :] = [f"{random_params['n_estimators']}", f"{random_params['max_depth']}", f"{random_params['min_samples_split']}", f"{random_params['min_samples_leaf']}", f"{random_params['class_weight']}", f"{precision_1}", f"{precision_2}", f"{precision_3}", f"{precision_4}", f"{recall_1}", f"{recall_2}", f"{recall_3}", f"{recall_4}", f"{comp}"]
        df_tests.to_csv("randomforest_results.csv", index = False)

In [11]:
# Show the search results ordered by level 1 precision, lowest first.
# NOTE(review): the search loop stores the metrics as strings, so unless
# df_tests was reloaded from the CSV this ordering is lexicographic - confirm.
df_tests.sort_values(by="Precision nivel1", ascending=True)

Unnamed: 0,n_estimators,max_depth,min_samples_split,min_samples_leaf,class_weight,Precision nivel1,Precision nivel2,Precision nivel3,Precision nivel4,Recall nivel1,Recall nivel2,Recall nivel3,Recall nivel4,Drugs that have at least one match
142,50,20,10,4,balanced,0.20327304048234301,0.37058823529411755,0.3917748917748918,0.2222222222222222,0.40766580534022395,0.4211111111111111,0.42207792207792205,0.2222222222222222,"[170, 77, 36, 8]"
145,500,20,10,4,balanced,0.21533161068044765,0.40037243947858475,0.39837398373983735,0.3108108108108108,0.424978466838932,0.42737430167597756,0.4146341463414634,0.35135135135135137,"[179, 82, 37, 13]"
82,50,20,2,4,balanced,0.22049956933677853,0.42602495543672014,0.46099290780141844,0.26595744680851063,0.43996554694229123,0.4746286393345217,0.4485815602836879,0.2872340425531915,"[187, 94, 47, 14]"
57,100,20,5,4,balanced,0.22049956933677858,0.393854748603352,0.5325670498084292,0.20588235294117646,0.42693080677576795,0.44674115456238356,0.5680076628352491,0.23529411764705882,"[179, 87, 51, 12]"
113,500,20,2,4,balanced,0.22222222222222218,0.39336917562724016,0.4578544061302681,0.34848484848484845,0.4368791271892047,0.43201911589008357,0.47701149425287354,0.3977272727272727,"[186, 87, 44, 18]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,50,,2,1,,0.40051679586563316,0.5076335877862597,0.6212121212121213,0.6209677419354839,0.6306345104794718,0.6056509754028837,0.7110101010101011,0.7123655913978494,"[262, 165, 124, 91]"
144,500,40,2,1,,0.40654608096468564,0.5426208651399492,0.6245098039215685,0.5374677002583979,0.642736146999713,0.6147158608990669,0.7157843137254902,0.6589147286821704,"[262, 170, 129, 87]"
76,50,40,2,1,,0.4112833763996557,0.4987468671679199,0.6807228915662651,0.5751879699248121,0.6537898363479758,0.5938178780284044,0.7595381526104418,0.650375939849624,"[266, 166, 133, 90]"
109,100,20,2,1,,0.4194659776055123,0.5082382762991129,0.6734892787524366,0.5259259259259259,0.6411714039621017,0.6269961977186312,0.7543859649122807,0.6308641975308642,"[263, 171, 135, 89]"


In [12]:
# Save the results ordered by level 1 precision (ascending), without the index
(
    df_tests
    .sort_values(by="Precision nivel1")
    .to_csv("randomforest_sortedresults.csv", index=False)
)