In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../..')))
from seq2seq import *
import pandas as pd
import numpy as np
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import Counter

In [2]:
def convert_string_list(element):
    """Split a '; '-separated string of codes into a list of codes.

    The string is split on the exact two-character separator '; '.
    Nothing else is removed: brackets, quotes or surrounding whitespace,
    if present in the input, stay part of the returned codes.

    Parameters
    ----------
    element : str
        Codes joined by '; ', e.g. 'A01; B02'.

    Returns
    -------
    list of str
        One entry per code. A string that does not contain the separator
        (including the empty string) yields a single-element list.
    """
    return element.split('; ')

In [3]:
df = pd.read_csv('../../../Data/splittedATC.csv')

In [4]:
X = df['Neutralized SMILES']
y = df['ATC Codes']

In [5]:
def set_seeds(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode through the corresponding ``torch.backends.cudnn`` flags.

    Parameters
    ----------
    seed : int
        Seed value passed to every generator.
    """
    # Imported locally because this function is the only place that needs
    # the standard-library ``random`` module.
    import random

    # Python's built-in generator is separate from NumPy's and PyTorch's,
    # so it has to be seeded explicitly as well.
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def create_partitions(df, seed, test_size=0.2, val_size=0.15):
    """Split a dataset into shuffled train, validation and test sets.

    Rows are first divided into two groups: compounds whose 'ATC Codes'
    entry holds more than one code and compounds with a single code. Each
    group is split separately with the same fractions, and the matching
    parts of both groups are then concatenated and shuffled.

    The input DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'ATC Codes' column of '; '-separated code strings.
    seed : int
        Random state used for every split and every shuffle.
    test_size : float, default 0.2
        Fraction of each group held out as the test set.
    val_size : float, default 0.15
        Fraction of the rows remaining after the test split that is held
        out as the validation set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, val_set, test_set)``. Each frame has the columns of
        ``df`` plus the boolean helper column 'multiple_ATC'. The index is
        not reset after concatenation, so index labels may repeat.
    """
    # Flag compounds with more than one ATC code. ``assign`` returns a new
    # frame, so the caller's DataFrame does not gain the helper column.
    has_multiple = df['ATC Codes'].apply(lambda codes: len(convert_string_list(codes)) > 1)
    flagged = df.assign(multiple_ATC=has_multiple)

    multi_code = flagged[flagged['multiple_ATC']].reset_index(drop=True)
    single_code = flagged[~flagged['multiple_ATC']].reset_index(drop=True)

    # Split each group on its own so both are represented in every subset.
    # The multi-code group goes first to keep the concatenation order stable.
    train_parts, val_parts, test_parts = [], [], []
    for group in (multi_code, single_code):
        remainder, test_part = train_test_split(group, test_size=test_size, random_state=seed)
        train_part, val_part = train_test_split(remainder, test_size=val_size, random_state=seed)
        train_parts.append(train_part)
        val_parts.append(val_part)
        test_parts.append(test_part)

    # Combine the two groups and shuffle so they are interleaved
    train_set = shuffle(pd.concat(train_parts), random_state=seed)
    test_set = shuffle(pd.concat(test_parts), random_state=seed)
    val_set = shuffle(pd.concat(val_parts), random_state=seed)
    return train_set, val_set, test_set

In [6]:
# Seeds for which a separate train/validation/test partition is generated
seeds = [42, 123, 47899, 2025, 1, 20, 99, 1020, 345, 78]

for seed in seeds:
    # Seed the global random number generators before building this partition
    set_seeds(seed)
    train_set, val_set, test_set = create_partitions(df, seed)

    # One CSV per subset, tagged with the seed and written relative to the
    # current working directory; the DataFrame index is not written.
    outputs = (
        (train_set, f'train_set{seed}.csv'),
        (test_set, f'test_set{seed}.csv'),
        (val_set, f'val_set{seed}.csv'),
    )
    for frame, filename in outputs:
        frame.to_csv(filename, index=False)