In [11]:
import pandas as pd
import os
import numpy as np
import json
from sklearn.model_selection import KFold
from random import Random

# LogP: single train/validation/test split

In [2]:
# Relative path to the directory holding the pre-split logp_mean_{train,validation,test}.csv files.
DATASET_PATH = '../../../data/3_final_data/split_data'

In [6]:
# Load the three pre-computed partitions; the first CSV column is read as the index.
dataset_train, dataset_val, dataset_test = (
    pd.read_csv(os.path.join(DATASET_PATH, file_name), index_col=0)
    for file_name in ('logp_mean_train.csv', 'logp_mean_validation.csv', 'logp_mean_test.csv')
)

# Positional row numbers for each partition, assuming the rows are stacked
# in train -> validation -> test order.
train_indexes = [int(position) for position in range(len(dataset_train))]

val_start = train_indexes[-1] + 1
val_indexes = [val_start + offset for offset in range(len(dataset_val))]

test_start = val_indexes[-1] + 1
test_indexes = [test_start + offset for offset in range(len(dataset_test))]

split = {'train': train_indexes, 'val': val_indexes, 'test': test_indexes}

In [8]:
# Stack the partitions in train -> validation -> test order; ignore_index renumbers rows from 0.
dataset = pd.concat(
    objs=(dataset_train, dataset_val, dataset_test),
    ignore_index=True,
)

In [9]:
# Relative path to the directory that receives raw.csv and split_0.json for the LogP dataset.
DATASET_OUTPUT_PATH = '../../../data/raw/baselines/otgnn/logp'

In [10]:
# The merged table is stored without a header row or an index column.
raw_csv_path = os.path.join(DATASET_OUTPUT_PATH, 'raw.csv')
dataset.to_csv(raw_csv_path, header=False, index=False)

In [11]:
# Persist the train/val/test row positions next to raw.csv.
split_json_path = os.path.join(DATASET_OUTPUT_PATH, 'split_0.json')
with open(split_json_path, mode='w') as split_file:
    split_file.write(json.dumps(split))

In [12]:
def create_dataset(dataset_input_path, prefix_name, dataset_output_path):
    """Merge pre-split train/validation/test CSVs into one file plus a split index.

    Reads ``<prefix_name>_train.csv``, ``<prefix_name>_validation.csv`` and
    ``<prefix_name>_test.csv`` from ``dataset_input_path``. The first CSV
    column of each file is read as the index and is therefore not written
    to the output.

    Writes two files into ``dataset_output_path``:

    * ``raw.csv`` -- the three tables stacked in train, validation, test
      order, without header row or index column;
    * ``split_0.json`` -- ``{'train': [...], 'val': [...], 'test': [...]}``
      holding the 0-based row positions of each partition inside ``raw.csv``.

    Parameters
    ----------
    dataset_input_path : str
        Directory containing the three input CSV files.
    prefix_name : str
        Common file-name prefix of the three input CSV files.
    dataset_output_path : str
        Existing directory that receives ``raw.csv`` and ``split_0.json``.
    """
    def read_partition(suffix):
        # All three inputs share the same location, prefix and index layout.
        return pd.read_csv(os.path.join(dataset_input_path, prefix_name + suffix), index_col=0)

    dataset_train = read_partition('_train.csv')
    dataset_val = read_partition('_validation.csv')
    dataset_test = read_partition('_test.csv')

    # Offsets are derived from partition sizes rather than from the last
    # element of the previous index list, so an empty partition yields an
    # empty index list instead of an IndexError.
    val_start = len(dataset_train)
    test_start = val_start + len(dataset_val)
    test_stop = test_start + len(dataset_test)

    split = {
        'train': list(range(val_start)),
        'val': list(range(val_start, test_start)),
        'test': list(range(test_start, test_stop)),
    }

    # Build the merged table from the files read above. It must be a local
    # variable: relying on a notebook-level `dataset` would write whatever
    # frame happened to be in the kernel, unrelated to these inputs.
    dataset = pd.concat([dataset_train, dataset_val, dataset_test], ignore_index=True)

    dataset.to_csv(os.path.join(dataset_output_path, 'raw.csv'), index=False, header=False)
    with open(os.path.join(dataset_output_path, 'split_0.json'), 'w') as f:
        json.dump(split, f)

In [13]:
# LogP data without parameters: merge its pre-split CSVs into raw.csv + split_0.json.
create_dataset(
    dataset_input_path='../../../data/raw/baselines/otgnn/logp_wo_par/raw',
    prefix_name='logP_wo_parameters',
    dataset_output_path='../../../data/raw/baselines/otgnn/logp_wo_par/',
)

In [14]:
# LogP data without averaging: merge its pre-split CSVs into raw.csv + split_0.json.
create_dataset(
    dataset_input_path='../../../data/raw/baselines/otgnn/logp_wo_aver/raw',
    prefix_name='logp_wo_averaging',
    dataset_output_path='../../../data/raw/baselines/otgnn/logp_wo_aver/',
)

# Several splits (K-fold cross-validation)

In [16]:
def create_dataset(dataset_input_path, prefix_name, dataset_output_path, num_splits,
                   train_fraction=0.85, seed=42):
    """Write one dataset file and ``num_splits`` K-fold train/val/test splits.

    Reads ``<prefix_name>.csv`` from ``dataset_input_path`` and writes into
    ``dataset_output_path``:

    * ``raw.csv`` -- the table as read, without header row or index column;
    * ``split_<k>.json`` for ``k = 0 .. num_splits - 1`` -- a dict
      ``{'train': [...], 'val': [...], 'test': [...]}`` of row positions.

    For every fold the test rows come from
    ``KFold(n_splits=num_splits, shuffle=True, random_state=seed)``. The
    remaining rows are shuffled with ``Random(seed)``; the first
    ``int(train_fraction * len(remaining))`` become the training rows and the
    rest the validation rows.

    Parameters
    ----------
    dataset_input_path : str
        Directory containing ``<prefix_name>.csv``.
    prefix_name : str
        Input file name without the ``.csv`` extension.
    dataset_output_path : str
        Existing directory that receives ``raw.csv`` and the split files.
    num_splits : int
        Number of folds, passed to ``KFold`` as ``n_splits``.
    train_fraction : float, optional
        Share of the non-test rows assigned to training (default 0.85).
    seed : int, optional
        Seed for both the K-fold shuffle and the train/val shuffle (default 42).
    """
    dataset = pd.read_csv(os.path.join(dataset_input_path, prefix_name + '.csv'))

    kf = KFold(n_splits=num_splits, shuffle=True, random_state=seed)
    # Materialise the folds first so an invalid fold configuration fails
    # before any output file is touched.
    folds = list(kf.split(dataset))

    # raw.csv does not depend on the fold, so it is written once rather than
    # once per loop iteration.
    dataset.to_csv(os.path.join(dataset_output_path, 'raw.csv'), index=False, header=False)

    for fold_num, (train_val_index, test_index) in enumerate(folds):
        # Plain Python ints are required for JSON serialisation.
        train_val_indexes = train_val_index.tolist()
        test_indexes = test_index.tolist()

        # The generator is re-created with the same seed for every fold, so
        # each fold's shuffle is reproducible on its own.
        rng = Random(seed)
        rng.shuffle(train_val_indexes)

        train_size = int(train_fraction * len(train_val_indexes))
        split = {
            'train': train_val_indexes[:train_size],
            'val': train_val_indexes[train_size:],
            'test': test_indexes,
        }

        with open(os.path.join(dataset_output_path, 'split_' + str(fold_num) + '.json'), 'w') as f:
            json.dump(split, f)

In [17]:
# Five-fold splits for the LogP dataset read from logp_wo_logp_json_wo_averaging.csv.
create_dataset(
    dataset_input_path='../../../data/3_final_data',
    prefix_name='logp_wo_logp_json_wo_averaging',
    dataset_output_path='../../../data/raw/baselines/otgnn/logp_wo_json/',
    num_splits=5,
)