In [1]:
import pandas as pd
import os
import numpy as np
import yaml

In [2]:
def prepare_dataset(tar_list):
    df_blank = pd.DataFrame({'smiles':[]})
    for dataset in tar_list:
        df0 = pd.read_csv(os.path.join(DATASET_PATH, dataset))
        df_blank =  pd.merge(df_blank, df0, on=SMILES_COLUMN, how='outer')
    return df_blank

In [3]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/raw/baselines/dmpnn'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logD']
DATASET_NAMES = ['logp_wo_averaging.csv', 'logd_Lip_wo_averaging.csv']

In [11]:
dataset = prepare_dataset(DATASET_NAMES)
dataset.to_csv(os.path.join(DATASET_OUTPUT_PATH, 'logp_logd_Lip_wo_averaging.csv'))

In [7]:
dataset.shape

(17685, 3)

In [4]:
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logD = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

## Check intersected molecules

In [31]:
dataset_logP[dataset_logP[SMILES_COLUMN].isin(dataset_logD[SMILES_COLUMN])]

Unnamed: 0,smiles,logP
95,C#CCN(C)C(C)Cc1ccccc1,2.90
443,C=CCOc1ccccc1OCC(O)CNC(C)C,2.10
453,C=CCc1ccccc1OCC(O)CNC(C)C,3.10
612,CC(=O)CC(c1ccccc1)c1c(O)c2ccccc2oc1=O,2.60
617,CC(=O)CCCCn1c(=O)c2c(ncn2C)n(C)c1=O,0.29
...,...,...
13618,c1ccc2[nH]ncc2c1,1.77
13708,c1ccc2ccccc2c1,3.30
13709,c1ccc2cnccc2c1,2.08
13717,c1ccc2ncccc2c1,2.03


## Split dataset

In [12]:
import pandas as pd
import os

In [13]:
DATASET_OUTPUT_PATH = '../data/raw/baselines/dmpnn'

In [14]:
def train_test_validation_split(df):
    from sklearn.model_selection import train_test_split
    train_data, rest_data = train_test_split(df, test_size=0.3)
    test_data, validation_data = train_test_split(rest_data, test_size=0.5)
    return train_data.reset_index(drop=True), validation_data.reset_index(drop=True), test_data.reset_index(drop=True)

In [15]:
file = 'logp_logd_Lip_wo_averaging'

In [16]:
data = pd.read_csv(os.path.join(DATASET_OUTPUT_PATH, file+'.csv'))
DATASET_PATH = '../data/3_final_data'

data.to_csv(os.path.join(DATASET_PATH, file+'.csv'))

print(file, 'shape: ', data.shape)    
train, validation, test = train_test_validation_split(data)
print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

train.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_train.csv'))
validation.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_validation.csv'))
test.to_csv(os.path.join(DATASET_PATH, 'split_data',  file + '_test.csv'))

logp_logd_Lip_wo_averaging shape:  (17685, 4)
SPLITTED SHAPES:
	train: (12379, 4)
	validation: (2653, 4)
	test: (2653, 4)



# Datasets correlation

In [6]:
DATASET_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logD']
DATASET_NAMES = ['logp_wo_averaging.csv', 'logd_Lip_wo_averaging.csv']
file = 'logp_logd_Lip_wo_averaging'

data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logD = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

In [10]:
logP_logD_cross_section = data.dropna()

In [19]:
print('Pearson\'s r LogP/LogD', \
      round(logP_logD_cross_section[VALUE_COLUMNS[0]].corr(logP_logD_cross_section[VALUE_COLUMNS[1]]), 3))
print('Spearman\'s rho LogP/LogD', \
      round(logP_logD_cross_section[VALUE_COLUMNS[0]].corr(logP_logD_cross_section[VALUE_COLUMNS[1]], method='spearman'), 3))
print('Kendall\'s tau LogP/LogD', \
      round(logP_logD_cross_section[VALUE_COLUMNS[0]].corr(logP_logD_cross_section[VALUE_COLUMNS[1]], method='kendall'), 3))

Pearson's r LogP/LogD 0.656
Spearman's rho LogP/LogD 0.656
Kendall's tau LogP/LogD 0.51
