## Summary

### Data split
- Train and validate datasets were merged - cross dataset
- Train and validate datasets were made using the cross dataset.
- It was split into 4 parts and each time the new part was used as validation dsataset.

### Results
- Test rmse: 0.582178 +/- 0.016978
- Test r2: 0.896924 +/- 0.006035

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.neural_network import MLPRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error, r2_score
from warnings import filterwarnings
from sklearn.model_selection import train_test_split
filterwarnings('ignore')

In [2]:
data_train = pd.read_csv('../../../data/3_final_data/split_data/logP_wo_parameters_train.csv', index_col=0)
data_valid = pd.read_csv('../../../data/3_final_data/split_data/logP_wo_parameters_validation.csv', index_col=0)
data_test = pd.read_csv('../../../data/3_final_data/split_data/logP_wo_parameters_test.csv', index_col=0)

In [3]:
data_cross = pd.concat([data_train, data_valid])

In [4]:
def get_morgan_count_fps(data, bits=4096, radius=2):
    X = [AllChem.GetHashedMorganFingerprint(m, radius, nBits=bits) for m in data]
    X_list = []
    for x in X:
        array = np.zeros((0,), dtype=np.int64)
        DataStructs.ConvertToNumpyArray(x, array)
        X_list.append(array)
    X = pd.DataFrame(X_list)
    return X

In [5]:
X_test_mol = [Chem.MolFromSmiles(x) for x in data_test.smiles]
X_test = get_morgan_count_fps(X_test_mol)
y_test = data_test.logP

In [6]:
rmse_values = []
r2_values = []

In [7]:
part_size = len(data_cross) // 4

In [8]:
len(data_cross)

10732

In [9]:
for i in range(4):
    print(f"Fold {i} is started")
    valid_indices = pd.Series([i * part_size <= j < (i + 1) * part_size for j in range(len(data_cross))])
    print(valid_indices)
    train_data = data_cross[~valid_indices]
    valid_data = data_cross[valid_indices]
    X_train_mol = [Chem.MolFromSmiles(smi) for smi in train_data.smiles]
    y_train = train_data.logP
    X_valid_mol = [Chem.MolFromSmiles(smi) for smi in valid_data.smiles]
    y_valid = valid_data.logP
    
    X_train = get_morgan_count_fps(X_train_mol)
    X_valid = get_morgan_count_fps(X_valid_mol)

    print("Counted fingerprints")
    regr = MLPRegressor(random_state=10, max_iter=7)
    regr.fit(X_train, y_train)
    test_rmse = mean_squared_error(y_test, regr.predict(X_test), squared=False)
    test_r2 = r2_score(y_test, regr.predict(X_test))
    rmse_values.append(test_rmse)
    r2_values.append(test_r2)


Fold 0 is started
0         True
1         True
2         True
3         True
4         True
         ...  
10727    False
10728    False
10729    False
10730    False
10731    False
Length: 10732, dtype: bool
Counted fingerprints
Fold 1 is started
0        False
1        False
2        False
3        False
4        False
         ...  
10727    False
10728    False
10729    False
10730    False
10731    False
Length: 10732, dtype: bool
Counted fingerprints
Fold 2 is started
0        False
1        False
2        False
3        False
4        False
         ...  
10727    False
10728    False
10729    False
10730    False
10731    False
Length: 10732, dtype: bool
Counted fingerprints
Fold 3 is started
0        False
1        False
2        False
3        False
4        False
         ...  
10727     True
10728     True
10729     True
10730     True
10731     True
Length: 10732, dtype: bool
Counted fingerprints


In [10]:
rmse_values = pd.Series(rmse_values)
r2_values = pd.Series(r2_values)

In [11]:
rmse_values

0    0.603630
1    0.587504
2    0.571763
3    0.565816
dtype: float64

In [12]:
rmse_values.describe()

count    4.000000
mean     0.582178
std      0.016978
min      0.565816
25%      0.570276
50%      0.579634
75%      0.591536
max      0.603630
dtype: float64

In [13]:
r2_values

0    0.889259
1    0.895097
2    0.900643
3    0.902699
dtype: float64

In [14]:
r2_values.describe()

count    4.000000
mean     0.896924
std      0.006035
min      0.889259
25%      0.893637
50%      0.897870
75%      0.901157
max      0.902699
dtype: float64

## LogP wo averaging

In [5]:
import pandas as pd
import numpy as np
import time
from sklearn.neural_network import MLPRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error, r2_score
from warnings import filterwarnings
from sklearn.model_selection import train_test_split
filterwarnings('ignore')
import os
from sklearn.model_selection import KFold
from tqdm import tqdm

In [6]:
DATA_PATH = '../../../data/3_final_data/split_data/'

RAW_PATH = '../../../data/raw/baselines/morganfp'

FILE_PREFIX = 'logp_wo_averaging'

SMILES_COLUMN = 'smiles'
VALUE_COLUMN = 'logP'

BITS_NUM = 8192

In [7]:
data_train = pd.read_csv(os.path.join(DATA_PATH, FILE_PREFIX+'_train.csv'), index_col=0)
data_valid = pd.read_csv(os.path.join(DATA_PATH, FILE_PREFIX+'_validation.csv'), index_col=0)
data_test = pd.read_csv(os.path.join(DATA_PATH, FILE_PREFIX+'_test.csv'), index_col=0)

In [8]:
data_cross = pd.concat([data_train, data_valid])

In [9]:
def get_morgan_count_fps(data, bits=4096, radius=2):
    X = [AllChem.GetHashedMorganFingerprint(m, radius, nBits=bits) for m in data]
    X_list = []
    for x in tqdm(X):
        array = np.zeros((0,), dtype=np.int64)
        DataStructs.ConvertToNumpyArray(x, array)
        X_list.append(array)
    X = pd.DataFrame(X_list)
    return X

### Bits Number 8192

In [12]:
BITS_NUM = 8192

In [13]:
kf = KFold(n_splits=4, shuffle=False)

In [14]:
rmse_values = []
r2_values = []
X_test_mol = [Chem.MolFromSmiles(x) for x in data_test[SMILES_COLUMN]]
X_test = get_morgan_count_fps(X_test_mol, bits = BITS_NUM)
y_test = data_test[VALUE_COLUMN]
for i, (train_index, val_index) in enumerate(kf.split(data_cross)):
    print(f"Fold {i} is started")
    train_data = data_cross.iloc[train_index]
    valid_data = data_cross.iloc[val_index]
    X_train_mol = [Chem.MolFromSmiles(smi) for smi in train_data[SMILES_COLUMN]]
    y_train = train_data[VALUE_COLUMN]
    X_valid_mol = [Chem.MolFromSmiles(smi) for smi in valid_data[SMILES_COLUMN]]
    y_valid = valid_data[VALUE_COLUMN]
    
    X_train = get_morgan_count_fps(X_train_mol, bits = BITS_NUM)
    X_valid = get_morgan_count_fps(X_valid_mol, bits = BITS_NUM)

    print("Counted fingerprints")
    regr = MLPRegressor(random_state=10, max_iter=7)
    regr.fit(X_train, y_train)
    test_rmse = mean_squared_error(y_test, regr.predict(X_test), squared=False)
    test_r2 = r2_score(y_test, regr.predict(X_test))
    rmse_values.append(test_rmse)
    r2_values.append(test_r2)
    
    test_predictions = pd.DataFrame(columns=[SMILES_COLUMN, VALUE_COLUMN, VALUE_COLUMN+'_pred'])
    test_predictions[SMILES_COLUMN] = data_test[SMILES_COLUMN]
    test_predictions[VALUE_COLUMN] = data_test[VALUE_COLUMN]
    test_predictions[VALUE_COLUMN+'_pred'] = regr.predict(X_test)
    
    test_predictions.to_csv(os.path.join(RAW_PATH, FILE_PREFIX+'_test_predictions_'+str(i)+'.csv'))

100%|██████████| 2067/2067 [00:00<00:00, 4140.70it/s]


Fold 0 is started


100%|██████████| 8782/8782 [00:02<00:00, 3882.67it/s]
100%|██████████| 2928/2928 [00:00<00:00, 4144.10it/s]


Counted fingerprints
Fold 1 is started


100%|██████████| 8782/8782 [00:02<00:00, 4154.18it/s]
100%|██████████| 2928/2928 [00:00<00:00, 4147.85it/s]


Counted fingerprints
Fold 2 is started


100%|██████████| 8783/8783 [00:02<00:00, 4140.13it/s]
100%|██████████| 2927/2927 [00:00<00:00, 4150.70it/s]


Counted fingerprints
Fold 3 is started


100%|██████████| 8783/8783 [00:02<00:00, 4149.21it/s]
100%|██████████| 2927/2927 [00:00<00:00, 4155.18it/s]


Counted fingerprints


In [15]:
rmse_values = pd.Series(rmse_values)
r2_values = pd.Series(r2_values)

In [16]:
rmse_values.describe()

count    4.000000
mean     0.572404
std      0.000803
min      0.571275
25%      0.572119
50%      0.572636
75%      0.572920
max      0.573070
dtype: float64

In [17]:
r2_values.describe()

count    4.000000
mean     0.902705
std      0.000273
min      0.902479
25%      0.902530
50%      0.902627
75%      0.902802
max      0.903089
dtype: float64

### Bits Number 4096

In [18]:
BITS_NUM = 4096

In [19]:
kf = KFold(n_splits=4, shuffle=False)

In [10]:
rmse_values = []
r2_values = []
X_test_mol = [Chem.MolFromSmiles(x) for x in data_test[SMILES_COLUMN]]
X_test = get_morgan_count_fps(X_test_mol, bits = BITS_NUM)
y_test = data_test[VALUE_COLUMN]
for i, (train_index, val_index) in enumerate(kf.split(data_cross)):
    print(f"Fold {i} is started")
    train_data = data_cross.iloc[train_index]
    valid_data = data_cross.iloc[val_index]
    X_train_mol = [Chem.MolFromSmiles(smi) for smi in train_data[SMILES_COLUMN]]
    y_train = train_data[VALUE_COLUMN]
    X_valid_mol = [Chem.MolFromSmiles(smi) for smi in valid_data[SMILES_COLUMN]]
    y_valid = valid_data[VALUE_COLUMN]
    
    X_train = get_morgan_count_fps(X_train_mol, bits = BITS_NUM)
    X_valid = get_morgan_count_fps(X_valid_mol, bits = BITS_NUM)

    print("Counted fingerprints")
    regr = MLPRegressor(random_state=10, max_iter=7)
    regr.fit(X_train, y_train)
    test_rmse = mean_squared_error(y_test, regr.predict(X_test), squared=False)
    test_r2 = r2_score(y_test, regr.predict(X_test))
    rmse_values.append(test_rmse)
    r2_values.append(test_r2)
    
    test_predictions = pd.DataFrame(columns=[SMILES_COLUMN, VALUE_COLUMN, VALUE_COLUMN+'_pred'])
    test_predictions[SMILES_COLUMN] = data_test[SMILES_COLUMN]
    test_predictions[VALUE_COLUMN] = data_test[VALUE_COLUMN]
    test_predictions[VALUE_COLUMN+'_pred'] = regr.predict(X_test)
    
    test_predictions.to_csv(os.path.join(RAW_PATH, FILE_PREFIX+'_test_predictions_'+str(i)+'.csv'))

Fold 0 is started


100%|██████████| 8782/8782 [00:01<00:00, 7322.23it/s]
100%|██████████| 2928/2928 [00:00<00:00, 8048.18it/s]


Counted fingerprints
Fold 1 is started


100%|██████████| 8782/8782 [00:01<00:00, 8138.14it/s]
100%|██████████| 2928/2928 [00:00<00:00, 8098.23it/s]


Counted fingerprints
Fold 2 is started


100%|██████████| 8783/8783 [00:01<00:00, 8128.82it/s]
100%|██████████| 2927/2927 [00:00<00:00, 8113.32it/s]


Counted fingerprints
Fold 3 is started


100%|██████████| 8783/8783 [00:01<00:00, 8121.89it/s]
100%|██████████| 2927/2927 [00:00<00:00, 8109.01it/s]


Counted fingerprints


In [11]:
rmse_values = pd.Series(rmse_values)
r2_values = pd.Series(r2_values)

In [12]:
rmse_values.describe()

count    4.000000
mean     0.596955
std      0.002586
min      0.594699
25%      0.594738
50%      0.596820
75%      0.599036
max      0.599481
dtype: float64

In [13]:
r2_values.describe()

count    4.000000
mean     0.894179
std      0.000917
min      0.893283
25%      0.893441
50%      0.894227
75%      0.894965
max      0.894979
dtype: float64

## LogD Lipophilicity

In [18]:
import pandas as pd
import numpy as np
import time
from sklearn.neural_network import MLPRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error, r2_score
from warnings import filterwarnings
from sklearn.model_selection import train_test_split
filterwarnings('ignore')
import os
from sklearn.model_selection import KFold
from tqdm import tqdm

In [19]:
DATA_PATH = '../../../data/3_final_data/split_data/'

FILE_PREFIX = 'logd_Lip_wo_averaging'

SMILES_COLUMN = 'smiles'
VALUE_COLUMN = 'logD'

In [20]:
data_train = pd.read_csv(os.path.join(DATA_PATH, FILE_PREFIX+'_train.csv'), index_col=0)
data_valid = pd.read_csv(os.path.join(DATA_PATH, FILE_PREFIX+'_validation.csv'), index_col=0)
data_test = pd.read_csv(os.path.join(DATA_PATH, FILE_PREFIX+'_test.csv'), index_col=0)

In [21]:
data_cross = pd.concat([data_train, data_valid])

In [22]:
def get_morgan_count_fps(data, bits=4096, radius=2):
    X = [AllChem.GetHashedMorganFingerprint(m, radius, nBits=bits) for m in data]
    X_list = []
    for x in tqdm(X):
        array = np.zeros((0,), dtype=np.int64)
        DataStructs.ConvertToNumpyArray(x, array)
        X_list.append(array)
    X = pd.DataFrame(X_list)
    return X

### Bits Number 8192

In [23]:
BITS_NUM = 8192

In [24]:
kf = KFold(n_splits=4, shuffle=False)

In [25]:
rmse_values = []
r2_values = []
X_test_mol = [Chem.MolFromSmiles(x) for x in data_test[SMILES_COLUMN]]
X_test = get_morgan_count_fps(X_test_mol, bits = BITS_NUM)
y_test = data_test[VALUE_COLUMN]
for i, (train_index, val_index) in enumerate(kf.split(data_cross)):
    print(f"Fold {i} is started")
    train_data = data_cross.iloc[train_index]
    valid_data = data_cross.iloc[val_index]
    X_train_mol = [Chem.MolFromSmiles(smi) for smi in train_data[SMILES_COLUMN]]
    y_train = train_data[VALUE_COLUMN]
    X_valid_mol = [Chem.MolFromSmiles(smi) for smi in valid_data[SMILES_COLUMN]]
    y_valid = valid_data[VALUE_COLUMN]
    
    X_train = get_morgan_count_fps(X_train_mol, bits = BITS_NUM)
    X_valid = get_morgan_count_fps(X_valid_mol, bits = BITS_NUM)

    print("Counted fingerprints")
    regr = MLPRegressor(random_state=10, max_iter=7)
    regr.fit(X_train, y_train)
    test_rmse = mean_squared_error(y_test, regr.predict(X_test), squared=False)
    test_r2 = r2_score(y_test, regr.predict(X_test))
    rmse_values.append(test_rmse)
    r2_values.append(test_r2)
    
    test_predictions = pd.DataFrame(columns=[SMILES_COLUMN, VALUE_COLUMN, VALUE_COLUMN+'_pred'])
    test_predictions[SMILES_COLUMN] = data_test[SMILES_COLUMN]
    test_predictions[VALUE_COLUMN] = data_test[VALUE_COLUMN]
    test_predictions[VALUE_COLUMN+'_pred'] = regr.predict(X_test)
    
    test_predictions.to_csv(os.path.join(RAW_PATH, FILE_PREFIX+'_test_predictions_'+str(i)+'.csv'))

100%|██████████| 625/625 [00:00<00:00, 4011.13it/s]


Fold 0 is started


100%|██████████| 2655/2655 [00:00<00:00, 4040.91it/s]
100%|██████████| 886/886 [00:00<00:00, 4032.97it/s]


Counted fingerprints
Fold 1 is started


100%|██████████| 2656/2656 [00:00<00:00, 4044.46it/s]
100%|██████████| 885/885 [00:00<00:00, 4024.94it/s]


Counted fingerprints
Fold 2 is started


100%|██████████| 2656/2656 [00:00<00:00, 4058.15it/s]
100%|██████████| 885/885 [00:00<00:00, 4037.43it/s]


Counted fingerprints
Fold 3 is started


100%|██████████| 2656/2656 [00:00<00:00, 4046.42it/s]
100%|██████████| 885/885 [00:00<00:00, 4036.08it/s]


Counted fingerprints


In [32]:
rmse_values = pd.Series(rmse_values)
r2_values = pd.Series(r2_values)

In [33]:
rmse_values.describe()

count    4.000000
mean     0.774683
std      0.022025
min      0.745228
25%      0.765202
50%      0.778629
75%      0.788110
max      0.796246
dtype: float64

In [34]:
r2_values.describe()

count    4.000000
mean     0.584684
std      0.023453
min      0.561507
25%      0.570407
50%      0.580664
75%      0.594940
max      0.615899
dtype: float64

### Bits Number 4096

In [35]:
BITS_NUM = 4096

In [36]:
kf = KFold(n_splits=4, shuffle=False)

In [37]:
rmse_values = []
r2_values = []
X_test_mol = [Chem.MolFromSmiles(x) for x in data_test[SMILES_COLUMN]]
X_test = get_morgan_count_fps(X_test_mol, bits = BITS_NUM)
y_test = data_test[VALUE_COLUMN]
for i, (train_index, val_index) in enumerate(kf.split(data_cross)):
    print(f"Fold {i} is started")
    train_data = data_cross.iloc[train_index]
    valid_data = data_cross.iloc[val_index]
    X_train_mol = [Chem.MolFromSmiles(smi) for smi in train_data[SMILES_COLUMN]]
    y_train = train_data[VALUE_COLUMN]
    X_valid_mol = [Chem.MolFromSmiles(smi) for smi in valid_data[SMILES_COLUMN]]
    y_valid = valid_data[VALUE_COLUMN]
    
    X_train = get_morgan_count_fps(X_train_mol, bits = BITS_NUM)
    X_valid = get_morgan_count_fps(X_valid_mol, bits = BITS_NUM)

    print("Counted fingerprints")
    regr = MLPRegressor(random_state=10, max_iter=7)
    regr.fit(X_train, y_train)
    test_rmse = mean_squared_error(y_test, regr.predict(X_test), squared=False)
    test_r2 = r2_score(y_test, regr.predict(X_test))
    rmse_values.append(test_rmse)
    r2_values.append(test_r2)
    
    test_predictions = pd.DataFrame(columns=[SMILES_COLUMN, VALUE_COLUMN, VALUE_COLUMN+'_pred'])
    test_predictions[SMILES_COLUMN] = data_test[SMILES_COLUMN]
    test_predictions[VALUE_COLUMN] = data_test[VALUE_COLUMN]
    test_predictions[VALUE_COLUMN+'_pred'] = regr.predict(X_test)
    
    test_predictions.to_csv(os.path.join(RAW_PATH, FILE_PREFIX+'_test_predictions_'+str(i)+'.csv'))

100%|██████████| 625/625 [00:00<00:00, 7740.69it/s]


Fold 0 is started


100%|██████████| 2655/2655 [00:00<00:00, 7755.72it/s]
100%|██████████| 886/886 [00:00<00:00, 7737.96it/s]


Counted fingerprints
Fold 1 is started


100%|██████████| 2656/2656 [00:00<00:00, 7732.47it/s]
100%|██████████| 885/885 [00:00<00:00, 7656.79it/s]


Counted fingerprints
Fold 2 is started


100%|██████████| 2656/2656 [00:00<00:00, 7808.33it/s]
100%|██████████| 885/885 [00:00<00:00, 7740.52it/s]


Counted fingerprints
Fold 3 is started


100%|██████████| 2656/2656 [00:00<00:00, 7809.01it/s]
100%|██████████| 885/885 [00:00<00:00, 7758.27it/s]


Counted fingerprints


In [38]:
rmse_values = pd.Series(rmse_values)
r2_values = pd.Series(r2_values)

In [39]:
rmse_values.describe()

count    4.000000
mean     0.778364
std      0.020927
min      0.751912
25%      0.766415
50%      0.783042
75%      0.794990
max      0.795458
dtype: float64

In [40]:
r2_values.describe()

count    4.000000
mean     0.580755
std      0.022428
min      0.562374
25%      0.562889
50%      0.575834
75%      0.593699
max      0.608977
dtype: float64