In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

In [2]:
from descriptastorus.descriptors import rdNormalizedDescriptors
from rdkit import Chem
import logging

# Build the normalized RDKit 2D descriptor generator once; the featurization
# helpers defined in this notebook read it as a module-level name.
generator = rdNormalizedDescriptors.RDKit2DNormalized()
generator.columns # list of tuples: (descriptor_name, numpy_type) ...

# Usage:
#   results = generator.process(smiles)
#   results[0] is True/False depending on whether the SMILES could be processed correctly
#   results[1:] are the normalized descriptors as described in generator.columns

# example for converting a smiles string into the values
def rdkit_2d_normalized_features(smiles: str):
    """Return the normalized RDKit 2D descriptor values for one SMILES string.

    Args:
        smiles: SMILES string of the molecule to featurize.

    Returns:
        The values produced by ``generator.process(smiles)`` without the
        leading success flag. When the flag is falsy a warning is logged and
        the values are returned as supplied by the generator (the original
        note describes them as default values for the column types). When the
        generator returns no result at all, a warning is logged and an empty
        list is returned.
    """
    results = generator.process(smiles)
    if results is None:
        # NOTE(review): process() appears to return None (instead of a flagged
        # result) when the SMILES cannot be parsed; indexing None would raise
        # TypeError. Confirm against the installed descriptastorus version.
        logging.warning("Unable to process smiles %s", smiles)
        return []
    # n.b. the first element is True/False depending on whether the
    # descriptors were properly computed.
    processed, features = results[0], results[1:]
    # The flag is a boolean, so test its truthiness: an `is None` comparison
    # never fires for a False flag.
    if not processed:
        logging.warning("Unable to process smiles %s", smiles)
    return features

In [3]:
# Column names used by the featurization cells.
ERROR_COLUMN = 'Absolute Error'  # not referenced by the other cells visible in this notebook
SMILES_COLUMN = 'smiles'  # column holding the SMILES strings
VALUE_COLUMN = 'logP'  # column holding the target value
PREDS_COLUMN = 'logP_pred'  # not referenced by the other cells visible in this notebook

# Directory the train/validation/test CSV files are read from
# (relative path, resolved against the kernel's working directory).
DATASET_INPUT_PATH = '../../../data/3_final_data/split_data'

# Directory the descriptor tables and SMILES index files are written to.
DATASET_OUTPUT_PATH = '../../../data/raw/baselines/dmpnn'

In [4]:
# Read the three pre-split tables (train / validation / test) in one pass.
split_files = (
    'logp_wo_averaging_train.csv',
    'logp_wo_averaging_validation.csv',
    'logp_wo_averaging_test.csv',
)
train_data, val_data, test_data = [
    pd.read_csv(os.path.join(DATASET_INPUT_PATH, file_name))
    for file_name in split_files
]

In [5]:
def create_feature_dataframe(df):
    """Build a table of normalized RDKit 2D descriptors for every row of ``df``.

    Args:
        df: DataFrame with a ``SMILES_COLUMN`` column of SMILES strings and a
            ``VALUE_COLUMN`` column with the target value.

    Returns:
        A ``(rdkit_features, smiles_index_dict)`` tuple. ``rdkit_features`` has
        one row per input row: the descriptor values keyed by the names in
        ``generator.columns``, followed by the ``SMILES_COLUMN`` and
        ``VALUE_COLUMN`` values. ``smiles_index_dict`` maps each SMILES string
        to its positional row index; when a SMILES string occurs several times
        the last position wins.
    """
    features_names = [column[0] for column in generator.columns]
    rdkit_table = []
    smiles_index_dict = {}
    # Walk the two needed columns directly: df.iloc[i][col] would rebuild a
    # whole row Series twice for every molecule.
    rows = zip(df[SMILES_COLUMN], df[VALUE_COLUMN])
    for i, (smiles, logP) in enumerate(tqdm(rows, total=len(df))):
        features = dict(zip(features_names, rdkit_2d_normalized_features(smiles)))
        features[SMILES_COLUMN] = smiles
        features[VALUE_COLUMN] = logP
        rdkit_table.append(features)
        # Later duplicates overwrite earlier ones on purpose (unchanged behavior).
        smiles_index_dict[smiles] = i
    rdkit_features = pd.DataFrame(rdkit_table)
    return rdkit_features, smiles_index_dict

In [6]:
# Featurize every split. This is the slow step of the notebook: in the recorded
# run below the train split took about 5.5 minutes at roughly 28 rows per second.
train_data_rdkit, train_smiles_dict = create_feature_dataframe(train_data)
val_data_rdkit, val_smiles_dict = create_feature_dataframe(val_data)
test_data_rdkit, test_smiles_dict = create_feature_dataframe(test_data)

100%|██████████| 9643/9643 [05:39<00:00, 28.38it/s]
100%|██████████| 2067/2067 [01:13<00:00, 28.15it/s]
100%|██████████| 2067/2067 [01:13<00:00, 28.27it/s]


In [7]:
import json

# For every split, persist the descriptor table as CSV and the
# SMILES -> row-index mapping as JSON. The 'drkit' spelling in the file names
# is kept as-is because the loading cell further down reads the same names.
split_outputs = (
    (train_data_rdkit, 'logp_wo_averaging_train_drkit_feat.csv',
     train_smiles_dict, 'logp_wo_averaging_train_smiles_dict.json'),
    (val_data_rdkit, 'logp_wo_averaging_val_drkit_feat.csv',
     val_smiles_dict, 'logp_wo_averaging_val_smiles_dict.json'),
    (test_data_rdkit, 'logp_wo_averaging_test_drkit_feat.csv',
     test_smiles_dict, 'logp_wo_averaging_test_smiles_dict.json'),
)
for feature_table, csv_name, smiles_lookup, json_name in split_outputs:
    feature_table.to_csv(os.path.join(DATASET_OUTPUT_PATH, csv_name))
    with open(os.path.join(DATASET_OUTPUT_PATH, json_name), 'w') as f:
        json.dump(smiles_lookup, f)


# Train model

In [8]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

In [9]:
# Column names, re-declared with the same values as in the featurization part above.
VALUE_COLUMN = 'logP'
SMILES_COLUMN = 'smiles'

# Directory holding the descriptor CSV files read below (same value as
# DATASET_OUTPUT_PATH in the featurization part).
DATA_PATH = '../../../data/raw/baselines/dmpnn'

In [10]:
# Reload the descriptor tables written by the featurization part; the first
# CSV column is the saved row index.
feature_files = (
    'logp_wo_averaging_train_drkit_feat.csv',
    'logp_wo_averaging_val_drkit_feat.csv',
    'logp_wo_averaging_test_drkit_feat.csv',
)
train_data, val_data, test_data = [
    pd.read_csv(os.path.join(DATA_PATH, file_name), index_col=0)
    for file_name in feature_files
]

In [11]:
# Separate every table into the descriptor matrix (X) and the target (y).
# The target and SMILES columns are not model inputs; missing descriptor
# values are replaced by 0.
non_feature_columns = [VALUE_COLUMN, SMILES_COLUMN]

X_train = train_data.drop(columns=non_feature_columns).fillna(0)
y_train = train_data[VALUE_COLUMN]

X_val = val_data.drop(columns=non_feature_columns).fillna(0)
y_val = val_data[VALUE_COLUMN]

X_test = test_data.drop(columns=non_feature_columns).fillna(0)
y_test = test_data[VALUE_COLUMN]

## XGBoost

In [18]:
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [19]:
# Gradient-boosted tree regressor: 100 trees of depth at most 6, fixed seed.
model = XGBRegressor(n_estimators = 100, max_depth = 6, random_state = 42)

In [20]:
# Fit on the unscaled training descriptors; the validation split is not passed
# to fit() and is only used for scoring further down.
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
# XGBoost predictions for the held-out splits (these names are rebound by the
# MLP section later in the notebook).
test_preds = model.predict(X_test)
val_preds = model.predict(X_val)

In [22]:
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword is deprecated and later removed in newer scikit-learn releases,
# while the square root of the default MSE works with every version.
print("Valid RMSE =", mean_squared_error(y_val, val_preds) ** 0.5)
print("Valid R2-score is {0}".format(r2_score(y_val, val_preds)))

print("Test RMSE =", mean_squared_error(y_test, test_preds) ** 0.5)
print("Test R2-score is {0}".format(r2_score(y_test, test_preds)))

Valid RMSE = 0.550378750673703
Valid R2-score is 0.9101585772785692
Test RMSE = 0.5765899578624547
Test R2-score is 0.9012773315417384


In [23]:
# Per-feature importance scores of the fitted XGBoost model (the recorded model
# repr above shows importance_type='gain').
feature_importance = model.feature_importances_

In [24]:
# Column labels of X_train in column order (iterating a DataFrame yields its
# column labels), i.e. the feature names matching feature_importance.
features = list(X_train)

In [25]:
# Empty frame with the two report columns; the following cell fills them in.
df = pd.DataFrame(columns = ['feature importance', 'feature name'])

In [26]:
# Pair every feature name with its importance score, keep the 20 highest
# scores and store them as a markdown table.
df['feature importance'] = feature_importance
df['feature name'] = features

ranked_features = df.sort_values(by='feature importance', ascending=False)
df_20_most_important = ranked_features.head(20)

report_path = os.path.join(DATA_PATH, 'experiments_result')
with open(report_path, 'w') as f:
    top20_table = df_20_most_important.reset_index(drop=True)
    f.write(top20_table.to_markdown())

## MLP

In [27]:
# Feature standardizer for the MLP inputs (the XGBoost model above was fit on
# the unscaled descriptors).
scaler = StandardScaler()

In [28]:
# Scaling statistics are estimated from the training split only; the other
# splits are transformed with these same statistics further down.
X_train_scaled = scaler.fit_transform(X_train)

In [29]:
# MLP regressor trained with the L-BFGS solver, capped at 100 iterations.
# NOTE(review): the recorded fit output below reports that the iteration limit
# was reached before convergence.
# NOTE(review): scikit-learn documents early_stopping as only effective for the
# 'sgd' and 'adam' solvers, so it presumably has no effect here - confirm.
# NOTE(review): warm_start=True makes a repeated fit() call continue from the
# previous solution, so re-running the fit cell is not idempotent - confirm
# this is intended.
regr = MLPRegressor(random_state=10, max_iter=100, warm_start=True, early_stopping = True, solver = 'lbfgs')

In [30]:
# Fit the MLP on the standardized training descriptors.
regr.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPRegressor(early_stopping=True, max_iter=100, random_state=10, solver='lbfgs',
             warm_start=True)

In [31]:
# Apply the scaling fitted on the training split (transform only, no refit).
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val) 

In [32]:
# MLP predictions; this rebinds test_preds / val_preds, which held the XGBoost
# predictions until this point.
test_preds = regr.predict(X_test_scaled)
val_preds = regr.predict(X_val_scaled)

In [33]:
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword is deprecated and later removed in newer scikit-learn releases,
# while the square root of the default MSE works with every version.
print("Valid RMSE =", mean_squared_error(y_val, val_preds) ** 0.5)
print("Valid R2-score is {0}".format(r2_score(y_val, val_preds)))

print("Test RMSE =", mean_squared_error(y_test, test_preds) ** 0.5)
print("Test R2-score is {0}".format(r2_score(y_test, test_preds)))

Valid RMSE = 0.5670377404502589
Valid R2-score is 0.9046375862326349
Test RMSE = 0.5642646297928993
Test R2-score is 0.9054528606765462
