# Baseline methods using fingerprints

## Data preprocessing

In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.neural_network import MLPRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error, r2_score
from warnings import filterwarnings
filterwarnings('ignore')

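*Choosing the dataset: run exactly one of the three loading cells below; each of them overwrites `data_train`, `data_valid` and `data_test`.*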

In [5]:
# Dataset variant "logp_wo_averaging": read the train / validation / test
# splits, using the first CSV column as the row index.
data_train, data_valid, data_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../data/3_final_data/split_data/logp_wo_averaging_train.csv',
        '../../../data/3_final_data/split_data/logp_wo_averaging_validation.csv',
        '../../../data/3_final_data/split_data/logp_wo_averaging_test.csv',
    )
)

In [3]:
# Dataset variant "logP_mean": read the train / validation / test splits,
# using the first CSV column as the row index.
data_train, data_valid, data_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../data/3_final_data/split_data/logP_mean_train.csv',
        '../../../data/3_final_data/split_data/logP_mean_validation.csv',
        '../../../data/3_final_data/split_data/logP_mean_test.csv',
    )
)

In [4]:
# Dataset variant "logP_wo_parameters": read the train / validation / test
# splits, using the first CSV column as the row index.
data_train, data_valid, data_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../../data/3_final_data/split_data/logP_wo_parameters_train.csv',
        '../../../data/3_final_data/split_data/logP_wo_parameters_validation.csv',
        '../../../data/3_final_data/split_data/logP_wo_parameters_test.csv',
    )
)

*Getting X/y dataframes*

In [6]:
# Regression target: the `logP` column of each split.
y_train = data_train['logP']
y_valid = data_valid['logP']
y_test = data_test['logP']

In [7]:
# Parse every SMILES string of each split into an RDKit molecule object.
X_train_mol = list(map(Chem.MolFromSmiles, data_train.smiles))
X_valid_mol = list(map(Chem.MolFromSmiles, data_valid.smiles))
X_test_mol = list(map(Chem.MolFromSmiles, data_test.smiles))

## Mean predictor

In [7]:
# Trivial baseline: predict the training-set mean for every test sample.
train_mean = y_train.mean()
y_predicted = pd.Series(train_mean, index=list(range(y_test.size)))

mean_predictor_rmse = mean_squared_error(y_test, y_predicted, squared=False)
mean_predictor_r2 = r2_score(y_test, y_predicted)
print("RMSE of mean predictor is {0}".format(mean_predictor_rmse))
print("R2-score of mean predictor is {0}".format(mean_predictor_r2))

RMSE of mean predictor is 1.8141849992522239
R2-score of mean predictor is -0.0002990500750039704


## Morgan bit fingerprint + MLPRegressor. Without pH.

In [8]:
def get_morgan_bit_fps(data, bits=2048, radius=2):
    """Build a Morgan bit-fingerprint feature matrix.

    Parameters
    ----------
    data : iterable
        Molecules accepted by ``AllChem.GetMorganFingerprintAsBitVect``
        (the notebook passes the output of ``Chem.MolFromSmiles``).
    bits : int, default 2048
        Length of the fingerprint bit vector (``nBits``).
    radius : int, default 2
        Morgan fingerprint radius.

    Returns
    -------
    pandas.DataFrame
        One row per molecule, one column per bit (integer column labels
        starting at 0). Values are numeric 0/1 (``uint8``) rather than the
        characters ``'0'``/``'1'``, so the frame can be fed to an estimator
        without relying on implicit string-to-float coercion.
        An empty ``data`` yields an empty frame.
    """
    rows = []
    for mol in data:
        bit_string = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bits).ToBitString()
        # The bit string consists of ASCII '0'/'1'; subtracting ord('0')
        # turns the raw bytes into 0/1 integers in one vectorised step.
        rows.append(np.frombuffer(bit_string.encode('ascii'), dtype=np.uint8) - ord('0'))

    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(np.vstack(rows))

In [9]:
# Grid search over Morgan fingerprint radius (2..4) and bit-vector length
# (2**9 .. 2**13). For every combination an MLPRegressor is trained one
# fit() call at a time until the validation RMSE stops decreasing.
#
# best_params layout: [fitted model, validation RMSE, bits, radius].
# The RMSE slot starts at +inf so that the first candidate is always
# accepted. The previous seed `10^3` was a bitwise XOR (== 9), not 1000.
best_params = [-1, float('inf'), -1, -1]
for radius in range(2, 5):
    for bits_degree in range(9, 14):

        start_time = time.time()
        print('-' * 25)
        bits = 2 ** bits_degree

        X_train = get_morgan_bit_fps(X_train_mol, bits, radius)
        X_valid = get_morgan_bit_fps(X_valid_mol, bits, radius)

        # max_iter=1 together with warm_start=True makes each fit() call
        # continue from the previous weights instead of restarting.
        regr = MLPRegressor(random_state=10, max_iter=1, warm_start=True)

        valid_rmse = float('inf')
        rmse = mean_squared_error(y_valid, regr.fit(X_train, y_train).predict(X_valid), squared = False)
        epoch_count = 1
        # Keep training while the latest validation RMSE improves on the
        # previous one.
        # NOTE(review): when the loop exits, `regr` and `rmse` belong to the
        # first non-improving iteration, i.e. one step past the minimum that
        # is held in `valid_rmse`.
        while valid_rmse > rmse:
            valid_rmse = rmse
            train_rmse = mean_squared_error(y_train, regr.predict(X_train), squared = False)
            epoch_count += 1
            rmse = mean_squared_error(y_valid, regr.fit(X_train, y_train).predict(X_valid), squared = False)

        if rmse < best_params[1]:
            best_params = [regr, rmse, bits, radius]

        seconds = time.time() - start_time
        print("Fitting with params radius={0}, bits={1} finished. RMSE={2} was achieved after {3} epochs in {4:.2f} seconds\n".
              format(radius, bits, rmse, epoch_count, seconds))

-------------------------
Fitting with params radius=2, bits=512 finished. RMSE=0.8555273709396101 was achieved after 17 epochs in 78.89 seconds

-------------------------
Fitting with params radius=2, bits=1024 finished. RMSE=0.7925866039776233 was achieved after 16 epochs in 151.17 seconds

-------------------------
Fitting with params radius=2, bits=2048 finished. RMSE=0.7789708908981519 was achieved after 10 epochs in 203.89 seconds

-------------------------
Fitting with params radius=2, bits=4096 finished. RMSE=0.7778858735872698 was achieved after 9 epochs in 327.35 seconds

-------------------------
Fitting with params radius=2, bits=8192 finished. RMSE=0.7395358720990214 was achieved after 5 epochs in 255.92 seconds

-------------------------
Fitting with params radius=3, bits=512 finished. RMSE=0.9336196929580187 was achieved after 13 epochs in 31.44 seconds

-------------------------
Fitting with params radius=3, bits=1024 finished. RMSE=0.8596371495476879 was achieved after

In [10]:
# Unpack the winning configuration, stored as [model, rmse, bits, radius].
best_model = best_params[0]
best_bits, best_radius = best_params[2:]

print("Best params for bit fingerprints are: radius = {2}, bits count = {1}".format(*best_params[1:]))
print("Fitting test data...")

# Rebuild the feature matrices with the selected fingerprint settings.
X_train = get_morgan_bit_fps(X_train_mol, best_bits, best_radius)
X_valid = get_morgan_bit_fps(X_valid_mol, best_bits, best_radius)
X_test = get_morgan_bit_fps(X_test_mol, best_bits, best_radius)

# --- train split ---
y_predicted = best_model.predict(X_train)
eval_rmse = mean_squared_error(y_train, y_predicted, squared=False)
eval_r2 = r2_score(y_train, y_predicted)
print("Train RMSE =", eval_rmse)
print("Train R2-score is {0}".format(eval_r2))
train_results = pd.DataFrame({
    'smiles': data_train.smiles,
    'logp': y_train,
    'logp_pred': y_predicted,
})

# --- validation split ---
y_predicted = best_model.predict(X_valid)
eval_rmse = mean_squared_error(y_valid, y_predicted, squared=False)
eval_r2 = r2_score(y_valid, y_predicted)
print("Valid RMSE =", eval_rmse)
print("Valid R2-score is {0}".format(eval_r2))
valid_results = pd.DataFrame({
    'smiles': data_valid.smiles,
    'logp': y_valid,
    'logp_pred': y_predicted,
})

# --- test split ---
y_predicted = best_model.predict(X_test)
eval_rmse = mean_squared_error(y_test, y_predicted, squared=False)
eval_r2 = r2_score(y_test, y_predicted)
print("Test RMSE =", eval_rmse)
print("Test R2-score is {0}".format(eval_r2))
test_results = pd.DataFrame({
    'smiles': data_test.smiles,
    'logp': y_test,
    'logp_pred': y_predicted,
})

Best params for bit fingerprints are: radius = 2, bits count = 8192
Fitting test data...
Train RMSE = 0.4510724090123696
Train R2-score is 0.9423900610278957
Valid RMSE = 0.7395358720990214
Valid R2-score is 0.8377921775077246
Test RMSE = 0.7215209634809493
Test R2-score is 0.8454102677631433


In [12]:
# Persist the per-split prediction tables (row index is not written).
for results_frame, csv_path in (
    (train_results, "../../../data/4_best_predictions/bit_fingerprint_predictions_train.csv"),
    (valid_results, "../../../data/4_best_predictions/bit_fingerprint_predictions_valid.csv"),
    (test_results, "../../../data/4_best_predictions/bit_fingerprint_predictions_test.csv"),
):
    results_frame.to_csv(csv_path, index=False)

## Morgan bit fingerprint + MLPRegressor. With pH.
*The best parameters found without pH were radius = 2 and bits count = 2048, so they are reused here.*

In [None]:
# Morgan fingerprints (2048 bits, radius 2) plus pH as one extra feature,
# stored in the column labelled 2048 (the first label after bits 0..2047).
#
# `.values` assigns the pH column by position. The fingerprint frame has a
# fresh 0..n-1 index, while data_train / data_valid keep the index read from
# the CSV (index_col=0). Assigning the Series directly would align on those
# labels and could misplace rows or introduce NaN.
X_train = get_morgan_bit_fps(X_train_mol, 2048, 2)
X_train[2048] = data_train.pH_range.values
X_valid = get_morgan_bit_fps(X_valid_mol, 2048, 2)
X_valid[2048] = data_valid.pH_range.values

regr = MLPRegressor(random_state=10, early_stopping=True).fit(X_train, y_train)
y_predicted = regr.predict(X_valid)
rmse = mean_squared_error(y_valid, y_predicted, squared=False)
print(rmse)

In [None]:
# Evaluate the pH-aware model on the test split.
# The pH feature must come from data_test (it was previously taken from
# data_valid, pairing test molecules with validation pH values).
# `.values` assigns by position, because the fingerprint frame has a fresh
# 0..n-1 index while data_test keeps the index read from the CSV.
X_test = get_morgan_bit_fps(X_test_mol, 2048, 2)
X_test[2048] = data_test.pH_range.values
y_predicted = regr.predict(X_test)
print(mean_squared_error(y_test, y_predicted, squared=False))
print("R2-score is {0}".format(r2_score(y_test, y_predicted)))
print("R2-score is {0}".format(r2_score(y_test, y_predicted)))