# Install RDKit and molvs

In [None]:
# Download the Miniconda installer, install it into /usr/local, then use conda
# to install Python 3.7 and a pinned RDKit build from the conda-forge channel.
# NOTE(review): repo.continuum.io looks like the legacy download host;
# presumably it redirects to repo.anaconda.com -- confirm it still resolves.
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# Make subsequent conda commands non-interactive.
!conda config --set always_yes yes --set changeps1 no
# The Python version is set to 3.7 here; the sys.path entry appended in the
# following install cell points at a python3.7 site-packages directory.
!conda install -q -y -c conda-forge python=3.7
!conda install -q -y -c conda-forge rdkit==2020.09.2 

In [None]:
!chmod +x Miniconda3-py37_4.8.3-Linux-x86_64.sh
!time bash ./Miniconda3-py37_4.8.3-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit

import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
pip install molvs

## Importing the libraries

In [None]:
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors 
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Implement RDKit 2D descriptors calculator

In [None]:
class RDKit_2D:
    """Compute the RDKit 2D descriptor table for a collection of SMILES strings.

    The SMILES are parsed into RDKit molecules once, at construction time.
    ``compute_2Drdkit`` then evaluates every descriptor listed in
    ``Descriptors._descList`` for every molecule and writes the table to CSV.
    """

    def __init__(self, smiles):
        """Parse ``smiles`` into RDKit molecules.

        Args:
            smiles: Iterable of SMILES strings.
        """
        # Materialise the input once: a generator would otherwise be exhausted
        # by the parsing loop, and a pandas Series would be index-aligned (not
        # position-aligned) when inserted into the output frame later.
        self.smiles = list(smiles)
        # Chem.MolFromSmiles yields None for strings it cannot parse; those
        # entries are kept so that rows stay aligned with ``self.smiles``.
        self.mols = [Chem.MolFromSmiles(s) for s in self.smiles]

    def compute_2Drdkit(self, name):
        """Compute all RDKit 2D descriptors and export them to a CSV file.

        Args:
            name: Path used to derive the output file name. Its extension
                (if any) is removed and ``'_RDKit_2D.csv'`` is appended, e.g.
                ``'logP_labels.csv'`` -> ``'logP_labels_RDKit_2D.csv'``.

        Returns:
            pandas.DataFrame: The exported table. The first column is
            ``'smiles'``; the remaining columns are the descriptors. Molecules
            that could not be parsed get NaN in every descriptor column.
        """
        import os  # local import: not provided by the notebook's import cell

        descriptor_names = [entry[0] for entry in Descriptors._descList]
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
        header = list(calc.GetDescriptorNames())

        # Placeholder row for unparseable molecules (CalcDescriptors cannot
        # be called on None).
        nan_row = (float('nan'),) * len(header)
        rows = [
            calc.CalcDescriptors(mol) if mol is not None else nan_row
            for mol in self.mols
        ]

        df = pd.DataFrame(rows, columns=header)
        df.insert(loc=0, column='smiles', value=self.smiles)

        # splitext strips whatever extension is present (or none), instead of
        # blindly dropping the last four characters of the name.
        stem, _ext = os.path.splitext(name)
        df.to_csv(stem + '_RDKit_2D.csv', index=False)
        return df
        

# Load dataset
The dataset is the logP dataset from OpenChem.

In [None]:
# Read the raw logP label table into a DataFrame
df1 = pd.read_csv("logP_labels.csv")


# Calculate 2D descriptors

In [None]:
def main(filename='logP_labels.csv', smiles_column='SMILES'):
    """Standardize the SMILES in a CSV file and export their RDKit 2D descriptors.

    Args:
        filename: Path to the input CSV file. It is also passed to
            ``RDKit_2D.compute_2Drdkit``, which derives the output file name
            from it by adding the ``'_RDKit_2D.csv'`` suffix.
        smiles_column: Name of the column that holds the SMILES strings.

    Raises:
        KeyError: If ``smiles_column`` is not a column of the input file.
    """
    df = pd.read_csv(filename)  # read the csv file as a pandas data frame

    if smiles_column not in df.columns:
        raise KeyError(
            "column %r not found in %r; available columns: %s"
            % (smiles_column, filename, list(df.columns))
        )

    # Standardize every SMILES with molvs before computing descriptors.
    smiles = [standardize_smiles(s) for s in df[smiles_column].values]

    # Compute the RDKit 2D descriptors and export them as a csv file.
    RDKit_descriptor = RDKit_2D(smiles)
    RDKit_descriptor.compute_2Drdkit(filename)


In [None]:
# Run the descriptor calculation when this is executed as the top-level module
if __name__ == "__main__":
    main()

# Load the dataset and the calculated descriptors

In [3]:
# Descriptor table written by the descriptor-calculation step
dataset = pd.read_csv("logP_labels_RDKit_2D.csv")

In [4]:
# Original label table; its 'Kow' column is used as the regression target
df1 = pd.read_csv("logP_labels.csv")

In [5]:
# Column assignment aligns on the index, so a row-count mismatch between the
# descriptor table and the label table would silently produce NaN or misaligned
# targets. Fail loudly instead.
assert len(dataset) == len(df1), (
    "descriptor table and label table have different numbers of rows"
)
# Append the experimental value as the last column (used as y below).
dataset['exp_logP'] = df1['Kow']

In [6]:
# Features: every column except the first ('smiles') and the last (target)
X = dataset.iloc[:, 1:-1].to_numpy()

# Target: the last column
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Inspect the feature matrix
print(X)

In [None]:
# Inspect the target vector
print(y)

## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the XGBoost Regression model on the Training set

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted regressor with the library's default hyperparameters,
# fitted on the training split only.
regressor = XGBRegressor()
regressor.fit(X=X_train, y=y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

## Predicting the Test set results

In [None]:
# Predict on the held-out split
y_pred = regressor.predict(X_test)

# Show two decimals, with predictions (left column) next to the true
# values (right column)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[ 2.79  3.49]
 [-0.64 -0.89]
 [-0.19 -0.59]
 ...
 [ 2.67  2.35]
 [ 0.26  0.88]
 [ 1.    1.23]]


## Evaluating the Model Performance

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination on the test split
r2_score(y_true=y_test, y_pred=y_pred)

0.8734809271921917

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test split
mean_squared_error(y_true=y_test, y_pred=y_pred)

0.4293477630849853