In [1]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import AllChem

from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import r2_score


In [2]:
# Run configuration: the UniProt accession of the protein target and the
# BindingDB export (tab-separated) that holds its ligand measurements.
target_unit_pro_id = "P42345" # mTOR
binding_db_path = "BindingDB_mTOR.tsv"

# Destination of the pickled regressor. The file name is derived from the
# target accession so that models for different targets do not overwrite each
# other. (A previous, immediately-overwritten assignment was removed.)
output_path = f'{target_unit_pro_id}_xgbr_ligand_model.pkl'

# low_memory=False makes pandas parse each column in one pass instead of in
# chunks, which avoids chunk-dependent mixed dtypes within a column.
binddb = pd.read_table(binding_db_path, sep="\t", header=0, low_memory=False)

In [None]:
# Restrict the BindingDB table to rows measured against the configured target,
# keep just the ligand structure and its IC50, give the two columns short
# names, and discard rows that have no IC50 value.
is_target = binddb['UniProt (SwissProt) Primary ID of Target Chain'] == target_unit_pro_id
d = (
    binddb.loc[is_target, ['Ligand SMILES', 'IC50 (nM)']]
    .rename(columns={'Ligand SMILES': 'smiles', 'IC50 (nM)': 'ic50'})
    .dropna(subset=['ic50'])
)
print(d)

In [None]:
# Strip the qualifier characters ('>', '<', '=') from the IC50 entries and
# convert the column to float.
#
# The column is cast to str first so this cell also works when the column is
# already numeric (for example when the cell is re-run, or when the file
# contains no qualifiers); calling `.str` on a numeric column raises
# AttributeError. `assign` builds a new frame instead of writing into a
# filtered view, which avoids SettingWithCopyWarning.
#
# NOTE(review): removing the qualifier treats censored values such as ">10000"
# as exact measurements - confirm that this is acceptable for the model.
ic50_text = d['ic50'].astype(str).str.replace('[><=]', '', regex=True)
d = d.assign(ic50=ic50_text.astype(float))
print(d)

In [None]:
# Keep only the rows whose IC50 is at most 1000 (the source column is in nM),
# then report how many observations are left.
potent_mask = d['ic50'].le(1000)
d = d.loc[potent_mask]

print(f'Number of obs: {d.shape[0]}:')
print(d)

In [None]:
# Convert IC50 (nM) to the regression target: metric_value = -log10(IC50 * 1e-9).
#
# The conversion is vectorised over the column. Iterating `d[['ic50']].values`
# yields one-element arrays rather than scalars, and turning those back into
# floats relies on a deprecated NumPy array-to-scalar conversion.
metric_value = -np.log10(d['ic50'].astype(float) * 1E-9)
d = d.assign(metric_value=metric_value)[['smiles', 'metric_value']]

# An IC50 of 0 maps to +inf and a negative IC50 maps to NaN; neither is a
# usable target value, so keep finite values only.
d = d[np.isfinite(d['metric_value'])]
# One row per structure: the first occurrence of each SMILES string is kept.
d = d.drop_duplicates(subset='smiles')
d = d.dropna()

print(f'Number of obs: {d.shape[0]}:')

# Stop here when there is too little data. Only printing a message would let
# the following cells fit and save a model anyway.
if d.shape[0] < 10:
    raise ValueError('Less than 10 compound-target pairs. Not fitting a model')
# Convert each SMILES string to a Morgan fingerprint bit vector (radius 2) and
# collect the matching target values.
#   fps       - fingerprints, one per successfully parsed molecule
#   values    - metric_value for the same molecules, in the same order
#   fp_smiles - the SMILES strings that were successfully converted
fps = []
values = []
fp_smiles = []
n_skipped = 0
for smiles, metric_value in zip(d['smiles'], d['metric_value']):
    # MolFromSmiles signals an unparsable structure by returning None, so test
    # for that explicitly instead of hiding every exception behind a bare
    # `except:`. Non-string entries are skipped as well.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        n_skipped += 1
        continue

    fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2))
    fp_smiles.append(smiles)
    values.append(metric_value)

if n_skipped:
    print(f'Skipped {n_skipped} entries whose SMILES could not be parsed')

# Feature matrix and target vector for the regressor.
X = np.array(fps)
y = np.array(values)

In [None]:
# Fit a gradient-boosted regressor on all fingerprints; random_state is fixed
# so that repeated runs use the same seed, n_jobs=-1 uses all available cores.
xgb_regr = XGBRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
xgb_regr.fit(X, y)

# Persist the fitted model. pickle.dump returns None, so its result is not
# bound to a name.
with open(output_path, 'wb') as handle:
    pickle.dump(xgb_regr, handle)