### Trainig the fitness model for the activity prediction

In [None]:
%cd ${PATH_TO_JANUS}

[Errno 2] No such file or directory: '/homes/USERNAME_PLACEHOLDER/JANUS'
/


In [242]:
import numpy as np
import pickle
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from rdkit import Chem
from rdkit.Chem import AllChem

RANDOM_STATE = 42 # np.random.randint(0, 1e6)

In [243]:
df = pd.read_pickle(r'BCL2_P10415.pkl')
df = df.loc[df['activity_type'] == 'Ki']
df = df.drop_duplicates(subset=['smiles'], keep='first')
df.loc[df['cmpd_id'] == 'CHEMBL3952489']['smiles'].item()

'CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(OCC5CN(C6CC6)CCO5)c([N+](=O)[O-])c4)c(Oc4cnc5[nH]ccc5c4)c3)CC2)=C(c2ccc(Cl)cc2)C1'

In [None]:
df['smiles'].to_csv('./activity_prediction/temp.smi', index=False, header=False)

In [174]:
# make a filter of columns
df_filtered = df.drop(columns=['cmpd_id', 'uniprot_id', 'activity_type'], inplace=False)
df_filtered_aciton_type = pd.get_dummies(df_filtered['action_type']).astype(float)
df_filtered = pd.concat([df_filtered, df_filtered_aciton_type], axis=1)
df_filtered = df_filtered.drop(columns=['action_type'], inplace=False)
df_filtered = df_filtered.dropna()

df_filtered_X = df_filtered.drop(columns=['pchembl_value'], inplace=False)
df_filtered_y = df_filtered['pchembl_value']
X_train, X_test, y_train, y_test = train_test_split(df_filtered_X, df_filtered_y, test_size=0.2, random_state=RANDOM_STATE)

X_fps_train = np.array([AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=2, nBits=2048).ToList() for x in X_train['smiles']])
X_fps_test = np.array([AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=2, nBits=2048).ToList() for x in X_test['smiles']])
X_train = X_train.drop(columns=['smiles'], inplace=False).values
X_test = X_test.drop(columns=['smiles'], inplace=False).values

### Scaling

In [4]:
scaler = StandardScaler()
X_descriptors_train = scaler.fit_transform(X_train)
X_descriptors_test = scaler.transform(X_test)

#### Merging

In [5]:
X_merged_train = np.concatenate((X_descriptors_train, X_fps_train), axis=1)
X_merged_test = np.concatenate((X_descriptors_test, X_fps_test), axis=1)

### Modeling

In [6]:
svr_model = SVR(kernel='poly', C=1e3, gamma=0.1)
svr_model.fit(X_fps_train, y_train)
preds = svr_model.predict(X_fps_test)
r2_score_val = r2_score(y_test, preds)
rmse_val = mean_squared_error(y_test, preds)
print(f'R2 score of the SVR model on the grouped data is {r2_score_val}')
print(f'RMSE error of the SVR model on the grouped data is {rmse_val}')

R2 score of the SVR model on the grouped data is 0.877727899838903
RMSE error of the SVR model on the grouped data is 0.4722860270295667


### Save the model

In [11]:
import joblib
joblib.dump(svr_model, "svr_model.pkl") 

['svr_model.pkl']

#### Load JANUS data

In [8]:
with open('tests/DATA/sample_random_smiles.txt', 'r') as f:
     mols = [line.rstrip() for line in f]

for mol_str in mols:
     mol = Chem.MolFromSmiles(mol_str)
     fp = np.array([AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048).ToList()])
     print(svr_model.predict(fp))

[5.84074358]
[5.84223862]
[5.82379004]
[5.83992948]
[5.84235675]
[5.84214651]
[5.83632173]
[5.84258505]
[5.83065047]
[5.84231323]
[5.84233746]
[5.82305397]
[5.84227631]
[5.84305599]
[5.83442631]
[5.84543696]
[5.84273944]
[5.83101274]
[5.83104748]
[5.84201203]
[5.84235674]
[5.84124256]
[5.84229137]
[5.81234071]
[5.84226334]
[5.84230214]
[5.84188213]
[5.86239801]
[5.84230773]
[5.84177171]
[5.7864907]
[5.84192861]
[5.84241796]
[5.84226909]
[5.86222114]
[5.84248147]
[5.84514642]
[5.84121852]
[5.84214672]
[5.84256852]
[5.84265127]
[5.84455514]
[5.83330048]
[5.83346537]
[5.84290921]
[5.81983761]
[5.84186652]
[5.84606606]
[5.84228381]
[5.83996708]
[5.84430805]
[5.84269361]
[5.86020422]
[5.84129615]
[5.84361987]
[5.84466225]
[5.84328787]
[5.84242899]
[5.83936587]
[5.83628119]
[5.84238759]
[5.80982431]
[5.85036843]
[5.84569564]
[5.8424759]
[5.84232594]
[5.84392664]
[5.84322514]
[5.78074515]
[5.84164054]
[5.84188898]
[5.83612786]
[5.84249479]
[5.83831371]
[5.84039716]
[5.84244394]
[5.84312018]
[