In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

import joblib

from sklearn.metrics import r2_score, mean_squared_error
import math

import numpy as np

In [2]:
def transform(target_val):
    transformed = []
    for element in target_val:
        transformed.append(-1.0 * np.log10(element/1000000000))
    return transformed

def inverse_transform(transformed_values):
    inversed = []
    for element in transformed_values:
        inversed.append(np.power(10,-element)*1000000000)
    return inversed

In [3]:
smiles_df = pd.read_excel('../Data/Kolchicyna_prepared_data.xlsx')

In [4]:
smiles_df.head()

Unnamed: 0.1,Unnamed: 0,Publication DOI,Number of compound in publication,SMILES,Activity [nM],A549,MCF-7,LoVo,LoVo/DX,BALB/3T3,A549_float,MCF-7_float,LoVo_float,LoVo/DX_float,BALB/3T3_float
0,0,https://doi.org/10.1016/j.bmcl.2021.128382,1,COc2c3C1=CC=C(SC)C(=O)C=C1[C@H](CCc3cc(OC)c2OC...,Activity [nM],108,103,65,549.0,102.0,10.8,10.3,6.5,54.9,10.2
1,1,https://doi.org/10.1016/j.bmcl.2021.128382,2,COc2c3C1=CC=C(SC)C(=O)C=C1[C@H](CCc3cc(OC)c2OC...,Activity [nM],116,120,85,31.1,14.3,11.6,12.0,8.5,31.1,14.3
2,2,https://doi.org/10.1016/j.bmcl.2021.128382,3,COc2c3C1=CC=C(SC)C(=O)C=C1[C@H](CCc3cc(OC)c2OC...,Activity [nM],109,122,88,179.0,117.0,10.9,12.2,8.8,17.9,11.7
3,3,https://doi.org/10.1016/j.bmcl.2021.128382,4,CC(C)CN[C@H]2CCc3cc(OC)c(OC)c(OC)c3C1=CC=C(SC)...,Activity [nM],105,113,85,102.0,110.0,10.5,11.3,8.5,10.2,11.0
4,4,https://doi.org/10.1016/j.bmcl.2021.128382,5,COc2c3C1=CC=C(SC)C(=O)C=C1[C@H](CCc3cc(OC)c2OC...,Activity [nM],895,927,528,778.0,994.0,89.5,92.7,52.8,77.8,99.4


In [5]:
molecular_descriptors = pd.read_excel('../Data/Kolchicyna_machine_learning.xlsx')

In [6]:
molecular_descriptors.head()

Unnamed: 0.1,Unnamed: 0,AATS0Z,AATS0are,AATS0d,AATS0dv,AATS0i,AATS0m,AATS0p,AATS0pe,AATS0s,...,A549_float,MCF-7_float,LoVo_float,LoVo/DX_float,BALB/3T3_float,A549_float_transformed,MCF-7_float_transformed,LoVo_float_transformed,LoVo/DX_float_transformed,BALB/3T3_float_transformed
0,0,25.090909,6.046518,3.109091,6.662626,160.775045,99.075564,1.555512,6.12668,3.435416,...,10.8,10.3,6.5,54.9,10.2,7.966576,7.987163,8.187087,7.260428,7.9914
1,1,24.448276,6.008422,3.051724,6.386973,161.021674,96.473315,1.538471,6.088791,3.330998,...,11.6,12.0,8.5,31.1,14.3,7.935542,7.920819,8.070581,7.50724,7.844664
2,2,23.868852,5.974074,3.0,6.138434,161.244045,94.127025,1.523105,6.05463,3.23685,...,10.9,12.2,8.8,17.9,11.7,7.962574,7.91364,8.055517,7.747147,7.931814
3,3,23.868852,5.974074,3.032787,6.17122,161.244045,94.127025,1.523105,6.05463,3.257798,...,10.5,11.3,8.5,10.2,11.0,7.978811,7.946922,8.070581,7.9914,7.958607
4,4,23.34375,5.942945,2.953125,5.913194,161.445569,92.0007,1.50918,6.02367,3.151529,...,89.5,92.7,52.8,77.8,99.4,7.048177,7.03292,7.277366,7.10902,7.002614


In [7]:
frames = [smiles_df['SMILES'], molecular_descriptors]
res = pd.concat(frames, axis=1)
res = res.drop(['Unnamed: 0'], axis=1)

In [8]:
res

Unnamed: 0,SMILES,AATS0Z,AATS0are,AATS0d,AATS0dv,AATS0i,AATS0m,AATS0p,AATS0pe,AATS0s,...,A549_float,MCF-7_float,LoVo_float,LoVo/DX_float,BALB/3T3_float,A549_float_transformed,MCF-7_float_transformed,LoVo_float_transformed,LoVo/DX_float_transformed,BALB/3T3_float_transformed
0,COc2c3C1=CC=C(SC)C(=O)C=C1[C@H](CCc3cc(OC)c2OC...,25.090909,6.046518,3.109091,6.662626,160.775045,99.075564,1.555512,6.126680,3.435416,...,10.8,10.3,6.5,54.9,10.2,7.966576,7.987163,8.187087,7.260428,7.991400
1,COc2c3C1=CC=C(SC)C(=O)C=C1[C@H](CCc3cc(OC)c2OC...,24.448276,6.008422,3.051724,6.386973,161.021674,96.473315,1.538471,6.088791,3.330998,...,11.6,12.0,8.5,31.1,14.3,7.935542,7.920819,8.070581,7.507240,7.844664
2,COc2c3C1=CC=C(SC)C(=O)C=C1[C@H](CCc3cc(OC)c2OC...,23.868852,5.974074,3.000000,6.138434,161.244045,94.127025,1.523105,6.054630,3.236850,...,10.9,12.2,8.8,17.9,11.7,7.962574,7.913640,8.055517,7.747147,7.931814
3,CC(C)CN[C@H]2CCc3cc(OC)c(OC)c(OC)c3C1=CC=C(SC)...,23.868852,5.974074,3.032787,6.171220,161.244045,94.127025,1.523105,6.054630,3.257798,...,10.5,11.3,8.5,10.2,11.0,7.978811,7.946922,8.070581,7.991400,7.958607
4,COc2c3C1=CC=C(SC)C(=O)C=C1[C@H](CCc3cc(OC)c2OC...,23.343750,5.942945,2.953125,5.913194,161.445569,92.000700,1.509180,6.023670,3.151529,...,89.5,92.7,52.8,77.8,99.4,7.048177,7.032920,7.277366,7.109020,7.002614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,FC(F)(F)c1ccc(cc1)NC(=S)N[C@H]3CCc4cc(OC)c(OC)...,28.925373,6.629527,3.492537,9.558872,165.991839,116.283088,1.583312,6.669057,6.199313,...,86.0,120.0,54.0,2700.0,67.0,7.065502,6.920819,7.267606,5.568636,7.173925
116,O[C@@H](CNC(=S)N[C@H]2CCc3cc(OC)c(OC)c(OC)c3C1...,24.307692,6.247564,3.241758,7.235653,163.907908,95.896512,1.427144,6.299627,3.659034,...,70.0,15.0,32.0,5300.0,440.0,7.154902,7.823909,7.494850,5.275724,6.356547
117,OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)CNC(=S)N[...,25.818182,6.393614,3.103896,7.512266,164.617043,101.985317,1.423114,6.435469,5.205806,...,930.0,1200.0,680.0,6600.0,1300.0,6.031517,5.920819,6.167491,5.180456,5.886057
118,CC(C)(C)OC(=O)\N=C(/NC(=O)OC(C)(C)C)N[C@H]2CCc...,21.952941,6.267407,3.105882,7.623529,165.008670,86.439890,1.354297,6.311679,4.432190,...,850.0,940.0,500.0,5900.0,690.0,6.070581,6.026872,6.301030,5.229148,6.161151


In [9]:
## load models
model_A549 = joblib.load('Random_forest/random_forest_model_17_estimators_A549.joblib') #Random state 15
model_BALB3_t3 = joblib.load('Random_forest/random_forest_model_19_estimators_BALB_3T3.joblib') #Random state 42
model_LoVo_DX = joblib.load('Random_forest/random_forest_model_14_estimators_LoVo_DX.joblib') #Random state 28
model_LoVo = joblib.load('Random_forest/random_forest_model_18_estimators_LoVo.joblib') #Random state 42
model_MCF_7 = joblib.load('Random_forest/random_forest_model_3_estimators_MCF-7.joblib') #Random state 15

# Construct a dataframe for each Random Seed and predict values, additionally calculate R score and RMSE...

## A549

In [10]:
list(model_A549.feature_names_in_)

['AMID_O', 'EState_VSA5', 'MDEO-12', 'SaasC', 'VSA_EState5']

In [11]:
res.columns[0:-9]

Index(['SMILES', 'AATS0Z', 'AATS0are', 'AATS0d', 'AATS0dv', 'AATS0i', 'AATS0m',
       'AATS0p', 'AATS0pe', 'AATS0s',
       ...
       'piPC10', 'piPC2', 'piPC3', 'piPC4', 'piPC5', 'piPC6', 'piPC7', 'piPC8',
       'piPC9', 'A549_float'],
      dtype='object', length=1213)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-9]], 
                                                        res['A549_float_transformed'], 
                                                        test_size=0.15, random_state=15)

X_train['predicted A549 transformed IC50'] = model_A549.predict(X_train[list(model_A549.feature_names_in_)])
X_train['observed A549 transformed IC50'] = y_train
X_train['predicted A549 IC50'] = inverse_transform(X_train['predicted A549 transformed IC50'])
X_test['predicted A549 transformed IC50'] = model_A549.predict(X_test[list(model_A549.feature_names_in_)])
X_test['observed A549 transformed IC50'] = y_test
X_test['predicted A549 IC50'] = inverse_transform(X_test['predicted A549 transformed IC50'])

print('Train R score: '+str(math.sqrt(r2_score(X_train['observed A549 transformed IC50'], X_train['predicted A549 transformed IC50']))))
print('Test R score: '+str(math.sqrt(r2_score(X_test['observed A549 transformed IC50'], X_test['predicted A549 transformed IC50']))))
print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed A549 transformed IC50'], X_train['predicted A549 transformed IC50']))))
print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed A549 transformed IC50'], X_test['predicted A549 transformed IC50']))))

Train R score: 0.960591935079034
Test R score: 0.8620618597285548
Train RMSE score: 0.2594924364766403
Test RMSE score: 0.5253556041537444


In [13]:
with pd.ExcelWriter("../Data/A549_random_state_15.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-9]], 
                                                        res['A549_float_transformed'], 
                                                        test_size=0.15, random_state=28)

X_train['predicted A549 transformed IC50'] = model_A549.predict(X_train[list(model_A549.feature_names_in_)])
X_train['observed A549 transformed IC50'] = y_train
X_train['predicted A549 IC50'] = inverse_transform(X_train['predicted A549 transformed IC50'])
X_test['predicted A549 transformed IC50'] = model_A549.predict(X_test[list(model_A549.feature_names_in_)])
X_test['observed A549 transformed IC50'] = y_test
X_test['predicted A549 IC50'] = inverse_transform(X_test['predicted A549 transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed A549 transformed IC50'], X_train['predicted A549 transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed A549 transformed IC50'], X_test['predicted A549 transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed A549 transformed IC50'], X_train['predicted A549 transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed A549 transformed IC50'], X_test['predicted A549 transformed IC50']))))

In [15]:
with pd.ExcelWriter("../Data/A549_random_state_28.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-9]], 
                                                        res['A549_float_transformed'], 
                                                        test_size=0.15, random_state=42)

X_train['predicted A549 transformed IC50'] = model_A549.predict(X_train[list(model_A549.feature_names_in_)])
X_train['observed A549 transformed IC50'] = y_train
X_train['predicted A549 IC50'] = inverse_transform(X_train['predicted A549 transformed IC50'])
X_test['predicted A549 transformed IC50'] = model_A549.predict(X_test[list(model_A549.feature_names_in_)])
X_test['observed A549 transformed IC50'] = y_test
X_test['predicted A549 IC50'] = inverse_transform(X_test['predicted A549 transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed A549 transformed IC50'], X_train['predicted A549 transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed A549 transformed IC50'], X_test['predicted A549 transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed A549 transformed IC50'], X_train['predicted A549 transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed A549 transformed IC50'], X_test['predicted A549 transformed IC50']))))

In [17]:
with pd.ExcelWriter("../Data/A549_random_state_42.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

## BALB 3T3

In [18]:
list(model_BALB3_t3.feature_names_in_)

['AMID_O', 'EState_VSA5', 'GATS2c', 'MDEO-12', 'NdssC', 'VSA_EState5']

In [19]:
res.columns[0:-5]

Index(['SMILES', 'AATS0Z', 'AATS0are', 'AATS0d', 'AATS0dv', 'AATS0i', 'AATS0m',
       'AATS0p', 'AATS0pe', 'AATS0s',
       ...
       'piPC5', 'piPC6', 'piPC7', 'piPC8', 'piPC9', 'A549_float',
       'MCF-7_float', 'LoVo_float', 'LoVo/DX_float', 'BALB/3T3_float'],
      dtype='object', length=1217)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-5]], 
                                                        res['BALB/3T3_float_transformed'], 
                                                        test_size=0.15, random_state=15)

X_train['predicted BALB 3T3 transformed IC50'] = model_BALB3_t3.predict(X_train[list(model_BALB3_t3.feature_names_in_)])
X_train['observed BALB 3T3 transformed IC50'] = y_train
X_train['predicted BALB 3T3 IC50'] = inverse_transform(X_train['predicted BALB 3T3 transformed IC50'])
X_test['predicted BALB 3T3 transformed IC50'] = model_BALB3_t3.predict(X_test[list(model_BALB3_t3.feature_names_in_)])
X_test['observed BALB 3T3 transformed IC50'] = y_test
X_test['predicted BALB 3T3 IC50'] = inverse_transform(X_test['predicted BALB 3T3 transformed IC50'])

print('Train R score: '+str(math.sqrt(r2_score(X_train['observed BALB 3T3 transformed IC50'], X_train['predicted BALB 3T3 transformed IC50']))))
print('Test R score: '+str(math.sqrt(r2_score(X_test['observed BALB 3T3 transformed IC50'], X_test['predicted BALB 3T3 transformed IC50']))))
print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed BALB 3T3 transformed IC50'], X_train['predicted BALB 3T3 transformed IC50']))))
print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed BALB 3T3 transformed IC50'], X_test['predicted BALB 3T3 transformed IC50']))))

Train R score: 0.9656538230216772
Test R score: 0.8642992780332693
Train RMSE score: 0.22547718186980922
Test RMSE score: 0.46260702873007903


In [21]:
with pd.ExcelWriter("../Data/BALB_3T3_random_state_15.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-5]], 
                                                        res['BALB/3T3_float_transformed'], 
                                                        test_size=0.15, random_state=28)

X_train['predicted BALB 3T3 transformed IC50'] = model_BALB3_t3.predict(X_train[list(model_BALB3_t3.feature_names_in_)])
X_train['observed BALB 3T3 transformed IC50'] = y_train
X_train['predicted BALB 3T3 IC50'] = inverse_transform(X_train['predicted BALB 3T3 transformed IC50'])
X_test['predicted BALB 3T3 transformed IC50'] = model_BALB3_t3.predict(X_test[list(model_BALB3_t3.feature_names_in_)])
X_test['observed BALB 3T3 transformed IC50'] = y_test
X_test['predicted BALB 3T3 IC50'] = inverse_transform(X_test['predicted BALB 3T3 transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed BALB 3T3 transformed IC50'], X_train['predicted BALB 3T3 transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed BALB 3T3 transformed IC50'], X_test['predicted BALB 3T3 transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed BALB 3T3 transformed IC50'], X_train['predicted BALB 3T3 transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed BALB 3T3 transformed IC50'], X_test['predicted BALB 3T3 transformed IC50']))))

In [23]:
with pd.ExcelWriter("../Data/BALB_3T3_random_state_28.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-5]], 
                                                        res['BALB/3T3_float_transformed'], 
                                                        test_size=0.15, random_state=42)

X_train['predicted BALB 3T3 transformed IC50'] = model_BALB3_t3.predict(X_train[list(model_BALB3_t3.feature_names_in_)])
X_train['observed BALB 3T3 transformed IC50'] = y_train
X_train['predicted BALB 3T3 IC50'] = inverse_transform(X_train['predicted BALB 3T3 transformed IC50'])
X_test['predicted BALB 3T3 transformed IC50'] = model_BALB3_t3.predict(X_test[list(model_BALB3_t3.feature_names_in_)])
X_test['observed BALB 3T3 transformed IC50'] = y_test
X_test['predicted BALB 3T3 IC50'] = inverse_transform(X_test['predicted BALB 3T3 transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed BALB 3T3 transformed IC50'], X_train['predicted BALB 3T3 transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed BALB 3T3 transformed IC50'], X_test['predicted BALB 3T3 transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed BALB 3T3 transformed IC50'], X_train['predicted BALB 3T3 transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed BALB 3T3 transformed IC50'], X_test['predicted BALB 3T3 transformed IC50']))))

In [25]:
with pd.ExcelWriter("../Data/BALB_3T3_random_state_42.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

## LoVo/dx

In [26]:
list(model_LoVo_DX.feature_names_in_)

['GATS2c', 'MATS2c', 'NdssC', 'RNCG', 'TopoPSA(NO)']

In [27]:
res.columns[0:-6]

Index(['SMILES', 'AATS0Z', 'AATS0are', 'AATS0d', 'AATS0dv', 'AATS0i', 'AATS0m',
       'AATS0p', 'AATS0pe', 'AATS0s',
       ...
       'piPC4', 'piPC5', 'piPC6', 'piPC7', 'piPC8', 'piPC9', 'A549_float',
       'MCF-7_float', 'LoVo_float', 'LoVo/DX_float'],
      dtype='object', length=1216)

In [28]:
print(res.shape)
res_ = res.dropna()
res_.shape

(120, 1222)


(106, 1222)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(res_[res_.columns[0:-6]], 
                                                        res_['LoVo/DX_float_transformed'], 
                                                        test_size=0.15, random_state=15)

X_train['predicted LoVo/DX transformed IC50'] = model_LoVo_DX.predict(X_train[list(model_LoVo_DX.feature_names_in_)])
X_train['observed LoVo/DX transformed IC50'] = y_train
X_train['predicted LoVo/DX IC50'] = inverse_transform(X_train['predicted LoVo/DX transformed IC50'])
X_test['predicted LoVo/DX transformed IC50'] = model_LoVo_DX.predict(X_test[list(model_LoVo_DX.feature_names_in_)])
X_test['observed LoVo/DX transformed IC50'] = y_test
X_test['predicted LoVo/DX IC50'] = inverse_transform(X_test['predicted LoVo/DX transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed LoVo/DX transformed IC50'], X_train['predicted LoVo/DX transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed LoVo/DX transformed IC50'], X_test['predicted LoVo/DX transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed LoVo/DX transformed IC50'], X_train['predicted LoVo/DX transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed LoVo/DX transformed IC50'], X_test['predicted LoVo/DX transformed IC50']))))

In [30]:
with pd.ExcelWriter("../Data/LoVo_DX_random_state_15.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(res_[res_.columns[0:-6]], 
                                                        res_['LoVo/DX_float_transformed'], 
                                                        test_size=0.15, random_state=28)

X_train['predicted LoVo/DX transformed IC50'] = model_LoVo_DX.predict(X_train[list(model_LoVo_DX.feature_names_in_)])
X_train['observed LoVo/DX transformed IC50'] = y_train
X_train['predicted LoVo/DX IC50'] = inverse_transform(X_train['predicted LoVo/DX transformed IC50'])
X_test['predicted LoVo/DX transformed IC50'] = model_LoVo_DX.predict(X_test[list(model_LoVo_DX.feature_names_in_)])
X_test['observed LoVo/DX transformed IC50'] = y_test
X_test['predicted LoVo/DX IC50'] = inverse_transform(X_test['predicted LoVo/DX transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed LoVo/DX transformed IC50'], X_train['predicted LoVo/DX transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed LoVo/DX transformed IC50'], X_test['predicted LoVo/DX transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed LoVo/DX transformed IC50'], X_train['predicted LoVo/DX transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed LoVo/DX transformed IC50'], X_test['predicted LoVo/DX transformed IC50']))))

In [32]:
with pd.ExcelWriter("../Data/LoVo_DX_random_state_28.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(res_[res_.columns[0:-6]], 
                                                        res_['LoVo/DX_float_transformed'], 
                                                        test_size=0.15, random_state=42)

X_train['predicted LoVo/DX transformed IC50'] = model_LoVo_DX.predict(X_train[list(model_LoVo_DX.feature_names_in_)])
X_train['observed LoVo/DX transformed IC50'] = y_train
X_train['predicted LoVo/DX IC50'] = inverse_transform(X_train['predicted LoVo/DX transformed IC50'])
X_test['predicted LoVo/DX transformed IC50'] = model_LoVo_DX.predict(X_test[list(model_LoVo_DX.feature_names_in_)])
X_test['observed LoVo/DX transformed IC50'] = y_test
X_test['predicted LoVo/DX IC50'] = inverse_transform(X_test['predicted LoVo/DX transformed IC50'])

print('Train R score: '+str(math.sqrt(r2_score(X_train['observed LoVo/DX transformed IC50'], X_train['predicted LoVo/DX transformed IC50']))))
print('Test R score: '+str(math.sqrt(r2_score(X_test['observed LoVo/DX transformed IC50'], X_test['predicted LoVo/DX transformed IC50']))))
print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed LoVo/DX transformed IC50'], X_train['predicted LoVo/DX transformed IC50']))))
print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed LoVo/DX transformed IC50'], X_test['predicted LoVo/DX transformed IC50']))))

Train R score: 0.9610074984048745
Test R score: 0.790562123545645
Train RMSE score: 0.26590821262128245
Test RMSE score: 0.4420939409154765


In [34]:
with pd.ExcelWriter("../Data/LoVo_DX_random_state_42.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

## LoVo

In [35]:
list(model_LoVo.feature_names_in_)

['EState_VSA5', 'MDEO-12']

In [36]:
res.columns[0:-7]

Index(['SMILES', 'AATS0Z', 'AATS0are', 'AATS0d', 'AATS0dv', 'AATS0i', 'AATS0m',
       'AATS0p', 'AATS0pe', 'AATS0s',
       ...
       'piPC3', 'piPC4', 'piPC5', 'piPC6', 'piPC7', 'piPC8', 'piPC9',
       'A549_float', 'MCF-7_float', 'LoVo_float'],
      dtype='object', length=1215)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-7]], 
                                                        res['LoVo_float_transformed'], 
                                                        test_size=0.15, random_state=15)

X_train['predicted LoVo transformed IC50'] = model_LoVo.predict(X_train[list(model_LoVo.feature_names_in_)])
X_train['observed LoVo transformed IC50'] = y_train
X_train['predicted LoVo IC50'] = inverse_transform(X_train['predicted LoVo transformed IC50'])
X_test['predicted LoVo transformed IC50'] = model_LoVo.predict(X_test[list(model_LoVo.feature_names_in_)])
X_test['observed LoVo transformed IC50'] = y_test
X_test['predicted LoVo IC50'] = inverse_transform(X_test['predicted LoVo transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed LoVo transformed IC50'], X_train['predicted LoVo transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed LoVo transformed IC50'], X_test['predicted LoVo transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed LoVo transformed IC50'], X_train['predicted LoVo transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed LoVo transformed IC50'], X_test['predicted LoVo transformed IC50']))))

In [38]:
with pd.ExcelWriter("../Data/LoVo_random_state_15.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-7]], 
                                                        res['LoVo_float_transformed'], 
                                                        test_size=0.15, random_state=28)

X_train['predicted LoVo transformed IC50'] = model_LoVo.predict(X_train[list(model_LoVo.feature_names_in_)])
X_train['observed LoVo transformed IC50'] = y_train
X_train['predicted LoVo IC50'] = inverse_transform(X_train['predicted LoVo transformed IC50'])
X_test['predicted LoVo transformed IC50'] = model_LoVo.predict(X_test[list(model_LoVo.feature_names_in_)])
X_test['observed LoVo transformed IC50'] = y_test
X_test['predicted LoVo IC50'] = inverse_transform(X_test['predicted LoVo transformed IC50'])

print('Train R score: '+str(math.sqrt(r2_score(X_train['observed LoVo transformed IC50'], X_train['predicted LoVo transformed IC50']))))
print('Test R score: '+str(math.sqrt(r2_score(X_test['observed LoVo transformed IC50'], X_test['predicted LoVo transformed IC50']))))
print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed LoVo transformed IC50'], X_train['predicted LoVo transformed IC50']))))
print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed LoVo transformed IC50'], X_test['predicted LoVo transformed IC50']))))

Train R score: 0.8811267812945673
Test R score: 0.6572630628008767
Train RMSE score: 0.4438384819944741
Test RMSE score: 0.5047746604903675


In [40]:
with pd.ExcelWriter("../Data/LoVo_random_state_28.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-7]], 
                                                        res['LoVo_float_transformed'], 
                                                        test_size=0.15, random_state=42)

X_train['predicted LoVo transformed IC50'] = model_LoVo.predict(X_train[list(model_LoVo.feature_names_in_)])
X_train['observed LoVo transformed IC50'] = y_train
X_train['predicted LoVo IC50'] = inverse_transform(X_train['predicted LoVo transformed IC50'])
X_test['predicted LoVo transformed IC50'] = model_LoVo.predict(X_test[list(model_LoVo.feature_names_in_)])
X_test['observed LoVo transformed IC50'] = y_test
X_test['predicted LoVo IC50'] = inverse_transform(X_test['predicted LoVo transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed LoVo transformed IC50'], X_train['predicted LoVo transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed LoVo transformed IC50'], X_test['predicted LoVo transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed LoVo transformed IC50'], X_train['predicted LoVo transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed LoVo transformed IC50'], X_test['predicted LoVo transformed IC50']))))

In [42]:
with pd.ExcelWriter("../Data/LoVo_random_state_42.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

## MCF-7

In [43]:
list(model_MCF_7.feature_names_in_)

['AMID_O', 'EState_VSA5', 'EState_VSA6', 'MDEO-12']

In [44]:
res.columns[0:-8]

Index(['SMILES', 'AATS0Z', 'AATS0are', 'AATS0d', 'AATS0dv', 'AATS0i', 'AATS0m',
       'AATS0p', 'AATS0pe', 'AATS0s',
       ...
       'piPC2', 'piPC3', 'piPC4', 'piPC5', 'piPC6', 'piPC7', 'piPC8', 'piPC9',
       'A549_float', 'MCF-7_float'],
      dtype='object', length=1214)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-8]], 
                                                        res['LoVo_float_transformed'], 
                                                        test_size=0.15, random_state=15)

X_train['predicted MCF-7 transformed IC50'] = model_MCF_7.predict(X_train[list(model_MCF_7.feature_names_in_)])
X_train['observed MCF-7 transformed IC50'] = y_train
X_train['predicted MCF-7 IC50'] = inverse_transform(X_train['predicted MCF-7 transformed IC50'])
X_test['predicted MCF-7 transformed IC50'] = model_MCF_7.predict(X_test[list(model_MCF_7.feature_names_in_)])
X_test['observed MCF-7 transformed IC50'] = y_test
X_test['predicted MCF-7 IC50'] = inverse_transform(X_test['predicted MCF-7 transformed IC50'])

print('Train R score: '+str(math.sqrt(r2_score(X_train['observed MCF-7 transformed IC50'], X_train['predicted MCF-7 transformed IC50']))))
print('Test R score: '+str(math.sqrt(r2_score(X_test['observed MCF-7 transformed IC50'], X_test['predicted MCF-7 transformed IC50']))))
print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed MCF-7 transformed IC50'], X_train['predicted MCF-7 transformed IC50']))))
print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed MCF-7 transformed IC50'], X_test['predicted MCF-7 transformed IC50']))))

Train R score: 0.85288466325432
Test R score: 0.677226040545383
Train RMSE score: 0.4524890646893892
Test RMSE score: 0.8054094630462401


In [46]:
with pd.ExcelWriter("../Data/MCF-7_random_state_15.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-8]], 
                                                        res['LoVo_float_transformed'], 
                                                        test_size=0.15, random_state=28)

X_train['predicted MCF-7 transformed IC50'] = model_MCF_7.predict(X_train[list(model_MCF_7.feature_names_in_)])
X_train['observed MCF-7 transformed IC50'] = y_train
X_train['predicted MCF-7 IC50'] = inverse_transform(X_train['predicted MCF-7 transformed IC50'])
X_test['predicted MCF-7 transformed IC50'] = model_MCF_7.predict(X_test[list(model_MCF_7.feature_names_in_)])
X_test['observed MCF-7 transformed IC50'] = y_test
X_test['predicted MCF-7 IC50'] = inverse_transform(X_test['predicted MCF-7 transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed MCF-7 transformed IC50'], X_train['predicted MCF-7 transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed MCF-7 transformed IC50'], X_test['predicted MCF-7 transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed MCF-7 transformed IC50'], X_train['predicted MCF-7 transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed MCF-7 transformed IC50'], X_test['predicted MCF-7 transformed IC50']))))

In [48]:
with pd.ExcelWriter("../Data/MCF-7_random_state_28.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(res[res.columns[0:-8]], 
                                                        res['LoVo_float_transformed'], 
                                                        test_size=0.15, random_state=42)

X_train['predicted MCF-7 transformed IC50'] = model_MCF_7.predict(X_train[list(model_MCF_7.feature_names_in_)])
X_train['observed MCF-7 transformed IC50'] = y_train
X_train['predicted MCF-7 IC50'] = inverse_transform(X_train['predicted MCF-7 transformed IC50'])
X_test['predicted MCF-7 transformed IC50'] = model_MCF_7.predict(X_test[list(model_MCF_7.feature_names_in_)])
X_test['observed MCF-7 transformed IC50'] = y_test
X_test['predicted MCF-7 IC50'] = inverse_transform(X_test['predicted MCF-7 transformed IC50'])

#print('Train R score: '+str(math.sqrt(r2_score(X_train['observed MCF-7 transformed IC50'], X_train['predicted MCF-7 transformed IC50']))))
#print('Test R score: '+str(math.sqrt(r2_score(X_test['observed MCF-7 transformed IC50'], X_test['predicted MCF-7 transformed IC50']))))
#print('Train RMSE score: '+str(math.sqrt(mean_squared_error(X_train['observed MCF-7 transformed IC50'], X_train['predicted MCF-7 transformed IC50']))))
#print('Test RMSE score: '+str(math.sqrt(mean_squared_error(X_test['observed MCF-7 transformed IC50'], X_test['predicted MCF-7 transformed IC50']))))

In [50]:
with pd.ExcelWriter("../Data/MCF-7_random_state_42.xlsx") as writer:
    X_train.to_excel(writer, sheet_name="Train", index=True)
    X_test.to_excel(writer, sheet_name="Test", index=True)