In [7]:
import numpy as np
import pandas as pd
from rdkit import Chem
from skfp.fingerprints import PubChemFingerprint
from sklearn.ensemble import RandomForestClassifier
import shap
import pickle
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt


In [3]:
# Load the SMILES strings (first column of smiles.csv) and the label array
# from hlgt_binary.csv, then keep only the rows whose SMILES RDKit can parse.
# NOTE(review): smiles.csv is read with header=None while the label CSV uses
# its first row as a header -- confirm that row i of both files refers to the
# same compound.
smiles_df = pd.read_csv('smiles.csv', header=None)
smiles_list = smiles_df.iloc[:, 0].tolist()

# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs elsewhere.
Y_hlgt = pd.read_csv('/home/maciej/studia/praktyki_ibb/hlgt_binary.csv').to_numpy()

# (original row position, SMILES) for every entry that RDKit parses
# successfully; MolFromSmiles returns None for the ones it cannot parse.
parsed = [
    (pos, smiles)
    for pos, smiles in enumerate(smiles_list)
    if Chem.MolFromSmiles(smiles) is not None
]
idx_clean = [pos for pos, _ in parsed]
smiles_list_clean = [smiles for _, smiles in parsed]

# Label rows that correspond to the surviving molecules.
Y_hlgt_clean = Y_hlgt[idx_clean, :]


In [4]:
# Compute count-based PubChem fingerprints for the parseable SMILES.
fp = PubChemFingerprint(count=True, sparse=False, n_jobs=2)
X = fp.fit_transform(smiles_list_clean)

# Densify defensively in case a sparse matrix comes back anyway.
X = X.toarray() if hasattr(X, "toarray") else X


In [5]:
# Load the pickled PubChem / RandomForest HLT entry and unpack the fitted
# model together with the train/test feature matrices stored next to it.
# NOTE: pickle.load executes arbitrary code -- only open trusted files.
model_path = 'models/PubChem_RandomForest_HLT.pkl'
with open(model_path, 'rb') as f:
    model_entry = pickle.load(f)

model, X_train, X_test = (
    model_entry[key] for key in ('model', 'X_train_umap', 'X_test_umap')
)


In [14]:
# Compute SHAP values for the PubChem / RandomForest HLT model and cache them
# on disk so the plotting cells can reload them without recomputing.
rng = np.random.default_rng(42)

# Background sample for the explainer: up to 100 training rows drawn without
# replacement. The size is capped because rng.choice(..., replace=False)
# raises when asked for more rows than the training set contains.
n_background = min(100, X_train.shape[0])
background = X_train[rng.choice(X_train.shape[0], n_background, replace=False)]
explainer = shap.TreeExplainer(model, data=background)

# Explain the first 50 test rows.
X_shap = X_test[:50]
# `desc` is not a parameter of TreeExplainer.shap_values, so it is not passed.
shap_values = explainer.shap_values(X_shap)

shap_results = {
    'fingerprint': 'PubChem',
    'model': 'RandomForest',
    # The model file loaded for this cell and the output file below are both
    # HLT, so the metadata is recorded as HLT as well.
    'level': 'HLT',
    'shap_values': shap_values,
    'X_shap': X_shap
}
os.makedirs('shaps', exist_ok=True)
with open('shaps/shap_rf_pubchem_hlt.pkl', 'wb') as f:
    pickle.dump(shap_results, f)

print("SHAP ready.")




SHAP ready.


In [None]:
# Reload the cached PubChem / HLT SHAP results and draw a summary plot for a
# single label. `pickle` and `shap` are imported in the notebook's first cell.
# The file name matches what the SHAP computation cell writes
# ('shap_rf_pubchem_hlt.pkl'); the notebook never writes an '..._hlgt.pkl'
# file for PubChem.
with open('shaps/shap_rf_pubchem_hlt.pkl', 'rb') as f:
    shap_results = pickle.load(f)

X_shap = shap_results['X_shap']
shap_values = shap_results['shap_values']

# Sanity check of the array shapes before slicing.
print("X_shap shape:", X_shap.shape)
print("shap_values shape:", shap_values.shape)

# Index into the last axis of shap_values to pick one label.
# NOTE(review): assumes shap_values is a 3-D array (samples, features, labels)
# -- confirm for the installed shap version.
label_idx = 0
shap_for_label = shap_values[:, :, label_idx]

shap.summary_plot(
    shap_for_label,
    X_shap,
    max_display=10
)


In [None]:
import matplotlib.pyplot as plt

# Redraw the summary plot for the selected label from the SHAP values that
# are already in memory, then show the figure explicitly.
label_idx = 0
shap_for_label = shap_values[:, :, label_idx]

shap.summary_plot(
    shap_for_label,
    X_shap,
    max_display=10,
)
plt.show()


In [None]:
import pickle
import shap
import matplotlib.pyplot as plt
%matplotlib inline
with open('shaps/shap_rf_pubchem_hlt.pkl', 'rb') as f:
    shap_results = pickle.load(f)

X_shap = shap_results['X_shap']
shap_values = shap_results['shap_values']

label_idx = 0  
shap_for_label = shap_values[:, :, label_idx]

shap.summary_plot(shap_for_label, X_shap, plot_type='bar',max_display=10)
plt.show()  


In [None]:
# Draw the bar plot (up to 100 features) without showing it yet, so the
# current figure can be written to disk first.
shap.summary_plot(shap_for_label, X_shap, plot_type='bar', max_display=100, show=False)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('shap_figs', exist_ok=True)
plt.savefig('shap_figs/shap_rf_pubchem_hlt.png', dpi=200, bbox_inches='tight')
plt.show()


In [18]:
# Load the pickled MACCS / RandomForest HLT entry and unpack the fitted model
# together with the train/test feature matrices stored next to it.
# NOTE: pickle.load executes arbitrary code -- only open trusted files.
model_path = 'models/MACCS_RandomForest_HLT.pkl'
with open(model_path, 'rb') as f:
    model_entry = pickle.load(f)

model, X_train, X_test = (
    model_entry[key] for key in ('model', 'X_train_umap', 'X_test_umap')
)


In [19]:
# Compute SHAP values for the MACCS / RandomForest HLT model and cache them
# on disk so the plotting cell can reload them without recomputing.
rng = np.random.default_rng(42)

# Background sample for the explainer: up to 100 training rows drawn without
# replacement. The size is capped because rng.choice(..., replace=False)
# raises when asked for more rows than the training set contains.
n_background = min(100, X_train.shape[0])
background = X_train[rng.choice(X_train.shape[0], n_background, replace=False)]
explainer = shap.TreeExplainer(model, data=background)

# Explain the first 50 test rows.
X_shap = X_test[:50]
shap_values = explainer.shap_values(X_shap)

shap_results = {
    'fingerprint': 'MACCS',
    'model': 'RandomForest',
    'level': 'HLT',
    'shap_values': shap_values,
    'X_shap': X_shap
}
os.makedirs('shaps', exist_ok=True)
with open('shaps/shap_rf_maccs_hlt.pkl', 'wb') as f:
    pickle.dump(shap_results, f)

print("SHAP ready.")




SHAP ready.


In [None]:
# Reload the cached MACCS / HLT SHAP results, draw a bar plot for one label
# and save it as a PNG. `pickle`, `shap`, `os` and `plt` are imported in the
# notebook's first cell.
with open('shaps/shap_rf_maccs_hlt.pkl', 'rb') as f:
    shap_results = pickle.load(f)

X_shap = shap_results['X_shap']
shap_values = shap_results['shap_values']

# Sanity check of the array shapes before slicing.
print("X_shap shape:", X_shap.shape)
print("shap_values shape:", shap_values.shape)

# Index into the last axis of shap_values to pick one label.
# NOTE(review): assumes shap_values is a 3-D array (samples, features, labels)
# -- confirm for the installed shap version.
label_idx = 0
shap_for_label = shap_values[:, :, label_idx]

# show=False keeps the figure open so it can be saved before being displayed.
shap.summary_plot(
    shap_for_label,
    X_shap,
    plot_type='bar',
    max_display=100,
    show=False
)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('shap_figs', exist_ok=True)
plt.savefig('shap_figs/shap_rf_maccs_hlt.png', bbox_inches='tight', dpi=200)
plt.show()



In [None]:
# End-to-end SHAP run for the Avalon / RandomForest HLT model:
# load model -> compute SHAP values -> cache them -> reload -> plot -> save PNG.
# `pickle`, `shap`, `os`, `np` and `plt` are imported in the notebook's first cell.
# NOTE: pickle.load executes arbitrary code -- only open trusted files.
model_path = 'models/Avalon_RandomForest_HLT.pkl'
with open(model_path, 'rb') as f:
    model_entry = pickle.load(f)
model = model_entry['model']
X_train = model_entry['X_train_umap']
X_test = model_entry['X_test_umap']

rng = np.random.default_rng(42)

# Background sample for the explainer: up to 100 training rows drawn without
# replacement. The size is capped because rng.choice(..., replace=False)
# raises when asked for more rows than the training set contains.
n_background = min(100, X_train.shape[0])
background = X_train[rng.choice(X_train.shape[0], n_background, replace=False)]
explainer = shap.TreeExplainer(model, data=background)

# Explain the first 50 test rows.
X_shap = X_test[:50]
# `desc` is not a parameter of TreeExplainer.shap_values, so it is not passed.
shap_values = explainer.shap_values(X_shap)

shap_results = {
    'fingerprint': 'Avalon',
    'model': 'RandomForest',
    'level': 'HLT',
    'shap_values': shap_values,
    'X_shap': X_shap
}
os.makedirs('shaps', exist_ok=True)
with open('shaps/shap_rf_avalon_hlt.pkl', 'wb') as f:
    pickle.dump(shap_results, f)

print("SHAP ready.")

# Reload from disk so the plot is produced from exactly what was cached.
with open('shaps/shap_rf_avalon_hlt.pkl', 'rb') as f:
    shap_results = pickle.load(f)

X_shap = shap_results['X_shap']
shap_values = shap_results['shap_values']

# Sanity check of the array shapes before slicing.
print("X_shap shape:", X_shap.shape)
print("shap_values shape:", shap_values.shape)

# Index into the last axis of shap_values to pick one label.
# NOTE(review): assumes shap_values is a 3-D array (samples, features, labels)
# -- confirm for the installed shap version.
label_idx = 0
shap_for_label = shap_values[:, :, label_idx]

# show=False keeps the figure open so it can be saved before being displayed.
shap.summary_plot(
    shap_for_label,
    X_shap,
    plot_type='bar',
    max_display=100,
    show=False
)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('shap_figs', exist_ok=True)
plt.savefig('shap_figs/shap_rf_avalon_hlt.png', bbox_inches='tight', dpi=200)
plt.show()



In [None]:
# End-to-end SHAP run for the Avalon / RandomForest HLGT model:
# load model -> compute SHAP values -> cache them -> reload -> plot -> save PNG.
# `pickle`, `shap`, `os`, `np` and `plt` are imported in the notebook's first cell.
# NOTE: pickle.load executes arbitrary code -- only open trusted files.
model_path = 'models/Avalon_RandomForest_HLGT.pkl'
with open(model_path, 'rb') as f:
    model_entry = pickle.load(f)
model = model_entry['model']
X_train = model_entry['X_train_umap']
X_test = model_entry['X_test_umap']

rng = np.random.default_rng(42)

# Background sample for the explainer: up to 100 training rows drawn without
# replacement. The size is capped because rng.choice(..., replace=False)
# raises when asked for more rows than the training set contains.
n_background = min(100, X_train.shape[0])
background = X_train[rng.choice(X_train.shape[0], n_background, replace=False)]
explainer = shap.TreeExplainer(model, data=background)

# Explain the first 50 test rows.
X_shap = X_test[:50]
shap_values = explainer.shap_values(X_shap)

shap_results = {
    'fingerprint': 'Avalon',
    'model': 'RandomForest',
    'level': 'HLGT',
    'shap_values': shap_values,
    'X_shap': X_shap
}
os.makedirs('shaps', exist_ok=True)
with open('shaps/shap_rf_avalon_hlgt.pkl', 'wb') as f:
    pickle.dump(shap_results, f)

print("SHAP ready.")

# Reload from disk so the plot is produced from exactly what was cached.
with open('shaps/shap_rf_avalon_hlgt.pkl', 'rb') as f:
    shap_results = pickle.load(f)

X_shap = shap_results['X_shap']
shap_values = shap_results['shap_values']

# Sanity check of the array shapes before slicing.
print("X_shap shape:", X_shap.shape)
print("shap_values shape:", shap_values.shape)

# Index into the last axis of shap_values to pick one label.
# NOTE(review): assumes shap_values is a 3-D array (samples, features, labels)
# -- confirm for the installed shap version.
label_idx = 0
shap_for_label = shap_values[:, :, label_idx]

# show=False keeps the figure open so it can be saved before being displayed.
shap.summary_plot(
    shap_for_label,
    X_shap,
    plot_type='bar',
    max_display=100,
    show=False
)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('shap_figs', exist_ok=True)
plt.savefig('shap_figs/shap_rf_avalon_hlgt.png', bbox_inches='tight', dpi=200)
plt.show()



In [None]:
# End-to-end SHAP run for the AtomPair / RandomForest HLGT model:
# load model -> compute SHAP values -> cache them -> reload -> plot -> save PNG.
# `pickle`, `shap`, `os`, `np` and `plt` are imported in the notebook's first cell.
# NOTE: pickle.load executes arbitrary code -- only open trusted files.
model_path = 'models/AtomPair_RandomForest_HLGT.pkl'
with open(model_path, 'rb') as f:
    model_entry = pickle.load(f)
model = model_entry['model']
X_train = model_entry['X_train_umap']
X_test = model_entry['X_test_umap']

rng = np.random.default_rng(42)

# Background sample for the explainer: up to 100 training rows drawn without
# replacement. The size is capped because rng.choice(..., replace=False)
# raises when asked for more rows than the training set contains.
n_background = min(100, X_train.shape[0])
background = X_train[rng.choice(X_train.shape[0], n_background, replace=False)]
explainer = shap.TreeExplainer(model, data=background)

# Explain the first 50 test rows.
X_shap = X_test[:50]
shap_values = explainer.shap_values(X_shap)

shap_results = {
    'fingerprint': 'AtomPair',
    'model': 'RandomForest',
    'level': 'HLGT',
    'shap_values': shap_values,
    'X_shap': X_shap
}
os.makedirs('shaps', exist_ok=True)
with open('shaps/shap_rf_atompair_hlgt.pkl', 'wb') as f:
    pickle.dump(shap_results, f)

print("SHAP ready.")

# Reload from disk so the plot is produced from exactly what was cached.
with open('shaps/shap_rf_atompair_hlgt.pkl', 'rb') as f:
    shap_results = pickle.load(f)

X_shap = shap_results['X_shap']
shap_values = shap_results['shap_values']

# Sanity check of the array shapes before slicing.
print("X_shap shape:", X_shap.shape)
print("shap_values shape:", shap_values.shape)

# Index into the last axis of shap_values to pick one label.
# NOTE(review): assumes shap_values is a 3-D array (samples, features, labels)
# -- confirm for the installed shap version.
label_idx = 0
shap_for_label = shap_values[:, :, label_idx]

# show=False keeps the figure open so it can be saved before being displayed.
shap.summary_plot(
    shap_for_label,
    X_shap,
    plot_type='bar',
    max_display=100,
    show=False
)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('shap_figs', exist_ok=True)
plt.savefig('shap_figs/shap_rf_atompair_hlgt.png', bbox_inches='tight', dpi=200)
plt.show()

