In [1]:
# %% [markdown]
# # 🎯 5. Selectivity Analysis (AI-Based)
# **Goal:** Check if our Top 10 candidates kill the Cancer Target (CA IX) 
# but spare the Healthy Target (CA II).

import pandas as pd
import numpy as np
import joblib
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# 1. Load Resources
# Tabular inputs first: the shortlisted candidates and the cleaned training set.
top_candidates = pd.read_csv('../artifacts/top_candidates_for_docking.csv')
df_train = pd.read_csv('../artifacts/data/clean_training_data.csv')

# Then the persisted artifacts. NOTE(review): joblib.load unpickles the file,
# so only load artifacts produced by a trusted pipeline.
preprocessor = joblib.load('../artifacts/preprocessor.pkl')
model = joblib.load('../artifacts/model.pkl')

In [3]:
# 2. Get Protein Features for the "Off-Target" (CA II)
# CA II is found in red blood cells; blocking it causes side effects.
protein_feature_cols = [
    'Protein_Weight', 'Protein_Aromaticity', 'Protein_Isoelectric', 'Protein_Hydrophobicity'
]

# na=False keeps rows with a missing target name from breaking the boolean mask;
# the negative lookahead stops "CA II" from also matching "CA III".
is_ca2 = df_train['target_name'].str.contains(r"CA II(?!I)", na=False)

if is_ca2.any():
    # All rows of one protein share the same descriptors, so the first match is enough.
    off_target_features = df_train.loc[is_ca2, protein_feature_cols].iloc[0]
    print("✅ Found features for Off-Target: Carbonic Anhydrase II")
else:
    print("⚠️ CA II features not found. Using average non-target features.")
    # Fallback: column-wise mean of the protein descriptors over every row that is
    # not the primary target, so that `off_target_features` is always defined for
    # the cells below.
    # NOTE(review): assumes the primary target is labelled "CA IX" in `target_name`,
    # mirroring the "CA II" convention above -- confirm against the training data.
    is_primary_target = df_train['target_name'].str.contains("CA IX", na=False)
    fallback_rows = df_train.loc[~is_primary_target, protein_feature_cols]
    if fallback_rows.empty:
        raise ValueError(
            "No CA II rows and no non-target rows in the training data; "
            "cannot build off-target protein features."
        )
    off_target_features = fallback_rows.mean()

✅ Found features for Off-Target: Carbonic Anhydrase II


In [4]:
# 3. Prepare the "Off-Target" Simulation Data
# Same candidate molecules, but every row carries the off-target protein's
# descriptor values instead of the intended target's. `assign` returns a new
# frame, so `top_candidates` itself is left untouched.
df_off_target = top_candidates.assign(**off_target_features.to_dict())

In [5]:
# 4. Predict Potency against Off-Target
def get_fingerprints(smiles_list, radius=2, n_bits=2048):
    """Convert SMILES strings into Morgan fingerprint bit arrays.

    Parameters
    ----------
    smiles_list : iterable
        SMILES strings. Entries that are not strings (e.g. NaN) or that RDKit
        cannot parse are skipped instead of crashing the fingerprint call.
    radius : int, default 2
        Morgan radius passed to RDKit.
    n_bits : int, default 2048
        Length of each bit vector.

    Returns
    -------
    fps : np.ndarray
        Array of shape (n_valid, n_bits), one row per successfully parsed molecule.
    valid_idx : list of int
        Positions within `smiles_list` of the molecules that were fingerprinted,
        in the same order as the rows of `fps`.
    """
    fps = []
    valid_idx = []
    for i, smiles in enumerate(smiles_list):
        # MolFromSmiles returns None (it does not raise) for an unparseable SMILES.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        fps.append(np.array(fp))
        valid_idx.append(i)
    if not fps:
        # Keep the result 2-D even when nothing could be parsed.
        return np.zeros((0, n_bits), dtype=int), valid_idx
    return np.array(fps), valid_idx

X_fps, valid_idx = get_fingerprints(df_off_target['clean_smiles'])
n_skipped = len(df_off_target) - len(valid_idx)
if n_skipped:
    print(f"Skipped {n_skipped} candidate(s) with unparseable SMILES; their CA II prediction is NaN.")

X_fps_df = pd.DataFrame(X_fps, columns=[f'fp_{i}' for i in range(X_fps.shape[1])])
# Select the protein rows by the same positions that produced fingerprints so the
# two halves of the feature matrix stay aligned row-for-row.
X_protein = df_off_target.iloc[valid_idx][off_target_features.index].reset_index(drop=True)
X_final = pd.concat([X_fps_df, X_protein], axis=1)

# Transform & Predict
# Rows without a usable fingerprint stay NaN rather than shifting the other predictions.
ca2_predictions = np.full(len(df_off_target), np.nan)
X_proc = preprocessor.transform(X_final) if valid_idx else None
if valid_idx:
    ca2_predictions[valid_idx] = np.ravel(model.predict(X_proc))
df_off_target['Predicted_pIC50_CA_II'] = ca2_predictions



In [6]:
# 5. Calculate Selectivity Index (SI)
# SI = Potency(Target) - Potency(Off-Target)
# Positive Score = Selective for Cancer (GOOD)
# Negative Score = Kills Healthy Cells (BAD)

# Built as one chain so no intermediate frame is mutated in place; re-running the
# cell always starts from `top_candidates` and `df_off_target`.
results = (
    top_candidates[['clean_smiles', 'Predicted_pIC50']]
    .rename(columns={'Predicted_pIC50': 'Potency_CA_IX'})
    .assign(
        # Aligned on the shared row index of the two frames.
        Potency_CA_II=df_off_target['Predicted_pIC50_CA_II'],
        Selectivity_Score=lambda frame: frame['Potency_CA_IX'] - frame['Potency_CA_II'],
    )
)

print("\n🏆 SELECTIVITY RANKING (Higher is Safer)")
display(results.sort_values(by='Selectivity_Score', ascending=False))


🏆 SELECTIVITY RANKING (Higher is Safer)


Unnamed: 0,clean_smiles,Potency_CA_IX,Potency_CA_II,Selectivity_Score
0,N=C(N)NCSCC(NC(=O)CN(CC1(c2ccccc2)CC1)C(=O)CCc...,9.539032,5.793742,3.74529
8,CC1=CC(C(=O)NC(CCCCN)C(=O)C(=O)NOCc2ccc(C(N)=O...,9.162757,5.986909,3.175849
7,CC1=CC(C(=O)NC(CCCCN)C(=O)C(=O)NSCc2ccc(C(N)=O...,9.169065,6.045223,3.123841
3,CC(C)(CN(CC(=O)NC(CCNN=C(N)N)B(O)O)C(=O)CCc1cc...,9.480746,6.420278,3.060468
1,CC1(N)C2CC3OB(C(CCCCN)NC(=O)C4CCCN4C(=O)c4cccc...,9.527605,6.743636,2.783969
5,CC1(C)C2CC3OB(C(CCCCN)NC(=O)C4CCCN4C(=O)c4cccc...,9.397865,6.654196,2.743669
2,CC1(F)C2CC3OB(C(CCCCN)NC(=O)C4CCCN4C(=O)c4cccc...,9.527605,6.815522,2.712083
4,CC1(C)C2CC3OB(C(CCCCN)NC(=O)C4CCCN4C(=O)c4cncc...,9.439089,6.73467,2.704419
6,CC1(C)C2CC3OB(C(CCCCN)NC(=O)C4CCCN4C(=O)c4cccc...,9.289439,6.79861,2.490829
9,Cc1ccc(NS(=O)(=O)Nc2ccccc2)c(=O)n1CC(=O)NCc1cc...,9.137539,8.390647,0.746892
