<a href="https://colab.research.google.com/github/Quratulain-12/Bioinformatic-services/blob/main/2_Drug_Toxicity_Predictor_Colab_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Drug Toxicity Predictor
# Machine Learning model for toxicity prediction
# @title Drug Toxicity Prediction Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns

# Banner printed when the cell starts running.
print("💊 DRUG TOXICITY PREDICTION MODEL")

# The `# @markdown` / `# @param` comments in this section are Colab form
# annotations: each `# @param` must stay on the same line as the assignment
# it describes, so do not reflow these lines.
# The string below is later split on ',' and each piece is stripped of
# surrounding whitespace before being turned into features.
# @markdown Enter SMILES strings (comma separated):
smiles_input = "CN1C=NC2=C1C(=O)N(C(=O)N2C)C, CC(=O)OC1=CC=CC=C1C(=O)O" # @param {type:"string"}

# n_estimators is passed to RandomForestClassifier (number of trees);
# test_size is passed to train_test_split (fraction of rows held out).
# @markdown Select model parameters:
n_estimators = 100 # @param {type:"slider", min:10, max:200, step:10}
test_size = 0.2 # @param {type:"slider", min:0.1, max:0.5, step:0.05}

# Load dataset (using sample Tox21 dataset)
# In practice, you'd use ChEMBL or PubChem data
!wget -q https://raw.githubusercontent.com/chemplexity/challenges/master/nih.csv
df = pd.read_csv('nih.csv')

# Preprocessing
# Every column listed here is removed from the feature matrix: the raw
# SMILES text plus the twelve assay-label columns (one of which is the
# prediction target). Whatever columns remain in df become the features.
_non_feature_columns = [
    'smiles',
    'NR-AR',
    'NR-AR-LBD',
    'NR-AhR',
    'NR-Aromatase',
    'NR-ER',
    'NR-ER-LBD',
    'NR-PPAR-gamma',
    'SR-ARE',
    'SR-ATAD5',
    'SR-HSE',
    'SR-MMP',
    'SR-p53',
]
X = df.drop(columns=_non_feature_columns)

# Target: the SR-ARE column (stress response pathway indicator).
y = df['SR-ARE']

# Train model
# Hold out `test_size` of the rows for evaluation; the fixed seed keeps the
# train/test partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

# Seed the forest as well. The split above is already deterministic, but an
# unseeded RandomForestClassifier draws fresh bootstrap samples and feature
# subsets on every fit, so the reported accuracy and the predictions would
# still change from run to run.
model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
model.fit(X_train, y_train)

# Evaluate
# Accuracy is the share of held-out rows whose predicted label equals the
# true label.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"📈 Model Accuracy: {accuracy:.2%}")

# Confusion matrix
# Rows of `cm` are the actual labels and columns are the predicted labels,
# which is why the y-axis is titled 'Actual' and the x-axis 'Predicted'.
cm = confusion_matrix(y_test, y_pred)

# NOTE(review): two tick labels assume a binary target whose sorted classes
# are (non-toxic, toxic) - confirm against the values in y.
_class_names = ['Non-Toxic', 'Toxic']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are integers
    cmap='Greens',
    xticklabels=_class_names,
    yticklabels=_class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Prediction for user input
def smiles_to_features(smiles):
    """Build a crude five-value feature vector from a SMILES string.

    This is plain text counting, not chemistry-aware parsing; real projects
    use RDKit.

    Args:
        smiles: The SMILES string to summarise.

    Returns:
        A list ``[length, n_O, n_N, n_Cl, has_C]`` where ``length`` is the
        number of characters, ``n_O`` / ``n_N`` / ``n_Cl`` are counts of the
        substrings ``'O'``, ``'N'`` and ``'Cl'``, and ``has_C`` is 1 when the
        character ``'C'`` occurs anywhere in the string (including as part
        of ``'Cl'``) and 0 otherwise.
    """
    length = len(smiles)
    # Case-sensitive, non-overlapping substring counts.
    oxygen_hits, nitrogen_hits, chlorine_hits = (
        smiles.count(token) for token in ('O', 'N', 'Cl')
    )
    has_carbon_char = 1 if 'C' in smiles else 0
    return [length, oxygen_hits, nitrogen_hits, chlorine_hits, has_carbon_char]

# Split the comma-separated form value into individual SMILES strings and
# drop blank entries, so a trailing or doubled comma is not fed to the model
# as an empty "molecule".
user_smiles = [s.strip() for s in smiles_input.split(',')]
user_smiles = [s for s in user_smiles if s]
user_features = [smiles_to_features(s) for s in user_smiles]

# NOTE(review): the model was fit on the columns of X (taken from nih.csv),
# while these vectors are the five values returned by smiles_to_features.
# The predictions are only meaningful if those columns are the same five
# features in the same order - confirm before trusting the output.
if user_features:
    predictions = model.predict(user_features)
else:
    # Nothing to score; calling predict on an empty list would raise.
    predictions = []

# A predicted label of 1 is reported as 'Toxic', anything else as 'Non-Toxic'.
toxicity_labels = ['Toxic' if p == 1 else 'Non-Toxic' for p in predictions]

# Kept as a dict for any later code that looks results up by SMILES string.
toxicity_results = dict(zip(user_smiles, toxicity_labels))

print("\n🔬 PREDICTION RESULTS:")
if not user_smiles:
    print("No SMILES strings provided.")
# Print from the paired lists rather than the dict so that a SMILES string
# entered more than once still gets one output line per entry.
for smiles, result in zip(user_smiles, toxicity_labels):
    print(f"{smiles}: {result}")