# 🧪 ToxiChem: QSAR Model Development
Train an ML model to predict hERG inhibition.

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
import joblib

In [ ]:
# Synthetic toy dataset: one (SMILES, hERG_risk label) record per molecule.
_records = [
    ('CN1C=NC2=C1C(=O)N(C(=O)N2C)C', 0),  # caffeine
    ('CC(=O)OC1=CC=CC=C1C(=O)O', 0),      # aspirin
    ('C1=CC=C(C=C1)C=O', 1),              # benzaldehyde
]
data = pd.DataFrame(_records, columns=['smiles', 'hERG_risk'])

In [ ]:
def featurize(smiles):
    """Convert a SMILES string into a fixed-length descriptor vector.

    Args:
        smiles: SMILES representation of a single molecule.

    Returns:
        A list of five descriptors in the order
        [molecular weight, logP, H-bond donor count, H-bond acceptor count,
        TPSA], or ``None`` when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        # rdkit.Chem.Lipinski names its donor/acceptor counters NumHDonors and
        # NumHAcceptors; HBondDonorCount / HBondAcceptorCount are not
        # attributes of that module and raise AttributeError when called.
        Lipinski.NumHDonors(mol),
        Lipinski.NumHAcceptors(mol),
        Descriptors.TPSA(mol),
    ]

# featurize() returns None for SMILES that RDKit cannot parse. Drop those rows
# from both the features and the labels so that X stays a rectangular numeric
# matrix and y stays aligned with it row for row.
features = [featurize(sm) for sm in data['smiles']]
is_valid = np.array([f is not None for f in features], dtype=bool)
X = np.array([f for f in features if f is not None], dtype=float)
y = data['hERG_risk'][is_valid]

In [ ]:
# Fixed seed so the train/test split and the forest are reproducible.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

preds = model.predict(X_test)
# zero_division=0 keeps the report quiet when a class is absent from the
# (possibly very small) test split.
print(classification_report(y_test, preds, zero_division=0))

# predict_proba returns one column per class seen during training
# (model.classes_), so the positive class is not guaranteed to be column 1.
# Look it up explicitly; if class 1 never appeared in training, the model
# assigns it probability 0 for every sample.
classes = list(model.classes_)
if 1 in classes:
    proba = model.predict_proba(X_test)[:, classes.index(1)]
else:
    proba = np.zeros(len(X_test))

# ROC AUC is only defined when both classes are present in y_test.
if len(np.unique(y_test)) == 2:
    print('AUC:', roc_auc_score(y_test, proba))
else:
    print('AUC: undefined (test split contains a single class)')

In [ ]:
# Serialize the fitted model to the backend's model directory.
model_path = '../backend/ml_models/hERG_model.pkl'
joblib.dump(model, model_path)