In [2]:
from rdkit import Chem
from rdkit.Chem import PandasTools, Descriptors
import pandas as pd

# Path to the SDF file
sdf_path = r"C:\Users\avani\Downloads\tox21_10k_data_all.sdf (1)\tox21_10k_data_all.sdf"

# Load the SDF file
supplier = Chem.SDMolSupplier(sdf_path)

# Initialize lists to hold molecule data
molecule_data = []

# Iterate through molecules
for mol in supplier:
    if mol is None:
        continue  # Skip invalid molecules

    # Extract SMILES, molecule descriptors, and toxicity labels
    smiles = Chem.MolToSmiles(mol)
    molecular_weight = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)

    # Extract toxicity labels from molecule properties
    toxicity_label = mol.GetProp("_Name") if mol.HasProp("_Name") else None  # Example label extraction

    # Append the data
    molecule_data.append({
        "SMILES": smiles,
        "MolecularWeight": molecular_weight,
        "LogP": logp,
        "NumHDonors": h_donors,
        "NumHAcceptors": h_acceptors,
        "ToxicityLabel": toxicity_label
    })

# Convert to a DataFrame for easier handling
df = pd.DataFrame(molecule_data)

# Display the first few rows
print(df.head())


[23:00:37] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 10
[23:00:37] Explicit valence for atom # 3 Cl, 2, is greater than permitted
[23:00:37] ERROR: Could not sanitize molecule ending on line 21572
[23:00:37] ERROR: Explicit valence for atom # 3 Cl, 2, is greater than permitted
[23:00:46] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 10
[23:00:53] Explicit valence for atom # 2 Si, 8, is greater than permitted
[23:00:53] ERROR: Could not sanitize molecule ending on line 346021
[23:00:53] ERROR: Explicit valence for atom # 2 Si, 8, is greater than permitted
[23:00:57] Explicit valence for atom # 3 Cl, 2, is greater than permitted
[23:00:57] ERROR: Could not sanitize molecule ending on line 446665
[23:00:57] ERROR: Explicit valence for atom # 3 Cl, 2, is greater than permitted
[23:00:57] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 10
[23:01:03] Explicit valence for atom # 1 Cl, 2, is greater than permitted
[23:01:03

                                              SMILES  MolecularWeight  \
0  C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.Nc1ccc2cc3ccc(N...          468.992   
1  O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...          691.859   
2  CO[C@@H]1[C@@H](OC)[C@H](C)[C@@](O)(CC(=O)[O-]...          934.171   
3  CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...          927.020   
4  CC(=O)O.CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=...         1342.527   

      LogP  NumHDonors  NumHAcceptors    ToxicityLabel  
0  1.53830           4              5  NCGC00178831-03  
1 -0.94010           0              5  NCGC00166114-03  
2  3.35840           4             17  NCGC00263563-01  
3  3.58460           4             10  NCGC00013058-02  
4 -0.45963          18             15  NCGC00167516-01  


In [3]:
# Example: Convert textual toxicity labels to binary values
df['ToxicityLabel'] = df['ToxicityLabel'].apply(lambda x: 1 if x == "toxic" else 0)

# Drop rows with missing data
df.dropna(inplace=True)

# Display the processed data
print(df.head())


                                              SMILES  MolecularWeight  \
0  C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.Nc1ccc2cc3ccc(N...          468.992   
1  O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...          691.859   
2  CO[C@@H]1[C@@H](OC)[C@H](C)[C@@](O)(CC(=O)[O-]...          934.171   
3  CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)c...          927.020   
4  CC(=O)O.CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=...         1342.527   

      LogP  NumHDonors  NumHAcceptors  ToxicityLabel  
0  1.53830           4              5              0  
1 -0.94010           0              5              0  
2  3.35840           4             17              0  
3  3.58460           4             10              0  
4 -0.45963          18             15              0  


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define features and target
X = df[['MolecularWeight', 'LogP', 'NumHDonors', 'NumHAcceptors']]  # Features
y = df['ToxicityLabel']  # Target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Predict on test data
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))
import joblib  # For saving and loading the model

# Save the trained model
joblib.dump(clf, 'random_forest_classifier_modeltoxicity.pkl')
print("Model saved as 'random_forest_classifier_modeltoxicity.pkl'")
