In [1]:
# Cell 1: Load unlabeled data
# Load raw features
import pandas as pd
raw_df_unlabeled = pd.read_csv('../../data/raw/original_data/zinc_unlabeled.csv')
raw_df_unlabeled.head()

Unnamed: 0,smiles,logP,qed,SAS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1\n,5.0506,0.702012,2.084095
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1\n,3.1137,0.928975,3.432004
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682,2.470633
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944,2.822753
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027,4.035182


In [2]:
# Simple stats
raw_df_unlabeled.describe()

Unnamed: 0,logP,qed,SAS
count,249455.0,249455.0,249455.0
mean,2.457093,0.728264,3.053235
std,1.43433,0.139565,0.834796
min,-6.8762,0.111811,1.132738
25%,1.57481,0.645872,2.416627
50%,2.6056,0.759878,2.892993
75%,3.48676,0.835714,3.545533
max,8.2521,0.947882,7.289283


In [3]:
# Cell 3: Load labeled data
raw_df_labeled = pd.read_csv('../../data/raw/original_data/tox21.csv')
raw_df_labeled.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [4]:
raw_df_labeled.describe()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
count,7265.0,6758.0,6549.0,5821.0,6193.0,6955.0,6450.0,5832.0,7072.0,6467.0,5810.0,6774.0
mean,0.042533,0.03507,0.11727,0.051538,0.128048,0.050324,0.028837,0.161523,0.03733,0.057523,0.158003,0.062445
std,0.201815,0.183969,0.321766,0.22111,0.33417,0.218627,0.167362,0.368044,0.189583,0.232857,0.364776,0.241979
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Cell 5: Process labeled data
from rdkit import Chem
from rdkit.Chem import Descriptors, QED, Lipinski, Crippen, MolSurf, rdMolDescriptors
import numpy as np
import pandas as pd
from sklearn.utils import resample

# Function to canonicalize SMILES
def canonicalize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles.strip())
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
    except:
        pass
    return None

# Function to compute comprehensive molecular features
def compute_comprehensive_features(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles.strip())
        if mol is not None:
            features = {}
            
            # Basic molecular properties
            features['MolWt'] = Descriptors.MolWt(mol)
            features['LogP'] = Descriptors.MolLogP(mol)
            features['NumHDonors'] = Descriptors.NumHDonors(mol)
            features['NumHAcceptors'] = Descriptors.NumHAcceptors(mol)
            features['NumRotatableBonds'] = Descriptors.NumRotatableBonds(mol)
            features['NumAromaticRings'] = Descriptors.NumAromaticRings(mol)
            
            # Lipinski's Rule of Five
            features['NumHeteroatoms'] = Descriptors.NumHeteroatoms(mol)
            features['TPSA'] = Descriptors.TPSA(mol)
            
            # Complexity and shape
            features['NumRings'] = Descriptors.RingCount(mol)
            features['NumAliphaticRings'] = Descriptors.NumAliphaticRings(mol)
            features['NumSaturatedRings'] = Descriptors.NumSaturatedRings(mol)
            features['FractionCsp3'] = Descriptors.FractionCSP3(mol) 
            
            # Electronic properties
            features['NumValenceElectrons'] = Descriptors.NumValenceElectrons(mol)
            
            try:
                features['MaxPartialCharge'] = Descriptors.MaxPartialCharge(mol)
                features['MinPartialCharge'] = Descriptors.MinPartialCharge(mol)
            except:
                features['MaxPartialCharge'] = 0
                features['MinPartialCharge'] = 0
            
            # Molecular surface area
            features['LabuteASA'] = Descriptors.LabuteASA(mol)
            features['PEOE_VSA1'] = Descriptors.PEOE_VSA1(mol)
            features['PEOE_VSA2'] = Descriptors.PEOE_VSA2(mol)
            
            # Drug-likeness scores
            features['QED'] = QED.qed(mol)
            
            # Topological descriptors
            features['BertzCT'] = Descriptors.BertzCT(mol)
            features['Chi0v'] = Descriptors.Chi0v(mol)
            features['Chi1v'] = Descriptors.Chi1v(mol)
            features['Kappa1'] = Descriptors.Kappa1(mol)
            features['Kappa2'] = Descriptors.Kappa2(mol)
            
            # Additional descriptors
            features['MolMR'] = Descriptors.MolMR(mol)
            features['BalabanJ'] = Descriptors.BalabanJ(mol)
            features['HallKierAlpha'] = Descriptors.HallKierAlpha(mol)
            features['NumSaturatedCarbocycles'] = Descriptors.NumSaturatedCarbocycles(mol)
            features['NumAromaticCarbocycles'] = Descriptors.NumAromaticCarbocycles(mol)
            features['NumSaturatedHeterocycles'] = Descriptors.NumSaturatedHeterocycles(mol)
            features['NumAromaticHeterocycles'] = Descriptors.NumAromaticHeterocycles(mol)
            
            # Pharmacophore features
            features['fr_NH2'] = Descriptors.fr_NH2(mol)
            features['fr_COO'] = Descriptors.fr_COO(mol)
            features['fr_benzene'] = Descriptors.fr_benzene(mol)
            features['fr_furan'] = Descriptors.fr_furan(mol)
            features['fr_halogen'] = Descriptors.fr_halogen(mol)
            
            return pd.Series(features)
    except Exception as e:
        print(f"Error computing features: {e}") 
        pass
    return pd.Series()

# Process labeled data
print("=== Processing LABELED data ===")
print("Canonicalizing SMILES...")
raw_df_labeled['canonical_smiles'] = raw_df_labeled['smiles'].apply(canonicalize_smiles)
raw_df_labeled = raw_df_labeled.dropna(subset=['canonical_smiles'])

# Define toxicity columns
tox_columns = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 
               'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']

print(f"\n--- Creating toxic column using OR logic ---")
# Create toxic column using OR logic: if ANY toxicity column is 1, the molecule is toxic
raw_df_labeled['toxic'] = raw_df_labeled[tox_columns].max(axis=1)

# Drop rows where all toxicity columns are NaN (toxic will be NaN)
raw_df_labeled = raw_df_labeled.dropna(subset=['toxic'])

# Convert to int for clarity
raw_df_labeled['toxic'] = raw_df_labeled['toxic'].astype(int)

# Drop the 12 individual toxicity columns after creating the unified 'toxic' column
print(f"Dropping {len(tox_columns)} individual toxicity columns...")
raw_df_labeled = raw_df_labeled.drop(columns=tox_columns)

# Balance the dataset BEFORE computing features
print(f"\n--- Balancing the labeled dataset ---")
print(f"Original distribution:")
toxic_count = (raw_df_labeled['toxic'] == 1).sum()
non_toxic_count = (raw_df_labeled['toxic'] == 0).sum()
print(f"Toxic (toxic=1): {toxic_count}")
print(f"Non-toxic (toxic=0): {non_toxic_count}")
print(f"Ratio: {non_toxic_count/toxic_count:.2f}:1")

# Separate toxic and non-toxic
toxic_df = raw_df_labeled[raw_df_labeled['toxic'] == 1]
non_toxic_df = raw_df_labeled[raw_df_labeled['toxic'] == 0]

# Downsample non-toxic to match toxic count (NO DUPLICATES)
non_toxic_downsampled = resample(non_toxic_df, 
                                  replace=False,  # No replacement = no duplicates
                                  n_samples=len(toxic_df),  # Match toxic count
                                  random_state=42)

# Combine balanced data
raw_df_labeled_balanced = pd.concat([toxic_df, non_toxic_downsampled])
raw_df_labeled_balanced = raw_df_labeled_balanced.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

print(f"\nBalanced distribution:")
print(f"Toxic (toxic=1): {(raw_df_labeled_balanced['toxic'] == 1).sum()}")
print(f"Non-toxic (toxic=0): {(raw_df_labeled_balanced['toxic'] == 0).sum()}")
print(f"Total samples: {len(raw_df_labeled_balanced)}")

# Compute comprehensive features for balanced labeled molecules
print(f"\nComputing comprehensive features for {len(raw_df_labeled_balanced)} balanced labeled molecules...")
labeled_features = raw_df_labeled_balanced['canonical_smiles'].apply(compute_comprehensive_features)
all_labeled_with_features = pd.concat([raw_df_labeled_balanced, labeled_features], axis=1)
all_labeled_with_features = all_labeled_with_features.dropna()

# Save labeled data
all_labeled_with_features.to_csv('../../data/raw/enhanced_data/tox21/labeled_features.csv', index=False)

print("\n✓ Labeled data processing complete!")
print(f"✓ Balanced labeled data: {len(all_labeled_with_features)} molecules with {len(labeled_features.columns)} features")
print(f"  - Toxic: {(all_labeled_with_features['toxic'] == 1).sum()}")
print(f"  - Non-toxic: {(all_labeled_with_features['toxic'] == 0).sum()}")
print(f"✓ Saved to: data/raw/enhanced_data/tox21/labeled_features.csv")

=== Processing LABELED data ===
Canonicalizing SMILES...


[09:50:40] Explicit valence for atom # 8 Al, 6, is greater than permitted
[09:50:40] Explicit valence for atom # 3 Al, 6, is greater than permitted
[09:50:40] Explicit valence for atom # 4 Al, 6, is greater than permitted
[09:50:41] Explicit valence for atom # 4 Al, 6, is greater than permitted
[09:50:41] Explicit valence for atom # 9 Al, 6, is greater than permitted
[09:50:41] Explicit valence for atom # 5 Al, 6, is greater than permitted
[09:50:41] Explicit valence for atom # 16 Al, 6, is greater than permitted
[09:50:41] Explicit valence for atom # 20 Al, 6, is greater than permitted



--- Creating toxic column using OR logic ---
Dropping 12 individual toxicity columns...

--- Balancing the labeled dataset ---
Original distribution:
Toxic (toxic=1): 2869
Non-toxic (toxic=0): 4954
Ratio: 1.73:1

Balanced distribution:
Toxic (toxic=1): 2869
Non-toxic (toxic=0): 2869
Total samples: 5738

Computing comprehensive features for 5738 balanced labeled molecules...





✓ Labeled data processing complete!
✓ Balanced labeled data: 5661 molecules with 36 features
  - Toxic: 2817
  - Non-toxic: 2844
✓ Saved to: data/raw/enhanced_data/tox21/labeled_features.csv


In [None]:
# Cell 6: Process unlabeled data
# Process unlabeled data
print("\n=== Processing UNLABELED data ===")
print("Canonicalizing SMILES...")
raw_df_unlabeled['canonical_smiles'] = raw_df_unlabeled['smiles'].apply(canonicalize_smiles)
raw_df_unlabeled = raw_df_unlabeled.dropna(subset=['canonical_smiles'])

# Compute comprehensive features for unlabeled molecules
print(f"\nComputing comprehensive features for {len(raw_df_unlabeled)} unlabeled molecules...")
unlabeled_features = raw_df_unlabeled['canonical_smiles'].apply(compute_comprehensive_features)
unlabeled_with_features = pd.concat([raw_df_unlabeled[['smiles', 'canonical_smiles']], unlabeled_features], axis=1)
unlabeled_with_features['toxic'] = np.nan
unlabeled_with_features = unlabeled_with_features.dropna(subset=unlabeled_features.columns.tolist())

# Save unlabeled data
unlabeled_with_features.to_csv('../../data/raw/enhanced_data/tox21/unlabeled_features.csv', index=False)

print("\n✓ Unlabeled data processing complete!")
print(f"✓ Unlabeled data: {len(unlabeled_with_features)} molecules with {len(unlabeled_features.columns)} features")
print(f"✓ Saved to: data/raw/enhanced_data/tox21/unlabeled_features.csv")


=== Processing UNLABELED data ===
Canonicalizing SMILES...

Computing comprehensive features for 249455 unlabeled molecules...

✓ Unlabeled data processing complete!
✓ Unlabeled data: 249453 molecules with 36 features
✓ Saved to: data/raw/enhanced_data/tox21/unlabeled_features.csv
