In [1]:
# Load raw features
import pandas as pd
raw_df_unlabeled = pd.read_csv('../../data/raw/original_data/zinc_unlabeled.csv')
raw_df_unlabeled.head()

Unnamed: 0,smiles,logP,qed,SAS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1\n,5.0506,0.702012,2.084095
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1\n,3.1137,0.928975,3.432004
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682,2.470633
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944,2.822753
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027,4.035182


In [2]:
# Simple stats
raw_df_unlabeled.describe()

Unnamed: 0,logP,qed,SAS
count,249455.0,249455.0,249455.0
mean,2.457093,0.728264,3.053235
std,1.43433,0.139565,0.834796
min,-6.8762,0.111811,1.132738
25%,1.57481,0.645872,2.416627
50%,2.6056,0.759878,2.892993
75%,3.48676,0.835714,3.545533
max,8.2521,0.947882,7.289283


In [3]:
raw_df_labeled = pd.read_csv('../../data/raw/original_data/clintox.csv')
raw_df_labeled.head()

Unnamed: 0,smiles,FDA_APPROVED,CT_TOX
0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,1,0
1,[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)C...,1,0
2,[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)...,1,0
3,[H]/[NH+]=C(/C1=CC(=O)/C(=C\C=c2ccc(=C([NH3+])...,1,0
4,[H]/[NH+]=C(\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(...,1,0


In [4]:
raw_df_labeled.describe()

Unnamed: 0,FDA_APPROVED,CT_TOX
count,1484.0,1484.0
mean,0.936658,0.075472
std,0.24366,0.26424
min,0.0,0.0
25%,1.0,0.0
50%,1.0,0.0
75%,1.0,0.0
max,1.0,1.0


In [5]:
# Cell 5: Process and enhance molecular features (MODIFIED - 65% FDA Approved)
from rdkit import Chem
from rdkit.Chem import Descriptors, QED, Lipinski, Crippen, MolSurf, rdMolDescriptors
import numpy as np
import pandas as pd
from sklearn.utils import resample

# Function to canonicalize SMILES
def canonicalize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles.strip())
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
    except:
        pass
    return None

# Function to compute comprehensive molecular features
def compute_comprehensive_features(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles.strip())
        if mol is not None:
            features = {}
            
            # Basic molecular properties
            features['MolWt'] = Descriptors.MolWt(mol)
            features['LogP'] = Descriptors.MolLogP(mol)
            features['NumHDonors'] = Descriptors.NumHDonors(mol)
            features['NumHAcceptors'] = Descriptors.NumHAcceptors(mol)
            features['NumRotatableBonds'] = Descriptors.NumRotatableBonds(mol)
            features['NumAromaticRings'] = Descriptors.NumAromaticRings(mol)
            
            # Lipinski's Rule of Five
            features['NumHeteroatoms'] = Descriptors.NumHeteroatoms(mol)
            features['TPSA'] = Descriptors.TPSA(mol)
            
            # Complexity and shape
            features['NumRings'] = Descriptors.RingCount(mol)
            features['NumAliphaticRings'] = Descriptors.NumAliphaticRings(mol)
            features['NumSaturatedRings'] = Descriptors.NumSaturatedRings(mol)
            features['FractionCsp3'] = Descriptors.FractionCSP3(mol) 
            
            # Electronic properties
            features['NumValenceElectrons'] = Descriptors.NumValenceElectrons(mol)
            
            try:
                features['MaxPartialCharge'] = Descriptors.MaxPartialCharge(mol)
                features['MinPartialCharge'] = Descriptors.MinPartialCharge(mol)
            except:
                features['MaxPartialCharge'] = 0
                features['MinPartialCharge'] = 0
            
            # Molecular surface area
            features['LabuteASA'] = Descriptors.LabuteASA(mol)
            features['PEOE_VSA1'] = Descriptors.PEOE_VSA1(mol)
            features['PEOE_VSA2'] = Descriptors.PEOE_VSA2(mol)
            
            # Drug-likeness scores
            features['QED'] = QED.qed(mol)
            
            # Topological descriptors
            features['BertzCT'] = Descriptors.BertzCT(mol)
            features['Chi0v'] = Descriptors.Chi0v(mol)
            features['Chi1v'] = Descriptors.Chi1v(mol)
            features['Kappa1'] = Descriptors.Kappa1(mol)
            features['Kappa2'] = Descriptors.Kappa2(mol)
            
            # Additional descriptors
            features['MolMR'] = Descriptors.MolMR(mol)
            features['BalabanJ'] = Descriptors.BalabanJ(mol)
            features['HallKierAlpha'] = Descriptors.HallKierAlpha(mol)
            features['NumSaturatedCarbocycles'] = Descriptors.NumSaturatedCarbocycles(mol)
            features['NumAromaticCarbocycles'] = Descriptors.NumAromaticCarbocycles(mol)
            features['NumSaturatedHeterocycles'] = Descriptors.NumSaturatedHeterocycles(mol)
            features['NumAromaticHeterocycles'] = Descriptors.NumAromaticHeterocycles(mol)
            
            # Pharmacophore features
            features['fr_NH2'] = Descriptors.fr_NH2(mol)
            features['fr_COO'] = Descriptors.fr_COO(mol)
            features['fr_benzene'] = Descriptors.fr_benzene(mol)
            features['fr_furan'] = Descriptors.fr_furan(mol)
            features['fr_halogen'] = Descriptors.fr_halogen(mol)
            
            return pd.Series(features)
    except Exception as e:
        print(f"Error computing features: {e}") 
        pass
    return pd.Series()

# Process labeled data
print("=== Processing LABELED data ===")
print("Canonicalizing SMILES...")
raw_df_labeled['canonical_smiles'] = raw_df_labeled['smiles'].apply(canonicalize_smiles)
raw_df_labeled = raw_df_labeled.dropna(subset=['canonical_smiles'])

# Balance the dataset with 65% FDA_APPROVED, 35% NOT APPROVED
print(f"\n--- Balancing the labeled dataset (65% FDA_APPROVED, 35% Not Approved) ---")
print(f"Original distribution:")
approved_count = (raw_df_labeled['FDA_APPROVED'] == 1).sum()
not_approved_count = (raw_df_labeled['FDA_APPROVED'] == 0).sum()
print(f"FDA Approved (FDA_APPROVED=1): {approved_count}")
print(f"Not FDA Approved (FDA_APPROVED=0): {not_approved_count}")
print(f"Ratio: {not_approved_count/approved_count:.2f}:1")

# Separate approved and not approved
approved_df = raw_df_labeled[raw_df_labeled['FDA_APPROVED'] == 1]
not_approved_df = raw_df_labeled[raw_df_labeled['FDA_APPROVED'] == 0]

# Target distribution: 65% FDA Approved, 35% Not Approved
# Calculate target counts to achieve this ratio
target_ratio_approved = 0.6
target_ratio_not_approved = 0.4

# Use the smaller class to determine total size, then sample to get 65:35 ratio
# We'll use all available samples from the minority class and adjust the majority
min_class_size = min(len(approved_df), len(not_approved_df))

# Calculate target sizes
if len(approved_df) < len(not_approved_df):
    # FDA Approved is minority - use all of them (this will be our 65%)
    n_approved = len(approved_df)
    n_not_approved = int(n_approved * (target_ratio_not_approved / target_ratio_approved))
    n_not_approved = min(n_not_approved, len(not_approved_df))  # Don't exceed available
    
    approved_sampled = approved_df
    not_approved_sampled = resample(not_approved_df, 
                                     replace=False,
                                     n_samples=n_not_approved,
                                     random_state=42)
else:
    # Not Approved is minority - use all of them (this will be our 35%)
    n_not_approved = len(not_approved_df)
    n_approved = int(n_not_approved * (target_ratio_approved / target_ratio_not_approved))
    n_approved = min(n_approved, len(approved_df))  # Don't exceed available
    
    not_approved_sampled = not_approved_df
    approved_sampled = resample(approved_df,
                                replace=False,
                                n_samples=n_approved,
                                random_state=42)

# Combine and shuffle
raw_df_labeled_balanced = pd.concat([approved_sampled, not_approved_sampled])
raw_df_labeled_balanced = raw_df_labeled_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the distribution
final_approved = (raw_df_labeled_balanced['FDA_APPROVED'] == 1).sum()
final_not_approved = (raw_df_labeled_balanced['FDA_APPROVED'] == 0).sum()
final_total = len(raw_df_labeled_balanced)

print(f"\nBalanced distribution:")
print(f"FDA Approved (FDA_APPROVED=1):     {final_approved} ({final_approved/final_total*100:.1f}%)")
print(f"Not FDA Approved (FDA_APPROVED=0): {final_not_approved} ({final_not_approved/final_total*100:.1f}%)")
print(f"Total samples: {final_total}")
print(f"Target was: 65% FDA Approved, 35% Not Approved")

# Compute comprehensive features for balanced labeled molecules
print(f"\nComputing comprehensive features for {len(raw_df_labeled_balanced)} balanced labeled molecules...")
labeled_features = raw_df_labeled_balanced['canonical_smiles'].apply(compute_comprehensive_features)
all_labeled_with_features = pd.concat([raw_df_labeled_balanced, labeled_features], axis=1)
all_labeled_with_features = all_labeled_with_features.dropna()

# Save labeled data to drug_discovery folder
all_labeled_with_features.to_csv('../../data/raw/enhanced_data/drug_discovery/labeled_features.csv', index=False)

print("\n✓ Labeled data processing complete!")
print(f"✓ Balanced labeled data: {len(all_labeled_with_features)} molecules with {len(labeled_features.columns)} features")
print(f"  - FDA Approved: {(all_labeled_with_features['FDA_APPROVED'] == 1).sum()} ({(all_labeled_with_features['FDA_APPROVED'] == 1).sum()/len(all_labeled_with_features)*100:.1f}%)")
print(f"  - Not FDA Approved: {(all_labeled_with_features['FDA_APPROVED'] == 0).sum()} ({(all_labeled_with_features['FDA_APPROVED'] == 0).sum()/len(all_labeled_with_features)*100:.1f}%)")
print(f"✓ Saved to: data/raw/enhanced_data/drug_discovery/labeled_features.csv")

=== Processing LABELED data ===
Canonicalizing SMILES...


[16:21:53] Explicit valence for atom # 0 N, 4, is greater than permitted
[16:21:53] Can't kekulize mol.  Unkekulized atoms: 9
[16:21:54] Can't kekulize mol.  Unkekulized atoms: 4
[16:21:54] Can't kekulize mol.  Unkekulized atoms: 4



--- Balancing the labeled dataset (65% FDA_APPROVED, 35% Not Approved) ---
Original distribution:
FDA Approved (FDA_APPROVED=1): 1386
Not FDA Approved (FDA_APPROVED=0): 94
Ratio: 0.07:1

Balanced distribution:
FDA Approved (FDA_APPROVED=1):     140 (59.8%)
Not FDA Approved (FDA_APPROVED=0): 94 (40.2%)
Total samples: 234
Target was: 65% FDA Approved, 35% Not Approved

Computing comprehensive features for 234 balanced labeled molecules...

✓ Labeled data processing complete!
✓ Balanced labeled data: 234 molecules with 36 features
  - FDA Approved: 140 (59.8%)
  - Not FDA Approved: 94 (40.2%)
✓ Saved to: data/raw/enhanced_data/drug_discovery/labeled_features.csv


In [6]:
# Cell 6: Process unlabeled data
print("\n=== Processing UNLABELED data ===")
print("Canonicalizing SMILES...")
raw_df_unlabeled['canonical_smiles'] = raw_df_unlabeled['smiles'].apply(canonicalize_smiles)
raw_df_unlabeled = raw_df_unlabeled.dropna(subset=['canonical_smiles'])

# Compute comprehensive features for unlabeled molecules
print(f"\nComputing comprehensive features for {len(raw_df_unlabeled)} unlabeled molecules...")
unlabeled_features = raw_df_unlabeled['canonical_smiles'].apply(compute_comprehensive_features)
unlabeled_with_features = pd.concat([raw_df_unlabeled[['smiles', 'canonical_smiles']], unlabeled_features], axis=1)
unlabeled_with_features['FDA_APPROVED'] = np.nan
unlabeled_with_features['CT_TOX'] = np.nan
unlabeled_with_features = unlabeled_with_features.dropna(subset=unlabeled_features.columns.tolist())

# Save unlabeled data to drug_discovery folder
unlabeled_with_features.to_csv('../../data/raw/enhanced_data/drug_discovery/unlabeled_features.csv', index=False)

print("\n✓ Unlabeled data processing complete!")
print(f"✓ Unlabeled data: {len(unlabeled_with_features)} molecules")
print(f"✓ Saved to: data/raw/enhanced_data/drug_discovery/unlabeled_features.csv")


=== Processing UNLABELED data ===
Canonicalizing SMILES...

Computing comprehensive features for 249455 unlabeled molecules...

✓ Unlabeled data processing complete!
✓ Unlabeled data: 249453 molecules
✓ Saved to: data/raw/enhanced_data/drug_discovery/unlabeled_features.csv
