# Basic Cheminformatics with RDKit

This notebook demonstrates fundamental cheminformatics operations using RDKit.

## Learning Objectives
- Load and manipulate molecular structures
- Calculate molecular descriptors
- Visualize molecules
- Perform basic chemical transformations

In [None]:
# Import required libraries
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show the installed RDKit version so printed results can be tied to a specific release.
print(f"RDKit version: {rdkit.__version__}")

## 1. Creating and Working with Molecules

In [None]:
# Create molecules from SMILES strings
candidate_smiles = [
    'CCO',  # Ethanol
    'CC(=O)O',  # Acetic acid
    'CC(C)C',  # Isobutane
    'c1ccccc1',  # Benzene
    'CC(=O)Nc1ccc(O)cc1',  # Paracetamol
    'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O'  # Ibuprofen
]

# `molecules` and `smiles_list` are built in lockstep so they stay index-aligned.
# The descriptor cell looks up `smiles_list[i]` by position in `molecules`; if a
# SMILES failed to parse and only `molecules` were filtered, every later row
# would be labelled with the wrong SMILES string.
molecules = []
smiles_list = []
for smiles in candidate_smiles:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        print(f"Failed to create molecule from SMILES: {smiles}")
        continue
    molecules.append(mol)
    smiles_list.append(smiles)
    print(f"Created molecule from SMILES: {smiles}")

print(f"\nTotal molecules created: {len(molecules)}")

## 2. Molecular Descriptors

In [None]:
# Calculate molecular descriptors
def calculate_descriptors(molecules, smiles_list):
    """Build a table of basic descriptors, one row per non-None molecule.

    Parameters
    ----------
    molecules : iterable
        Molecule objects accepted by ``rdkit.Chem.Descriptors``; ``None``
        entries are skipped.
    smiles_list : sequence of str
        SMILES labels, index-aligned with ``molecules`` (entry ``i`` labels
        molecule ``i``).

    Returns
    -------
    pandas.DataFrame
        Columns ``SMILES``, ``Molecular_Weight``, ``LogP``, ``HBD``, ``HBA``,
        ``TPSA``, ``Rotatable_Bonds`` and ``Aromatic_Rings``. The frame has no
        rows (and no columns) when every molecule is ``None`` or the input is
        empty.

    Raises
    ------
    ValueError
        If ``molecules`` and ``smiles_list`` differ in length. Pairing them by
        position would otherwise attach the wrong SMILES to a molecule.
    """
    molecules = list(molecules)  # allow any iterable while still checking its length
    if len(molecules) != len(smiles_list):
        raise ValueError(
            "molecules and smiles_list must be index-aligned: got "
            f"{len(molecules)} molecules and {len(smiles_list)} SMILES strings"
        )

    rows = []
    for smiles, mol in zip(smiles_list, molecules):
        if mol is None:
            continue
        rows.append({
            'SMILES': smiles,
            'Molecular_Weight': Descriptors.MolWt(mol),
            'LogP': Descriptors.MolLogP(mol),
            'HBD': Descriptors.NumHDonors(mol),  # Hydrogen bond donors
            'HBA': Descriptors.NumHAcceptors(mol),  # Hydrogen bond acceptors
            'TPSA': Descriptors.TPSA(mol),  # Topological polar surface area
            'Rotatable_Bonds': Descriptors.NumRotatableBonds(mol),
            'Aromatic_Rings': Descriptors.NumAromaticRings(mol)
        })

    return pd.DataFrame(rows)

# Compute the descriptor table for the parsed molecules, then print it
# rounded to two decimals under a heading.
df_descriptors = calculate_descriptors(molecules=molecules, smiles_list=smiles_list)
print("Molecular Descriptors:", df_descriptors.round(2), sep="\n")

## 3. Lipinski's Rule of Five Analysis

In [None]:
# Check Lipinski's Rule of Five
def lipinski_filter(df):
    """Annotate a descriptor table with Lipinski Rule-of-Five results.

    A violation is counted for each of: ``Molecular_Weight`` > 500,
    ``LogP`` > 5, ``HBD`` > 5, ``HBA`` > 10. A row is flagged ``Drug_Like``
    when it has at most one violation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Molecular_Weight``, ``LogP``, ``HBD`` and
        ``HBA``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two added (or overwritten) columns:
        ``Lipinski_Violations`` (int64 count) and ``Drug_Like`` (bool).
        The input frame is left untouched, so re-running the calling cell
        always starts from the same data.

    Raises
    ------
    KeyError
        If any of the four required columns is missing.
    """
    # Upper limits; a value strictly above its limit is one violation.
    limits = {
        'Molecular_Weight': 500,  # Rule 1: Molecular weight <= 500 Da
        'LogP': 5,                # Rule 2: LogP <= 5
        'HBD': 5,                 # Rule 3: Hydrogen bond donors <= 5
        'HBA': 10,                # Rule 4: Hydrogen bond acceptors <= 10
    }

    annotated = df.copy()  # do not mutate the caller's frame

    violations = sum(
        (annotated[column] > limit).astype('int64')
        for column, limit in limits.items()
    )

    annotated['Lipinski_Violations'] = violations
    annotated['Drug_Like'] = violations <= 1

    return annotated

# Annotate the descriptor table with Lipinski results and print the summary columns.
df_descriptors = lipinski_filter(df_descriptors)
print(
    "\nLipinski's Rule of Five Analysis:",
    df_descriptors.loc[:, ['SMILES', 'Lipinski_Violations', 'Drug_Like']],
    sep="\n",
)

## Exercise

Try the following:
1. Add more drug molecules to the analysis
2. Calculate additional descriptors using the `Descriptors` module
3. Implement other drug-likeness rules (e.g., Veber's rules)
4. Create molecular fingerprints for similarity analysis

## 🚀 ChemML Hybrid Integration Demo

This section demonstrates ChemML's **hybrid architecture** that combines:
- **Standard libraries** (RDKit, DeepChem) for maximum compatibility
- **Custom implementations** for advanced features and modern APIs
- **Legacy modules** for production-ready functionality

### Why a Hybrid Approach?
- ✅ **Best of both worlds**: Flexibility + Robustness
- ✅ **Future-proof**: Modern APIs without deprecation warnings
- ✅ **Production-ready**: Battle-tested legacy functionality
- ✅ **Educational**: Clear progression from basics to advanced

In [None]:
# ChemML Hybrid Architecture Demo
#
# Runs three steps: plain RDKit, the optional ChemML featurizers, and the
# optional ChemML legacy helpers. The two ChemML steps are skipped when their
# imports fail; the closing summary reports which steps actually ran rather
# than announcing success unconditionally.
print("🧬 ChemML Hybrid Architecture Demonstration")
print("=" * 50)

# 1. Standard approach (what we've been doing)
print("\n📚 1. STANDARD APPROACH (RDKit directly)")
from rdkit import Chem
from rdkit.Chem import Descriptors

mol = Chem.MolFromSmiles('CCO')
mw = Descriptors.MolWt(mol)
print(f"   Molecular weight: {mw:.2f}")

# 2. ChemML modern featurizers (new architecture)
print("\n🆕 2. CHEMML MODERN FEATURIZERS")
featurizers_ran = False
try:
    from chemml.core.featurizers import MorganFingerprint, DescriptorCalculator

    # Modern Morgan fingerprints (no deprecation warnings)
    morgan = MorganFingerprint(radius=2, n_bits=1024)
    features = morgan.featurize([mol])
    print(f"   Morgan fingerprint shape: {features.shape}")

    # Modern descriptor calculator
    desc_calc = DescriptorCalculator()
    descriptors = desc_calc.featurize([mol])
    print(f"   Descriptors shape: {descriptors.shape}")

    featurizers_ran = True  # only reached when every step above succeeded
except ImportError:
    # ChemML is treated as optional here: report and continue with the demo.
    print("   ChemML featurizers not available")

# 3. Legacy module integration (production-ready functionality)
print("\n🔧 3. LEGACY MODULE INTEGRATION")
legacy_ran = False
try:
    from chemml.core.data import legacy_molecular_cleaning, enhanced_property_prediction
    import pandas as pd

    # Test data cleaning
    test_data = pd.DataFrame({
        'smiles': ['CCO', 'CC(C)O', 'invalid', 'c1ccccc1'],
        'activity': [1.2, 2.3, None, 0.8]
    })

    cleaned = legacy_molecular_cleaning(test_data)
    print(f"   Data cleaning: {len(test_data)} -> {len(cleaned)} molecules")

    # Property prediction
    predictor = enhanced_property_prediction()
    if predictor:
        print("   ✅ Property predictor available")

    legacy_ran = True  # only reached when every step above succeeded
except ImportError:
    # ChemML is treated as optional here: report and continue with the demo.
    print("   Legacy integration not available")

# Summarise what actually ran; the success banner is shown only when both
# optional ChemML steps completed.
unavailable = [
    label
    for label, ran in (
        ("ChemML featurizers", featurizers_ran),
        ("legacy integration", legacy_ran),
    )
    if not ran
]
if unavailable:
    print("\n⚠️ INTEGRATION INCOMPLETE")
    print(f"   Unavailable: {', '.join(unavailable)}")
else:
    print("\n🎯 INTEGRATION COMPLETE!")
    print("✅ All three approaches work together seamlessly!")