# Week 4 Checkpoint: Advanced Molecular Descriptors and Feature Engineering

## Learning Objectives Verification
By the end of this week, you should be able to:
- Calculate and interpret advanced molecular descriptors
- Implement feature selection and dimensionality reduction techniques
- Apply feature engineering strategies for molecular data
- Evaluate descriptor importance and correlation patterns

## Progress Tracking Dashboard
**Week:** 4/12  
**Module:** Advanced Molecular Descriptors and Feature Engineering  
**Estimated Time:** 8-12 hours  
**Prerequisites:** Weeks 1-3 completed  

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, Crippen, rdMolDescriptors
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Checkpoint bookkeeping: the task cells below append to / update this record
progress_tracker = dict(
    week=4,
    completed_tasks=[],
    scores={},
    time_spent=0,
    challenges_faced=[],
    next_steps=[],
)

# Notebook banner
print("Week 4 Checkpoint: Advanced Molecular Descriptors and Feature Engineering")
print("=" * 70)

## Task 1: Advanced Descriptor Calculation (25 points)

Calculate a comprehensive set of molecular descriptors for a dataset of drug molecules.

In [None]:
# Sample molecules (SMILES). Every entry must be parseable by RDKit, otherwise
# it is dropped from the descriptor table and the dataset shrinks.
drug_smiles = [
    'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',  # Ibuprofen
    'CC1=CC=C(C=C1)C(=O)C2=CC=C(C=C2)N(C)C',  # 4-(Dimethylamino)-4'-methylbenzophenone
    'CN1CCC[C@H]1C2=CN=CC=C2',  # Nicotine
    'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
    'CN(C)CCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl',  # Chlorpromazine (propyl linker)
    # The previous entry ended in "CF3", which is not valid SMILES (it opens
    # ring bond 3 on fluorine and never closes it). The scaffold is chloroquine.
    'CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl',  # Chloroquine
    'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
    'CC(C)(C)NCC(C1=CC(=C(C=C1)O)CO)O',  # Salbutamol
    'CN(C)C(=N)NC(=N)N',  # Metformin
    'CC1=C(C=C(C=C1)N2CCNCC2)C'  # 1-(3,4-Dimethylphenyl)piperazine
]

def calculate_advanced_descriptors(smiles_list):
    """
    Calculate comprehensive molecular descriptors

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to featurize.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES that RDKit could parse, with the input string in
        the 'SMILES' column followed by the descriptor columns. SMILES that
        fail to parse are reported on stdout and left out of the table.
    """
    descriptors_data = []

    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Report the drop instead of skipping silently, so a shrinking
            # dataset is visible. print() is used because this notebook
            # installs a global warnings filter that would hide warnings.warn().
            print(f"Skipping unparsable SMILES: {smiles!r}")
            continue

        # Basic descriptors
        desc_dict = {
            'SMILES': smiles,
            'MW': Descriptors.MolWt(mol),
            'LogP': Descriptors.MolLogP(mol),
            'HBD': Descriptors.NumHDonors(mol),
            'HBA': Descriptors.NumHAcceptors(mol),
            'RotBonds': Descriptors.NumRotatableBonds(mol),
            'TPSA': Descriptors.TPSA(mol),
            'AromaticRings': Descriptors.NumAromaticRings(mol),
            'SaturatedRings': Descriptors.NumSaturatedRings(mol),
            'HeteroAtoms': Descriptors.NumHeteroatoms(mol),
            # RDKit spells this descriptor "FractionCSP3"; the column name is
            # kept as 'FractionCsp3' so downstream cells are unaffected.
            'FractionCsp3': Descriptors.FractionCSP3(mol),
            'MolecularComplexity': Descriptors.BertzCT(mol),
            'Flexibility': Descriptors.Kappa3(mol)
        }

        # Lipinski's Rule of Five compliance: count how many of the four
        # limits are exceeded (each comparison contributes 0 or 1)
        desc_dict['Lipinski_Violations'] = sum([
            desc_dict['MW'] > 500,
            desc_dict['LogP'] > 5,
            desc_dict['HBD'] > 5,
            desc_dict['HBA'] > 10
        ])

        # Drug-likeness indicators
        desc_dict['QED'] = Descriptors.qed(mol)  # Quantitative Estimate of Drug-likeness

        descriptors_data.append(desc_dict)

    return pd.DataFrame(descriptors_data)

# Task 1: Calculate descriptors
print("Task 1: Calculating advanced molecular descriptors...")
descriptor_df = calculate_advanced_descriptors(drug_smiles)

# Display results (the SMILES column is an identifier, hence the "- 1")
print(f"\nCalculated {len(descriptor_df.columns)-1} descriptors for {len(descriptor_df)} molecules")
print("\nDescriptor Summary:")
print(descriptor_df.describe().round(3))

# Visualization: one histogram per key descriptor on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
descriptor_df.hist(column=['MW', 'LogP', 'TPSA', 'QED'], bins=10, ax=axes, alpha=0.7)
# Use a figure-level title: plt.title() would only retitle the current
# subplot and overwrite that histogram's own column title.
fig.suptitle('Distribution of Key Molecular Descriptors')
# Reserve a strip at the top so the subplots do not collide with the suptitle
plt.tight_layout(rect=(0, 0, 1, 0.96))
plt.show()

progress_tracker['completed_tasks'].append('Task 1: Advanced Descriptor Calculation')
progress_tracker['scores']['task_1'] = 25  # Full points for completion

## Task 2: Feature Selection and Correlation Analysis (25 points)

Analyze descriptor correlations and implement feature selection techniques.

In [None]:
# Task 2: Feature selection and correlation analysis
print("Task 2: Feature selection and correlation analysis...")

# Numeric descriptor table: everything except the SMILES identifier column
feature_df = descriptor_df.drop(columns=['SMILES'])

# Pairwise descriptor correlations
correlation_matrix = feature_df.corr()

# Heatmap of the lower triangle only; the boolean mask hides the diagonal and
# the upper triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={"shrink": .8},
)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Molecular Descriptor Correlation Matrix')
plt.tight_layout()
plt.show()

# Identify highly correlated features
def find_correlated_features(corr_matrix, threshold=0.8):
    """
    List feature pairs whose absolute correlation exceeds ``threshold``.

    Only the strict upper triangle of ``corr_matrix`` is scanned, so each
    unordered pair is reported once and the diagonal is ignored. Each result
    is a ``(first_feature, second_feature, correlation)`` tuple carrying the
    signed correlation value; pairs are returned in row-major order.
    """
    names = corr_matrix.columns
    size = len(names)
    return [
        (names[row], names[col], corr_matrix.iloc[row, col])
        for row in range(size)
        for col in range(row + 1, size)
        if abs(corr_matrix.iloc[row, col]) > threshold
    ]

# Report descriptor pairs above the default |r| cut-off
high_corr = find_correlated_features(correlation_matrix)
print(f"\nFound {len(high_corr)} highly correlated feature pairs (|r| > 0.8):")
for left, right, r_value in high_corr:
    print(f"  {left} - {right}: {r_value:.3f}")

# Feature importance using Random Forest
# QED serves as the demonstration target; all other descriptors are inputs
y = feature_df['QED']
X = feature_df.drop(columns=['QED'])

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Importance table, most important descriptor first
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the importances
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance for QED Prediction')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

print("\nTop 5 most important features for QED prediction:")
top_rows = feature_importance.head()
for feature_name, weight in zip(top_rows['feature'], top_rows['importance']):
    print(f"  {feature_name}: {weight:.3f}")

progress_tracker['completed_tasks'].append('Task 2: Feature Selection and Correlation Analysis')
progress_tracker['scores']['task_2'] = 25

## Task 3: Dimensionality Reduction with PCA (25 points)

Apply Principal Component Analysis to reduce descriptor dimensionality.

In [None]:
# Task 3: Dimensionality reduction with PCA
print("Task 3: Dimensionality reduction with PCA...")

# Standardize the descriptors, then fit PCA keeping every component
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Per-component and running share of the total variance
explained_var_ratio = pca.explained_variance_ratio_
cumulative_var_ratio = np.cumsum(explained_var_ratio)
component_ids = range(1, len(explained_var_ratio) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: variance explained by each component on its own
ax1.bar(component_ids, explained_var_ratio)
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Explained Variance Ratio')
ax1.set_title('Individual Explained Variance by PC')
ax1.set_xticks(component_ids)

# Right panel: cumulative curve with the 90% / 95% reference lines
ax2.plot(component_ids, cumulative_var_ratio, 'bo-')
for level, colour, label in ((0.9, 'r', '90% Variance'), (0.95, 'g', '95% Variance')):
    ax2.axhline(y=level, color=colour, linestyle='--', label=label)
ax2.set_xlabel('Number of Components')
ax2.set_ylabel('Cumulative Explained Variance Ratio')
ax2.set_title('Cumulative Explained Variance')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_xticks(component_ids)

plt.tight_layout()
plt.show()

# Smallest component count whose cumulative variance reaches each target
# (argmax returns the first True position; +1 converts index to count)
n_components_90, n_components_95 = (
    np.argmax(cumulative_var_ratio >= target) + 1 for target in (0.90, 0.95)
)

print(f"\nPCA Analysis Results:")
print(f"  Original features: {X.shape[1]}")
print(f"  Components for 90% variance: {n_components_90}")
print(f"  Components for 95% variance: {n_components_95}")
print(f"  Variance captured by first 3 PCs: {cumulative_var_ratio[2]:.1%}")

# Loadings of every descriptor on the first three components
pc_names = ['PC1', 'PC2', 'PC3']
components_df = pd.DataFrame(
    pca.components_[:3].T,
    columns=pc_names,
    index=X.columns
)

# One horizontal bar chart per component, sorted by absolute loading
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel, pc in zip(axes, pc_names):
    loadings = components_df[pc].abs().sort_values(ascending=True)
    loadings.plot(kind='barh', ax=panel)
    panel.set_title(f'{pc} Feature Loadings')
    panel.set_xlabel('Absolute Loading')

plt.tight_layout()
plt.show()

print("\nTop 3 features contributing to each PC:")
for pc in pc_names:
    top_features = components_df[pc].abs().nlargest(3)
    print(f"  {pc}: {', '.join(top_features.index)}")

progress_tracker['completed_tasks'].append('Task 3: Dimensionality Reduction with PCA')
progress_tracker['scores']['task_3'] = 25

## Task 4: Feature Engineering Challenge (25 points)

Create new features and evaluate their impact on model performance.

In [None]:
# Task 4: Feature engineering challenge
print("Task 4: Feature engineering challenge...")

def engineer_new_features(df):
    """
    Return a copy of ``df`` extended with engineered descriptor columns.

    The input frame is not modified. Added columns, in order: three ratio
    features, two interaction (product) features, a binned molecular-weight
    category, and a 0-1 rule-based drug-likeness score built from four
    threshold checks worth 0.25 each.
    """
    eps = 1e-6  # keeps the ratio denominators away from zero
    out = df.copy()

    # Ratio features
    ring_total = out['AromaticRings'] + out['SaturatedRings']
    out['HBA_HBD_ratio'] = out['HBA'] / (out['HBD'] + eps)
    out['MW_TPSA_ratio'] = out['MW'] / (out['TPSA'] + eps)
    out['Aromatic_Total_rings'] = out['AromaticRings'] / (ring_total + eps)

    # Interaction features
    out['LogP_MW_interaction'] = out['LogP'] * out['MW']
    out['TPSA_RotBonds_interaction'] = out['TPSA'] * out['RotBonds']

    # Binned features
    mw_edges = [0, 200, 350, 500, float('inf')]
    mw_labels = ['Small', 'Medium', 'Large', 'VeryLarge']
    out['MW_category'] = pd.cut(out['MW'], bins=mw_edges, labels=mw_labels)

    # Drug-likeness score: a quarter point for each satisfied criterion
    passed_checks = (
        (out['MW'] <= 500).astype(int)
        + (out['LogP'] <= 5).astype(int)
        + (out['HBD'] <= 5).astype(int)
        + (out['TPSA'] <= 140).astype(int)
    )
    out['Custom_Druglikeness'] = passed_checks * 0.25

    return out

# Build the engineered table from the plain descriptor table
engineered_features = engineer_new_features(feature_df)

# Column counts before and after engineering
n_before = len(feature_df.columns)
n_after = len(engineered_features.columns)
print(f"\nOriginal features: {n_before}")
print(f"Features after engineering: {n_after}")
print(f"New features created: {n_after - n_before}")

# Evaluate impact of new features on QED prediction
def evaluate_feature_impact(original_df, engineered_df, target_col='QED'):
    """
    Compare model performance with original vs engineered features

    A random forest is fitted once per feature table. The reported
    'r2_score' is the out-of-bag R²: each row is predicted only by trees
    that did not see it during training. Scoring on the training rows
    themselves is always optimistic and cannot tell useful features from
    noise, so that in-sample value is kept only for reference under
    'train_r2_score'.

    Parameters
    ----------
    original_df, engineered_df : pandas.DataFrame
        Feature tables that both contain ``target_col``.
    target_col : str
        Name of the column to predict.

    Returns
    -------
    dict
        ``{'Original': {...}, 'Engineered': {...}}`` where each inner dict
        has 'n_features', 'r2_score' (out-of-bag) and 'train_r2_score'.
    """
    results = {}

    for name, df in (('Original', original_df), ('Engineered', engineered_df)):
        # Prepare data
        y = df[target_col]
        X = df.drop(columns=[target_col])
        if 'MW_category' in X.columns:
            # The forest needs numeric inputs, so one-hot encode the binned column
            X = pd.get_dummies(X, columns=['MW_category'])

        # Train model; oob_score=True makes sklearn compute the held-out
        # (out-of-bag) R² during fitting
        rf = RandomForestRegressor(n_estimators=100, random_state=42, oob_score=True)
        rf.fit(X, y)

        results[name] = {
            'n_features': X.shape[1],
            'r2_score': rf.oob_score_,
            'train_r2_score': rf.score(X, y)
        }

    return results

# Evaluate feature impact
impact_results = evaluate_feature_impact(feature_df, engineered_features)

print("\nFeature Engineering Impact Assessment:")
for name, results in impact_results.items():
    print(f"  {name}:")
    print(f"    Features: {results['n_features']}")
    print(f"    R² Score: {results['r2_score']:.4f}")

baseline_r2 = impact_results['Original']['r2_score']
improvement = impact_results['Engineered']['r2_score'] - baseline_r2
if baseline_r2 > 0:
    print(f"\nImprovement: {improvement:.4f} ({improvement/baseline_r2*100:.1f}%)")
else:
    # R² can be zero or negative; a relative change against such a baseline
    # would divide by zero or flip sign, so report the absolute change only.
    print(f"\nImprovement: {improvement:.4f}")

# Visualize new feature distributions
new_features = ['HBA_HBD_ratio', 'MW_TPSA_ratio', 'Custom_Druglikeness']
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for i, feature in enumerate(new_features):
    engineered_features[feature].hist(bins=10, ax=axes[i], alpha=0.7, color='skyblue')
    axes[i].set_title(f'Distribution of {feature}')
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

progress_tracker['completed_tasks'].append('Task 4: Feature Engineering Challenge')
progress_tracker['scores']['task_4'] = 25

## Self-Assessment and Reflection

Complete this self-assessment to evaluate your understanding of advanced molecular descriptors and feature engineering.

In [None]:
# Self-Assessment Questions
print("SELF-ASSESSMENT: Advanced Molecular Descriptors and Feature Engineering")
print("=" * 70)

# Each entry holds the question text, its lettered options, the letter of the
# correct option and a one-line explanation.
assessment_questions = [
    {
        'question': 'What is the primary purpose of molecular descriptors in drug discovery?',
        'options': ['A) To visualize molecules', 'B) To quantify molecular properties for analysis', 
                   'C) To store chemical structures', 'D) To name compounds'],
        'correct': 'B',
        'explanation': 'Molecular descriptors quantify chemical and physical properties of molecules for computational analysis.'
    },
    {
        'question': 'Which descriptor is most important for assessing membrane permeability?',
        'options': ['A) Molecular weight', 'B) Number of rotatable bonds', 
                   'C) Topological polar surface area (TPSA)', 'D) Number of rings'],
        'correct': 'C',
        'explanation': 'TPSA is strongly correlated with membrane permeability and oral bioavailability.'
    },
    {
        'question': 'What does a high correlation (>0.8) between two descriptors indicate?',
        'options': ['A) They are independent', 'B) They provide redundant information', 
                   'C) One is more important', 'D) They should always be used together'],
        'correct': 'B',
        'explanation': 'High correlation indicates redundancy, suggesting one descriptor could be removed.'
    },
    {
        'question': 'What is the main advantage of using PCA for descriptor reduction?',
        'options': ['A) It removes noise', 'B) It identifies the most important features', 
                   'C) It creates uncorrelated components', 'D) It improves model accuracy'],
        'correct': 'C',
        'explanation': 'PCA creates orthogonal (uncorrelated) components while retaining maximum variance.'
    },
    {
        'question': 'When engineering ratio features, why add a small epsilon (1e-6) to the denominator?',
        'options': ['A) To improve accuracy', 'B) To prevent division by zero', 
                   'C) To normalize the data', 'D) To add noise'],
        'correct': 'B',
        'explanation': 'Adding epsilon prevents division by zero errors when the denominator is zero.'
    }
]

score = 0
for i, q in enumerate(assessment_questions, 1):
    print(f"\nQuestion {i}: {q['question']}")
    for option in q['options']:
        print(f"  {option}")
    
    # For demonstration, we'll show the correct answer
    print(f"\nCorrect Answer: {q['correct']}")
    print(f"Explanation: {q['explanation']}")
    score += 1  # Assuming correct for progress tracking

assessment_score = (score / len(assessment_questions)) * 100

# The summary cell sums every entry in progress_tracker['scores'] against a
# 125-point maximum and prints each entry as "<score>/25 points", so the
# assessment must be stored on the same 25-point scale. Storing the raw
# percentage (100) pushed the overall score to 160%.
assessment_max_points = 25
progress_tracker['scores']['self_assessment'] = assessment_score / 100 * assessment_max_points

print(f"\nSelf-Assessment Score: {assessment_score:.0f}%")

## Week 4 Progress Summary and Next Steps

In [None]:
# Calculate overall progress
import json

total_score = sum(progress_tracker['scores'].values())
max_score = 125  # 4 tasks × 25 points + 25 points assessment
overall_percentage = (total_score / max_score) * 100

progress_tracker['overall_score'] = overall_percentage
progress_tracker['time_spent'] = 10  # Estimated hours

print("WEEK 4 PROGRESS SUMMARY")
print("=" * 50)
print(f"Overall Score: {overall_percentage:.1f}%")
print(f"Time Spent: {progress_tracker['time_spent']} hours")
print(f"Tasks Completed: {len(progress_tracker['completed_tasks'])}/4")

print("\nTask Breakdown:")
for task, score in progress_tracker['scores'].items():
    print(f"  {task}: {score}/25 points")

print("\nKey Learning Outcomes Achieved:")
outcomes = [
    "✓ Calculated comprehensive molecular descriptors",
    "✓ Analyzed feature correlations and redundancy",
    "✓ Applied PCA for dimensionality reduction",
    "✓ Engineered new features and evaluated impact",
    "✓ Understood descriptor importance for drug-likeness"
]
for outcome in outcomes:
    print(f"  {outcome}")

print("\nNext Week (Week 5) Preview:")
print("  📚 Topic: Machine Learning Fundamentals for Drug Discovery")
print("  🎯 Focus: Regression and classification models")
print("  💡 Skills: Model selection, validation, hyperparameter tuning")
print("  🔬 Practice: Build QSAR models for bioactivity prediction")

# Portfolio development checkpoint
# (all statements sit at column 0: a stray indent on one of these prints made
# the whole cell fail with IndentationError)
print("\nPortfolio Development Checkpoint:")
print("  📊 Add descriptor analysis to your multi-target project")
print("  🔧 Implement feature engineering pipeline")
print("  📈 Document feature selection rationale")
print("  🔍 Compare descriptor sets across targets")

# Save progress as JSON next to the notebook
with open('week_04_progress.json', 'w', encoding='utf-8') as f:
    json.dump(progress_tracker, f, indent=2)

print("\n✅ Week 4 checkpoint completed! Progress saved to week_04_progress.json")
print("📝 Remember to update your learning journal with key insights")
print("🚀 Ready to move on to Week 5: Machine Learning Fundamentals!")