In [None]:
# ChemML Integration Setup
# The import and the print were fused onto the comment line, so this cell
# previously executed nothing. They are restored as real statements here.
try:
    import chemml
    print(f'🧪 ChemML {chemml.__version__} loaded for this notebook')
except ImportError:
    # No other cell visible in this part of the notebook uses chemml, so a
    # missing install is reported instead of aborting the run.
    print('ChemML is not installed; continuing without it')

# Week 9 Checkpoint: Advanced Applications and Case Studies

## Learning Objectives
- Apply integrated computational approaches to real drug discovery problems
- Analyze complex pharmaceutical case studies
- Develop end-to-end computational workflows
- Validate and interpret multi-scale modeling results

## Progress Tracking Variables

In [None]:
# Week 9 progress tracking: identifiers and running totals
week_number = 9
week_topic = "Advanced Applications and Case Studies"
total_points = 100
tasks_completed = 0
current_score = 0

# Points earned per task; every task starts at zero
task_scores = dict.fromkeys(
    (
        'task_1_covid_case_study',
        'task_2_cancer_drug_design',
        'task_3_personalized_medicine',
        'task_4_regulatory_modeling',
    ),
    0,
)

# Skill flags; each one is switched on by the cell that completes its task
skills_developed = dict.fromkeys(
    (
        'case_study_analysis',
        'integrated_workflows',
        'personalized_approaches',
        'regulatory_compliance',
    ),
    False,
)

# Initial status report (three lines)
print(
    f"Week {week_number}: {week_topic}",
    f"Progress: {tasks_completed}/4 tasks completed",
    f"Current Score: {current_score}/{total_points} points",
    sep="\n",
)

## Task 1: COVID Drug Discovery Case Study (25 points)

Analyze SARS-CoV-2 main protease (Mpro) inhibitor discovery using computational methods.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
import seaborn as sns

class COVIDDrugDiscovery:
    """COVID-19 drug discovery case study analysis.

    Teaching workflow around SARS-CoV-2 main protease (Mpro) inhibitors:
    synthetic dataset generation, a structure-activity overview, and a
    random-forest virtual-screening model.
    """

    def __init__(self):
        # Result containers. The methods of this class return their results
        # and do not write to these attributes.
        self.mpro_inhibitors = []
        self.binding_affinities = []
        self.drug_properties = {}
        self.screening_results = {}

    @staticmethod
    def _classify_activity(pic50):
        """Bin pIC50 values into Inactive / Weak / Moderate / Strong.

        The outer bin edges are open-ended so that values at or below 0, or
        above 12, are still labelled instead of silently becoming NaN.
        """
        return pd.cut(
            pic50,
            bins=[float('-inf'), 5, 6, 7, float('inf')],
            labels=['Inactive', 'Weak', 'Moderate', 'Strong'],
        )

    def generate_mpro_dataset(self, n_compounds=1000, random_state=None):
        """Generate a synthetic Mpro inhibitor dataset.

        Each row picks one of five fixed scaffold SMILES at random; the
        scaffold itself is not modified, so only five distinct descriptor
        combinations occur. Activity is simulated from the descriptors plus
        random noise.

        Parameters
        ----------
        n_compounds : int
            Number of rows to attempt. Rows whose scaffold fails to parse
            are skipped.
        random_state : int or None
            Seed for a private ``numpy.random.RandomState``. With ``None``
            (the default) the global ``numpy.random`` state is used, exactly
            as before this parameter existed.

        Returns
        -------
        pandas.DataFrame
            Columns ``smiles``, ``molecular_weight``, ``logp``, ``hbd``,
            ``hba``, ``ic50_um`` and ``pic50``.
        """

        # Known Mpro inhibitor scaffolds (simplified)
        scaffolds = [
            'CC(C)CC(NC(=O)C(F)(F)F)C(=O)N',  # Peptidomimetic
            'Cc1ccc(S(=O)(=O)N)cc1',  # Sulfonamide
            'NC(=O)c1ccc(Cl)cc1',  # Benzamide
            'CC(=O)Nc1ccc(O)cc1',  # Acetaminophen-like
            'COc1ccc(CC(=O)N)cc1'   # Phenylacetamide
        ]

        # np.random and RandomState expose the same choice/normal/lognormal
        # functions, so the sampling code below is identical in both modes.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Descriptors depend only on the scaffold, so compute them once per
        # scaffold instead of once per generated row.
        descriptor_cache = {}
        for smiles in scaffolds:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                descriptor_cache[smiles] = None
            else:
                descriptor_cache[smiles] = (
                    Descriptors.MolWt(mol),
                    Descriptors.MolLogP(mol),
                    Descriptors.NumHDonors(mol),
                    Descriptors.NumHAcceptors(mol),
                )

        compounds = []

        for _ in range(n_compounds):
            base_scaffold = rng.choice(scaffolds)

            descriptors = descriptor_cache[str(base_scaffold)]
            if descriptors is None:
                continue
            mw, logp, hbd, hba = descriptors

            # Simulate binding affinity (IC50 in μM)
            # Better properties generally lead to better binding
            activity_score = 0

            # Molecular weight preference (300-500 Da)
            if 300 <= mw <= 500:
                activity_score += 2
            elif mw > 600:
                activity_score -= 3

            # LogP preference (1-4)
            if 1 <= logp <= 4:
                activity_score += 2
            elif logp > 5:
                activity_score -= 2

            # H-bond donors/acceptors
            if hbd <= 3 and hba <= 8:
                activity_score += 1

            # Add random variation
            activity_score += rng.normal(0, 2)

            # Convert to IC50 (lower is better); the lognormal term keeps
            # the value strictly positive.
            ic50 = 10 ** (2 - activity_score) + rng.lognormal(0, 0.5)

            compounds.append({
                'smiles': base_scaffold,
                'molecular_weight': mw,
                'logp': logp,
                'hbd': hbd,
                'hba': hba,
                'ic50_um': ic50,
                'pic50': -np.log10(ic50 * 1e-6)  # μM -> M, then pIC50
            })

        return pd.DataFrame(compounds)

    def analyze_structure_activity(self, df):
        """Analyze structure-activity relationships.

        Adds an ``activity_class`` column to ``df`` in place (callers see the
        new column on their own frame), draws a 2x2 overview figure, and
        returns the correlation matrix of the descriptors and ``pic50``.
        """

        # Create activity classification (mutates the caller's frame)
        df['activity_class'] = self._classify_activity(df['pic50'])

        # Statistical analysis
        correlations = df[['molecular_weight', 'logp', 'hbd', 'hba', 'pic50']].corr()

        # Visualizations
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # MW vs Activity
        axes[0,0].scatter(df['molecular_weight'], df['pic50'], alpha=0.6)
        axes[0,0].set_xlabel('Molecular Weight (Da)')
        axes[0,0].set_ylabel('pIC50')
        axes[0,0].set_title('Molecular Weight vs Activity')

        # LogP vs Activity
        axes[0,1].scatter(df['logp'], df['pic50'], alpha=0.6)
        axes[0,1].set_xlabel('LogP')
        axes[0,1].set_ylabel('pIC50')
        axes[0,1].set_title('Lipophilicity vs Activity')

        # Activity distribution
        df['activity_class'].value_counts().plot(kind='bar', ax=axes[1,0])
        axes[1,0].set_title('Activity Distribution')
        axes[1,0].tick_params(axis='x', rotation=45)

        # Correlation heatmap
        sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0, ax=axes[1,1])
        axes[1,1].set_title('Property Correlations')

        plt.tight_layout()
        plt.show()

        return correlations

    def virtual_screening_workflow(self, df, test_size=0.2):
        """Train and evaluate a random-forest pIC50 model.

        Splits ``df`` into train/test sets (fixed ``random_state=42``),
        standardizes the four descriptor columns, fits a
        ``RandomForestRegressor``, plots predictions and feature
        importances, and returns a dict with keys ``model``, ``scaler``,
        ``mse``, ``r2`` and ``feature_importance``.
        """

        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler

        # Prepare features
        features = ['molecular_weight', 'logp', 'hbd', 'hba']
        X = df[features]
        y = df['pic50']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Scale features; the scaler is fitted on the training split only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train_scaled, y_train)

        # Predictions
        y_pred = model.predict(X_test_scaled)

        # Evaluate
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Feature importance
        importance = pd.DataFrame({
            'feature': features,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

        # Visualization
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Prediction vs Actual, with the identity line as reference
        axes[0].scatter(y_test, y_pred, alpha=0.6)
        axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
        axes[0].set_xlabel('Actual pIC50')
        axes[0].set_ylabel('Predicted pIC50')
        axes[0].set_title(f'Predictions (R² = {r2:.3f})')

        # Feature importance
        importance.plot(x='feature', y='importance', kind='bar', ax=axes[1])
        axes[1].set_title('Feature Importance')
        axes[1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        return {
            'model': model,
            'scaler': scaler,
            'mse': mse,
            'r2': r2,
            'feature_importance': importance
        }

# Task 1 driver: build the case-study object and its synthetic dataset
print("=== Task 1: COVID Drug Discovery Case Study ===")

# Case-study object reused by the following cell
covid_study = COVIDDrugDiscovery()

# Synthetic Mpro inhibitor dataset (800 rows requested)
print("\n1. Generating Mpro inhibitor dataset...")
mpro_data = covid_study.generate_mpro_dataset(n_compounds=800)
print(f"Generated {len(mpro_data)} compounds")

# First rows, then summary statistics
print("\nDataset preview:", mpro_data.head(), sep="\n")
print("\nBasic statistics:", mpro_data.describe(), sep="\n")

In [None]:
# Continue Task 1 execution
print("\n2. Analyzing structure-activity relationships...")
correlations = covid_study.analyze_structure_activity(mpro_data)

print("\nKey correlations with activity (pIC50):")
# Rank descriptors by absolute correlation with pIC50, excluding pIC50 itself
activity_corr = correlations['pic50'].abs().sort_values(ascending=False)
print(activity_corr[activity_corr.index != 'pic50'])

print("\n3. Running virtual screening workflow...")
screening_results = covid_study.virtual_screening_workflow(mpro_data)

print(f"\nVirtual Screening Results:")
print(f"R² Score: {screening_results['r2']:.3f}")
print(f"RMSE: {np.sqrt(screening_results['mse']):.3f}")
print("\nTop features:")
print(screening_results['feature_importance'].head())

# Update progress
task_scores['task_1_covid_case_study'] = 25
skills_developed['case_study_analysis'] = True
# Derive the totals from task_scores instead of incrementing them, so that
# re-running this cell cannot count the task twice.
tasks_completed = sum(score > 0 for score in task_scores.values())
current_score = sum(task_scores.values())

print(f"\n✓ Task 1 completed! Score: 25/25")
print(f"Progress: {tasks_completed}/4 tasks completed")
print(f"Current Score: {current_score}/{total_points} points")

## Task 2: Cancer Drug Design Case Study (25 points)

Design kinase inhibitors for cancer treatment using multi-target optimization.

In [None]:
class CancerDrugDesign:
    """Cancer drug design focusing on kinase inhibitors.

    Generates a synthetic kinase-inhibitor library, scores it on potency,
    selectivity, drug-likeness and ADMET criteria, and visualizes the ranking.
    """

    def __init__(self):
        self.target_kinases = ['EGFR', 'HER2', 'ALK', 'ROS1', 'MET']
        # Containers below are not written by the methods of this class.
        self.compound_library = []
        self.selectivity_profiles = {}
        self.toxicity_predictions = {}

    def generate_kinase_inhibitors(self, n_compounds=500, random_state=None):
        """Generate a synthetic kinase inhibitor library.

        Each compound takes one of five fixed scaffolds; its properties are
        the scaffold's RDKit descriptors plus random offsets, and its IC50
        against each kinase in ``self.target_kinases`` is sampled with
        scaffold-specific potency multipliers.

        Parameters
        ----------
        n_compounds : int
            Number of compounds to attempt; scaffolds that fail to parse are
            skipped.
        random_state : int or None
            Seed for a private ``numpy.random.RandomState``. ``None`` (the
            default) uses the global ``numpy.random`` state as before.

        Returns
        -------
        pandas.DataFrame
            One row per compound, including ``best_ic50_nm``,
            ``selectivity_ratio`` and one ``<kinase>_ic50_nm`` column per
            target kinase.
        """

        # Kinase inhibitor scaffolds
        scaffolds = {
            'quinazoline': 'Nc1ncnc2ccccc12',  # EGFR-like
            'pyrimidine': 'Nc1nccc(n1)c2ccccc2',  # Multi-kinase
            'indole': 'c1ccc2[nH]ccc2c1',  # General kinase
            'benzimidazole': 'c1ccc2[nH]cnc2c1',  # ALK-like
            'triazine': 'c1nc(nc(n1)N)N'  # Broad spectrum
        }
        scaffold_names = list(scaffolds.keys())

        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Base descriptors depend only on the scaffold: compute once each
        # instead of re-parsing the SMILES for every compound.
        base_descriptors = {}
        for name, smiles in scaffolds.items():
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                base_descriptors[name] = None
            else:
                base_descriptors[name] = (
                    Descriptors.MolWt(mol),
                    Descriptors.MolLogP(mol),
                    Descriptors.NumHDonors(mol),
                    Descriptors.NumHAcceptors(mol),
                )

        compounds = []

        for i in range(n_compounds):
            scaffold_name = rng.choice(scaffold_names)
            base_smiles = scaffolds[scaffold_name]

            descriptors = base_descriptors[str(scaffold_name)]
            if descriptors is None:
                continue
            base_mw, base_logp, base_hbd, base_hba = descriptors

            # Scaffold descriptors plus random offsets (sampling order is
            # kept as in the original so seeded runs stay comparable)
            mw = base_mw + rng.normal(150, 50)
            logp = base_logp + rng.normal(1, 0.8)
            hbd = base_hbd + rng.randint(0, 3)
            hba = base_hba + rng.randint(1, 4)
            rotbonds = rng.randint(2, 8)

            # Simulate kinase activities (IC50 in nM)
            activities = {}
            for kinase in self.target_kinases:
                # Different scaffolds have different selectivity patterns
                base_activity = rng.lognormal(2, 1.5)  # Base IC50

                if scaffold_name == 'quinazoline' and kinase in ['EGFR', 'HER2']:
                    base_activity *= 0.1  # More potent against EGFR family
                elif scaffold_name == 'benzimidazole' and kinase == 'ALK':
                    base_activity *= 0.05  # Selective for ALK
                elif scaffold_name == 'pyrimidine':
                    base_activity *= 0.3  # Multi-kinase activity

                activities[kinase] = base_activity

            # Selectivity: median IC50 across kinases relative to the best one
            ic50_values = list(activities.values())
            best_ic50 = min(ic50_values)
            selectivity_ratio = np.median(ic50_values) / best_ic50

            # Simulate ADMET properties
            clogp = logp + rng.normal(0, 0.5)
            solubility = -0.5 * clogp + rng.normal(0, 0.8)
            permeability = logp * 0.3 + rng.normal(0, 0.4)
            cytotoxicity = rng.lognormal(1, 0.8)  # CC50 in μM

            compounds.append({
                'compound_id': f'KI_{i:04d}',
                'scaffold': scaffold_name,
                'smiles': base_smiles,
                'molecular_weight': mw,
                'logp': logp,
                'clogp': clogp,
                'hbd': hbd,
                'hba': hba,
                'rotatable_bonds': rotbonds,
                'best_ic50_nm': best_ic50,
                'selectivity_ratio': selectivity_ratio,
                'solubility_log': solubility,
                'permeability': permeability,
                'cytotoxicity_um': cytotoxicity,
                **{f'{kinase}_ic50_nm': activities[kinase] for kinase in self.target_kinases}
            })

        return pd.DataFrame(compounds)

    def multi_parameter_optimization(self, df):
        """Score and rank compounds on four weighted criteria.

        Returns a copy of ``df`` with ``potency_score``,
        ``selectivity_score``, ``druglikeness_score``, ``admet_score`` and
        ``composite_score`` columns, sorted by ``composite_score``
        descending. The input frame is not modified.
        """

        scores = pd.DataFrame(index=df.index)

        # Potency score (1-5, based on best IC50 in nM; lower IC50 is better)
        ic50 = df['best_ic50_nm']
        scores['potency'] = np.select(
            [ic50 <= 10, ic50 <= 50, ic50 <= 100, ic50 <= 500],
            [5, 4, 3, 2],
            default=1,
        )

        # Selectivity score (1-5; higher ratio is better)
        ratio = df['selectivity_ratio']
        scores['selectivity'] = np.select(
            [ratio >= 100, ratio >= 50, ratio >= 20, ratio >= 10],
            [5, 4, 3, 2],
            default=1,
        )

        # Drug-likeness score (Lipinski-like): one point per satisfied rule
        scores['druglikeness'] = (
            (df['molecular_weight'] <= 500).astype(int) +
            (df['logp'] <= 5).astype(int) +
            (df['hbd'] <= 5).astype(int) +
            (df['hba'] <= 10).astype(int) +
            (df['rotatable_bonds'] <= 7).astype(int)
        )

        # ADMET score (0-5)
        scores['admet'] = (
            (df['solubility_log'] >= -4).astype(int) * 2 +  # Solubility
            (df['permeability'] >= 0).astype(int) * 2 +     # Permeability
            (df['cytotoxicity_um'] >= 10).astype(int) * 1   # Low cytotoxicity
        )

        # Calculate composite score
        weights = {'potency': 0.3, 'selectivity': 0.25, 'druglikeness': 0.25, 'admet': 0.2}

        scores['composite'] = (
            scores['potency'] * weights['potency'] +
            scores['selectivity'] * weights['selectivity'] +
            scores['druglikeness'] * weights['druglikeness'] +
            scores['admet'] * weights['admet']
        )

        # Add scores to a copy of the dataframe
        result_df = df.copy()
        for col in scores.columns:
            result_df[f'{col}_score'] = scores[col]

        return result_df.sort_values('composite_score', ascending=False)

    def analyze_optimization_results(self, df):
        """Plot the optimization results and return the top 10 rows.

        ``df`` is expected to be the output of
        ``multi_parameter_optimization`` (already sorted, with the
        ``*_score`` columns present).
        """

        # Top compounds (relies on df being sorted by composite score)
        top_10 = df.head(10)

        # Visualizations
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Score distribution. DataFrame.plot(kind='hist') overlays all
        # columns on the one target axes. DataFrame.hist would instead try to
        # create one subplot per column and clear the entire figure when
        # handed a single axes, wiping out the other three panels.
        score_cols = ['potency_score', 'selectivity_score', 'druglikeness_score', 'admet_score']
        df[score_cols].plot(kind='hist', bins=5, ax=axes[0,0], alpha=0.7)
        axes[0,0].set_title('Score Distributions')
        axes[0,0].legend(score_cols)

        # Composite score vs best IC50
        scatter = axes[0,1].scatter(df['best_ic50_nm'], df['composite_score'],
                                  c=df['selectivity_ratio'], cmap='viridis', alpha=0.6)
        axes[0,1].set_xlabel('Best IC50 (nM)')
        axes[0,1].set_ylabel('Composite Score')
        axes[0,1].set_xscale('log')
        axes[0,1].set_title('Optimization Landscape')
        plt.colorbar(scatter, ax=axes[0,1], label='Selectivity Ratio')

        # Scaffold analysis
        scaffold_scores = df.groupby('scaffold')['composite_score'].mean().sort_values(ascending=False)
        scaffold_scores.plot(kind='bar', ax=axes[1,0])
        axes[1,0].set_title('Average Composite Score by Scaffold')
        axes[1,0].tick_params(axis='x', rotation=45)

        # Property space coverage
        axes[1,1].scatter(df['molecular_weight'], df['logp'],
                         c=df['composite_score'], cmap='RdYlBu_r', alpha=0.6)
        axes[1,1].set_xlabel('Molecular Weight')
        axes[1,1].set_ylabel('LogP')
        axes[1,1].set_title('Property Space Coverage')

        plt.tight_layout()
        plt.show()

        return top_10[['compound_id', 'scaffold', 'best_ic50_nm', 'selectivity_ratio',
                      'potency_score', 'selectivity_score', 'druglikeness_score',
                      'admet_score', 'composite_score']]

# Task 2 driver: build the design object and its synthetic library
print("\n=== Task 2: Cancer Drug Design Case Study ===")

# Design object reused by the following cell
cancer_design = CancerDrugDesign()

# Synthetic kinase inhibitor library (400 compounds requested)
print("\n1. Generating kinase inhibitor library...")
kinase_data = cancer_design.generate_kinase_inhibitors(n_compounds=400)
print(f"Generated {len(kinase_data)} kinase inhibitors")

# Show the identifying and headline-activity columns of the first rows
print(
    "\nLibrary preview:",
    kinase_data[['compound_id', 'scaffold', 'molecular_weight', 'best_ic50_nm', 'selectivity_ratio']].head(),
    sep="\n",
)

In [None]:
# Continue Task 2 execution
print("\n2. Running multi-parameter optimization...")
optimized_compounds = cancer_design.multi_parameter_optimization(kinase_data)

print("\n3. Analyzing optimization results...")
top_compounds = cancer_design.analyze_optimization_results(optimized_compounds)

print("\nTop 5 Optimized Compounds:")
print(top_compounds.head().round(3))

print("\nOptimization Summary:")
print(f"Total compounds evaluated: {len(optimized_compounds)}")
print(f"Compounds with composite score > 3.0: {len(optimized_compounds[optimized_compounds['composite_score'] > 3.0])}")
print(f"Best compound score: {optimized_compounds['composite_score'].max():.3f}")
print(f"Median selectivity ratio: {optimized_compounds['selectivity_ratio'].median():.1f}")

# Scaffold performance analysis: mean score, median potency and selectivity
print("\nScaffold Performance:")
scaffold_perf = optimized_compounds.groupby('scaffold').agg({
    'composite_score': 'mean',
    'best_ic50_nm': 'median',
    'selectivity_ratio': 'median'
}).round(3)
print(scaffold_perf)

# Update progress
task_scores['task_2_cancer_drug_design'] = 25
skills_developed['integrated_workflows'] = True
# Derive the totals from task_scores instead of incrementing them, so that
# re-running this cell cannot count the task twice.
tasks_completed = sum(score > 0 for score in task_scores.values())
current_score = sum(task_scores.values())

print(f"\n✓ Task 2 completed! Score: 25/25")
print(f"Progress: {tasks_completed}/4 tasks completed")
print(f"Current Score: {current_score}/{total_points} points")

## Task 3: Personalized Medicine Applications (25 points)

Develop pharmacogenomics-based drug selection and dosing strategies.

In [None]:
class PersonalizedMedicine:
    """Pharmacogenomics-based personalized medicine.

    Simulates a patient cohort with genotypes for six pharmacogenes and
    applies simple rule-based predictors for warfarin, clopidogrel and
    simvastatin. All rules are illustrative teaching simplifications.
    """

    def __init__(self):
        # Containers; not written by the methods of this class.
        self.genetic_variants = {}
        self.drug_responses = {}
        self.dosing_algorithms = {}

    def simulate_patient_cohort(self, n_patients=300, random_state=None):
        """Generate a patient cohort with genetic variants.

        Parameters
        ----------
        n_patients : int
            Number of patients to simulate.
        random_state : int or None
            Seed for a private ``numpy.random.RandomState``. ``None`` (the
            default) uses the global ``numpy.random`` state as before.

        Returns
        -------
        pandas.DataFrame
            One row per patient with ``patient_id``, one ``<gene>_genotype``
            column per gene, demographics and four comorbidity flags.
        """

        # Key pharmacogenomic variants
        variants = {
            'CYP2D6': ['*1/*1', '*1/*2', '*1/*4', '*4/*4', '*2/*2'],  # Metabolizer status
            'CYP2C19': ['*1/*1', '*1/*2', '*1/*3', '*2/*2', '*2/*3'],
            'SLCO1B1': ['*1/*1', '*1/*5', '*5/*5'],  # Statin response
            'VKORC1': ['GG', 'GA', 'AA'],  # Warfarin sensitivity
            'DPYD': ['*1/*1', '*1/*2A', '*2A/*2A'],  # 5-FU toxicity
            'UGT1A1': ['*1/*1', '*1/*28', '*28/*28']  # Irinotecan toxicity
        }

        # Population frequencies (simplified); each list is aligned with the
        # genotype list of the same gene above and sums to 1.
        frequencies = {
            'CYP2D6': [0.35, 0.25, 0.20, 0.10, 0.10],
            'CYP2C19': [0.40, 0.25, 0.15, 0.15, 0.05],
            'SLCO1B1': [0.70, 0.25, 0.05],
            'VKORC1': [0.25, 0.50, 0.25],
            'DPYD': [0.85, 0.14, 0.01],
            'UGT1A1': [0.60, 0.35, 0.05]
        }

        rng = np.random if random_state is None else np.random.RandomState(random_state)

        patients = []

        for i in range(n_patients):
            patient = {'patient_id': f'PT_{i:04d}'}

            # Assign genetic variants
            for gene in variants:
                patient[f'{gene}_genotype'] = rng.choice(
                    variants[gene], p=frequencies[gene]
                )

            # Demographics
            patient['age'] = rng.randint(25, 80)
            patient['weight'] = rng.normal(70, 15)
            patient['sex'] = rng.choice(['M', 'F'])
            patient['ethnicity'] = rng.choice(['Caucasian', 'African', 'Asian', 'Hispanic'],
                                              p=[0.60, 0.15, 0.15, 0.10])

            # Comorbidities (random_sample is what np.random.random aliases)
            patient['diabetes'] = rng.random_sample() < 0.15
            patient['hypertension'] = rng.random_sample() < 0.25
            patient['kidney_disease'] = rng.random_sample() < 0.08
            patient['liver_disease'] = rng.random_sample() < 0.05

            patients.append(patient)

        return pd.DataFrame(patients)

    def predict_drug_response(self, patients_df, drug='warfarin'):
        """Predict drug response based on pharmacogenomics.

        Applies the per-patient predictor for ``drug`` ('warfarin',
        'clopidogrel' or 'simvastatin') to every row of ``patients_df``. Any
        other drug name yields ``{'predicted_response': 'unknown'}`` per
        patient. Returns a DataFrame with the predictor's fields plus
        ``patient_id``.
        """

        responses = []

        for _, patient in patients_df.iterrows():
            if drug == 'warfarin':
                response = self._predict_warfarin_dose(patient)
            elif drug == 'clopidogrel':
                response = self._predict_clopidogrel_response(patient)
            elif drug == 'simvastatin':
                response = self._predict_statin_response(patient)
            else:
                response = {'predicted_response': 'unknown'}

            response['patient_id'] = patient['patient_id']
            responses.append(response)

        return pd.DataFrame(responses)

    def _predict_warfarin_dose(self, patient):
        """VKORC1/CYP2C19-based warfarin dosing.

        ``patient`` is any mapping with ``VKORC1_genotype``,
        ``CYP2C19_genotype``, ``age`` and ``weight``. Returns the predicted
        daily dose (clamped to 1-15 mg), a risk category string and the two
        genotype multipliers.

        NOTE(review): clinical warfarin algorithms are usually described in
        terms of CYP2C9 rather than CYP2C19; the cohort here has no CYP2C9
        column, so CYP2C19 appears to be used as a stand-in — confirm intent.
        """

        # Base dose calculation
        base_dose = 5.0  # mg/day

        # VKORC1 effect (main effect)
        vkorc1 = patient['VKORC1_genotype']
        if vkorc1 == 'AA':
            vkorc1_factor = 0.6  # Low dose needed
        elif vkorc1 == 'GA':
            vkorc1_factor = 0.8  # Intermediate
        else:  # GG
            vkorc1_factor = 1.0  # Standard dose

        # CYP2C19 effect (metabolism)
        # NOTE(review): reduced-function carriers get a HIGHER dose here,
        # while the risk block below tells the same patients to "start low";
        # the two look inconsistent — confirm which is intended.
        cyp2c19 = patient['CYP2C19_genotype']
        if '*2' in cyp2c19 or '*3' in cyp2c19:
            cyp_factor = 1.2  # Poor metabolizer, needs higher dose
        else:
            cyp_factor = 1.0

        # Age effect: 1% dose reduction per year above 40
        age_factor = 1.0 - (patient['age'] - 40) * 0.01 if patient['age'] > 40 else 1.0

        # Weight effect, relative to a 70 kg reference
        weight_factor = patient['weight'] / 70.0

        predicted_dose = base_dose * vkorc1_factor * cyp_factor * age_factor * weight_factor
        predicted_dose = max(1.0, min(15.0, predicted_dose))  # Safety bounds

        # Risk assessment
        # NOTE(review): only '*2' is checked here, whereas the dose factor
        # above also reacts to '*3' — confirm whether '*3' should be included.
        if vkorc1 == 'AA' or '*2' in cyp2c19:
            risk_category = 'High risk - start low, monitor closely'
        elif vkorc1 == 'GA':
            risk_category = 'Intermediate risk - standard monitoring'
        else:
            risk_category = 'Standard risk - routine monitoring'

        return {
            'predicted_dose_mg': round(predicted_dose, 1),
            'risk_category': risk_category,
            'vkorc1_effect': vkorc1_factor,
            'cyp2c19_effect': cyp_factor
        }

    def _predict_clopidogrel_response(self, patient):
        """CYP2C19-based clopidogrel response.

        Two reduced-function alleles ('*2/*2', '*2/*3') are classed as poor
        metabolizers, one as intermediate, none as normal.
        ``alternative_recommended`` is True for any '*2' or '*3' carrier,
        i.e. for both the poor and the intermediate group.
        """

        cyp2c19 = patient['CYP2C19_genotype']

        if '*2' in cyp2c19 or '*3' in cyp2c19:
            if cyp2c19 in ['*2/*2', '*2/*3']:
                response = 'Poor metabolizer - consider alternative'
                efficacy = 0.3
            else:
                response = 'Intermediate metabolizer - monitor closely'
                efficacy = 0.6
        else:
            response = 'Normal metabolizer - standard therapy'
            efficacy = 0.9

        return {
            'predicted_response': response,
            'predicted_efficacy': efficacy,
            'alternative_recommended': '*2' in cyp2c19 or '*3' in cyp2c19
        }

    def _predict_statin_response(self, patient):
        """SLCO1B1-based statin response.

        Returns a myopathy risk (0.25 for '*5/*5', 0.10 for one '*5' allele,
        0.02 otherwise), a recommendation string, and
        ``dose_reduction_needed`` (True for any '*5' carrier).
        """

        slco1b1 = patient['SLCO1B1_genotype']

        if '*5' in slco1b1:
            if slco1b1 == '*5/*5':
                myopathy_risk = 0.25
                recommendation = 'High risk - use lowest dose or alternative'
            else:
                myopathy_risk = 0.10
                recommendation = 'Intermediate risk - start low dose'
        else:
            myopathy_risk = 0.02
            recommendation = 'Standard risk - normal dosing'

        return {
            'myopathy_risk': myopathy_risk,
            'recommendation': recommendation,
            'dose_reduction_needed': '*5' in slco1b1
        }

    def analyze_population_responses(self, patients_df, responses_df, drug):
        """Analyze population-level pharmacogenomic patterns.

        Merges patients with their responses on ``patient_id`` and hands the
        result to the drug-specific analysis method. Unrecognized drug names
        are a silent no-op.

        Raises
        ------
        NotImplementedError
            If ``drug`` is recognized but this class has no analysis method
            for it. Only the warfarin analysis is defined in this class.
        """

        # Merge data
        merged = patients_df.merge(responses_df, on='patient_id')

        analyzer_names = {
            'warfarin': '_analyze_warfarin_population',
            'clopidogrel': '_analyze_clopidogrel_population',
            'simvastatin': '_analyze_statin_population',
        }
        method_name = analyzer_names.get(drug)
        if method_name is None:
            return

        # Resolve by name so the call fails with a clear message, rather
        # than a bare AttributeError, when the analyzer does not exist.
        analyzer = getattr(self, method_name, None)
        if analyzer is None:
            raise NotImplementedError(
                f"No population analysis is implemented for '{drug}' "
                f"(missing method {method_name})"
            )
        analyzer(merged)

    def _analyze_warfarin_population(self, data):
        """Plot warfarin dosing patterns and print a per-genotype summary.

        ``data`` is the patient/response merge and must contain
        ``VKORC1_genotype``, ``predicted_dose_mg``, ``risk_category`` and
        ``age``.
        """

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Dose by VKORC1 genotype. Iterating the groupby yields
        # (genotype, doses) pairs, so the label comes from the group key.
        # The previous apply(list) chain produced plain lists, which have no
        # .name attribute and raised AttributeError.
        for genotype, doses in data.groupby('VKORC1_genotype')['predicted_dose_mg']:
            axes[0,0].hist(doses, alpha=0.7, label=genotype)
        axes[0,0].set_xlabel('Predicted Dose (mg/day)')
        axes[0,0].set_ylabel('Frequency')
        axes[0,0].set_title('Warfarin Dose Distribution by VKORC1')
        axes[0,0].legend()

        # Risk category distribution
        risk_counts = data['risk_category'].value_counts()
        risk_counts.plot(kind='pie', ax=axes[0,1], autopct='%1.1f%%')
        axes[0,1].set_title('Risk Category Distribution')

        # Age vs dose colored by genotype
        for genotype in data['VKORC1_genotype'].unique():
            subset = data[data['VKORC1_genotype'] == genotype]
            axes[1,0].scatter(subset['age'], subset['predicted_dose_mg'],
                            label=genotype, alpha=0.6)
        axes[1,0].set_xlabel('Age')
        axes[1,0].set_ylabel('Predicted Dose (mg/day)')
        axes[1,0].set_title('Age vs Dose by VKORC1 Genotype')
        axes[1,0].legend()

        # Summary statistics
        summary = data.groupby('VKORC1_genotype')['predicted_dose_mg'].agg(['mean', 'std', 'count'])
        summary.plot(kind='bar', y='mean', yerr='std', ax=axes[1,1])
        axes[1,1].set_title('Mean Dose by VKORC1 Genotype')
        axes[1,1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        print("Warfarin Dosing Summary:")
        print(summary.round(2))

# Task 3 driver: build the analysis object and its simulated cohort
print("\n=== Task 3: Personalized Medicine Applications ===")

# Analysis object reused by the following cell
personalized_med = PersonalizedMedicine()

# Simulated cohort (250 patients requested)
print("\n1. Generating patient cohort with genetic variants...")
patients = personalized_med.simulate_patient_cohort(n_patients=250)
print(f"Generated {len(patients)} patients")

# Show demographics plus the two genotypes used for warfarin dosing
print(
    "\nCohort preview:",
    patients[['patient_id', 'age', 'weight', 'sex', 'VKORC1_genotype', 'CYP2C19_genotype']].head(),
    sep="\n",
)

In [None]:
# Continue Task 3 execution
print("\n2. Predicting warfarin responses...")
warfarin_responses = personalized_med.predict_drug_response(patients, 'warfarin')
print("\nWarfarin response preview:")
print(warfarin_responses[['patient_id', 'predicted_dose_mg', 'risk_category']].head())

print("\n3. Analyzing population pharmacogenomic patterns...")
personalized_med.analyze_population_responses(patients, warfarin_responses, 'warfarin')

print("\n4. Clopidogrel response analysis...")
clopidogrel_responses = personalized_med.predict_drug_response(patients, 'clopidogrel')

# Analyze CYP2C19 impact: mean efficacy and count of flagged patients per genotype
cyp_analysis = patients.merge(clopidogrel_responses, on='patient_id')
cyp_impact = cyp_analysis.groupby('CYP2C19_genotype').agg({
    'predicted_efficacy': 'mean',
    'alternative_recommended': 'sum'
}).round(3)

print("\nCYP2C19 Impact on Clopidogrel:")
print(cyp_impact)

print("\n5. Statin myopathy risk assessment...")
statin_responses = personalized_med.predict_drug_response(patients, 'simvastatin')

# Risk stratification
risk_analysis = patients.merge(statin_responses, on='patient_id')
high_risk = risk_analysis[risk_analysis['myopathy_risk'] > 0.1]
# Dose modification applies to every '*5' carrier (dose_reduction_needed),
# not only to the >10% risk group, so it is computed from that flag.
dose_modification_pct = risk_analysis['dose_reduction_needed'].mean() * 100

print(f"\nStatin Risk Assessment:")
print(f"Total patients: {len(patients)}")
print(f"High risk patients (>10% myopathy risk): {len(high_risk)}")
print(f"Percentage requiring dose modification: {dose_modification_pct:.1f}%")

print("\nSLCO1B1 genotype distribution:")
print(patients['SLCO1B1_genotype'].value_counts())

# Clinical impact summary
# 'alternative_recommended' is also True for intermediate metabolizers, so
# poor metabolizers are selected by their response text instead.
poor_metabolizers = clopidogrel_responses[
    clopidogrel_responses['predicted_response'].str.startswith('Poor')
]
print("\n=== Clinical Impact Summary ===")
print(f"Warfarin high-risk patients: {len(warfarin_responses[warfarin_responses['risk_category'].str.contains('High')])}")
print(f"Clopidogrel poor metabolizers: {len(poor_metabolizers)}")
print(f"Statin high-risk patients: {len(statin_responses[statin_responses['myopathy_risk'] > 0.1])}")

# Update progress
task_scores['task_3_personalized_medicine'] = 25
skills_developed['personalized_approaches'] = True
# Derive the totals from task_scores instead of incrementing them, so that
# re-running this cell cannot count the task twice.
tasks_completed = sum(score > 0 for score in task_scores.values())
current_score = sum(task_scores.values())

print(f"\n✓ Task 3 completed! Score: 25/25")
print(f"Progress: {tasks_completed}/4 tasks completed")
print(f"Current Score: {current_score}/{total_points} points")

## Task 4: Regulatory Compliance Modeling (25 points)

Implement computational models for regulatory submission and safety assessment.

In [None]:
class RegulatoryModeling:
    """Computational models for regulatory compliance.

    Bundles a synthetic two-arm (Active vs. Placebo) clinical-trial generator
    with the efficacy analysis, safety analysis and submission summary that
    consume its output.
    """

    def __init__(self):
        # Containers reserved for fitted models / submission artefacts; none of
        # the methods in this class populate them.
        self.safety_models = {}
        self.efficacy_models = {}
        self.submission_data = {}

    def generate_clinical_trial_data(self, n_patients=500, seed=None):
        """Generate synthetic clinical trial data.

        Args:
            n_patients: Number of patient records to simulate.
            seed: Optional integer seed. When given, all draws come from a
                private ``np.random.RandomState(seed)`` so the dataset is
                reproducible and NumPy's global random state is left alone.
                When ``None`` (default) the global ``np.random`` state is used,
                exactly as before.

        Returns:
            pandas.DataFrame with one row per patient and the columns
            ``patient_id``, ``age``, ``weight``, ``sex``, ``baseline_severity``,
            ``treatment_arm``, ``final_severity``, ``response``,
            ``improvement``, ``adverse_event``, ``serious_ae`` and
            ``completed_study``.
        """
        # The np.random module and RandomState expose the same sampling
        # functions, so either can serve as the source of randomness.
        rng = np.random if seed is None else np.random.RandomState(seed)

        patients = []
        for i in range(n_patients):
            # Patient demographics and randomisation (60/40 Active/Placebo).
            patient = {
                'patient_id': f'CT_{i:04d}',
                'age': rng.normal(55, 15),
                'weight': rng.normal(75, 12),
                'sex': rng.choice(['M', 'F']),
                'baseline_severity': rng.uniform(3, 8),  # Disease severity score
                'treatment_arm': rng.choice(['Active', 'Placebo'], p=[0.6, 0.4])
            }
            on_active = patient['treatment_arm'] == 'Active'

            # Simulate treatment response
            if on_active:
                # Drug effect with individual variation
                drug_effect = rng.normal(-2.5, 1.0)  # Improvement in severity
                response_prob = 0.65 + rng.normal(0, 0.1)
            else:
                drug_effect = rng.normal(-0.5, 0.8)  # Placebo effect
                response_prob = 0.25 + rng.normal(0, 0.1)

            # Severity cannot drop below zero.
            patient['final_severity'] = max(0, patient['baseline_severity'] + drug_effect)
            patient['response'] = rng.random_sample() < response_prob
            patient['improvement'] = patient['baseline_severity'] - patient['final_severity']

            # Safety events
            if on_active:
                ae_prob = 0.15 + (patient['age'] - 50) * 0.002  # Age increases AE risk
            else:
                ae_prob = 0.08

            patient['adverse_event'] = rng.random_sample() < ae_prob
            # Short-circuit on purpose: the extra draw happens only for
            # patients who actually had an adverse event.
            patient['serious_ae'] = patient['adverse_event'] and rng.random_sample() < 0.1

            # Dropout: base rate per arm, raised by 0.2 after an adverse event.
            dropout_prob = 0.1 if on_active else 0.15
            if patient['adverse_event']:
                dropout_prob += 0.2
            patient['completed_study'] = rng.random_sample() > dropout_prob

            patients.append(patient)

        return pd.DataFrame(patients)

    @staticmethod
    def _wilson_ci(successes, total, confidence=0.95):
        """Wilson score interval for a binomial proportion.

        Args:
            successes: Number of successes observed.
            total: Number of trials.
            confidence: Two-sided confidence level, strictly between 0 and 1.

        Returns:
            ``(lower, upper)`` bounds; ``(nan, nan)`` when ``total`` is not
            positive, since no proportion can be estimated.

        Raises:
            ValueError: If ``confidence`` is not strictly between 0 and 1.
        """
        from scipy.stats import norm

        if not 0 < confidence < 1:
            raise ValueError("confidence must be strictly between 0 and 1")
        if total <= 0:
            return float('nan'), float('nan')

        # Two-sided critical value, e.g. ~1.96 for a 95% interval.
        z = norm.ppf(1 - (1 - confidence) / 2)
        p = successes / total
        n = total

        denominator = 1 + z**2 / n
        center = (p + z**2 / (2*n)) / denominator
        margin = z * np.sqrt((p*(1-p) + z**2/(4*n)) / n) / denominator

        return center - margin, center + margin

    def efficacy_analysis(self, trial_data):
        """Perform regulatory-standard efficacy analysis.

        Args:
            trial_data: DataFrame with at least the ``treatment_arm``
                ('Active'/'Placebo'), ``response`` and ``improvement`` columns.

        Returns:
            dict with a ``primary_endpoint`` entry (response rates, their
            difference, chi-square p-value, 95% Wilson intervals per arm and a
            ``significant`` flag at p < 0.05) and a ``secondary_endpoint``
            entry (mean improvement per arm, difference, t-test p-value and
            ``significant`` flag).
        """
        from scipy.stats import chi2_contingency, ttest_ind

        # Primary efficacy endpoint: response rate
        active_group = trial_data[trial_data['treatment_arm'] == 'Active']
        placebo_group = trial_data[trial_data['treatment_arm'] == 'Placebo']

        # Response rates
        active_response_rate = active_group['response'].mean()
        placebo_response_rate = placebo_group['response'].mean()

        # Statistical test (Chi-square) on the arm x response contingency table
        contingency_table = pd.crosstab(trial_data['treatment_arm'], trial_data['response'])
        chi2, p_value, dof, expected = chi2_contingency(contingency_table)

        # Wilson score intervals for the per-arm response proportions
        active_ci = self._wilson_ci(active_group['response'].sum(), len(active_group))
        placebo_ci = self._wilson_ci(placebo_group['response'].sum(), len(placebo_group))

        # Secondary endpoint: mean improvement in severity score
        active_improvement = active_group['improvement'].mean()
        placebo_improvement = placebo_group['improvement'].mean()

        # Independent two-sample t-test with SciPy's default settings.
        t_stat, t_p_value = ttest_ind(active_group['improvement'], placebo_group['improvement'])

        results = {
            'primary_endpoint': {
                'active_response_rate': active_response_rate,
                'placebo_response_rate': placebo_response_rate,
                'difference': active_response_rate - placebo_response_rate,
                'p_value': p_value,
                'active_ci': active_ci,
                'placebo_ci': placebo_ci,
                'significant': bool(p_value < 0.05)
            },
            'secondary_endpoint': {
                'active_improvement': active_improvement,
                'placebo_improvement': placebo_improvement,
                'difference': active_improvement - placebo_improvement,
                'p_value': t_p_value,
                'significant': bool(t_p_value < 0.05)
            }
        }

        return results

    def safety_analysis(self, trial_data):
        """Perform regulatory-standard safety analysis.

        Args:
            trial_data: DataFrame with the ``treatment_arm``, ``response``,
                ``adverse_event``, ``serious_ae`` and ``completed_study``
                columns.

        Returns:
            dict with ``adverse_events``, ``serious_adverse_events`` and
            ``discontinuations`` (per-arm rates and counts) plus
            ``risk_benefit`` (absolute benefit and risk differences, NNT, NNH
            and the benefit-risk ratio).
        """
        active_group = trial_data[trial_data['treatment_arm'] == 'Active']
        placebo_group = trial_data[trial_data['treatment_arm'] == 'Placebo']

        # Adverse event rates
        safety_results = {
            'adverse_events': {
                'active_rate': active_group['adverse_event'].mean(),
                'placebo_rate': placebo_group['adverse_event'].mean(),
                'active_count': int(active_group['adverse_event'].sum()),
                'placebo_count': int(placebo_group['adverse_event'].sum())
            },
            'serious_adverse_events': {
                'active_rate': active_group['serious_ae'].mean(),
                'placebo_rate': placebo_group['serious_ae'].mean(),
                'active_count': int(active_group['serious_ae'].sum()),
                'placebo_count': int(placebo_group['serious_ae'].sum())
            },
            'discontinuations': {
                'active_rate': 1 - active_group['completed_study'].mean(),
                'placebo_rate': 1 - placebo_group['completed_study'].mean()
            }
        }

        # Risk-benefit assessment: absolute differences versus placebo
        efficacy_benefit = active_group['response'].mean() - placebo_group['response'].mean()
        safety_risk = active_group['adverse_event'].mean() - placebo_group['adverse_event'].mean()

        # Number needed to treat and harm
        nnt = 1 / efficacy_benefit if efficacy_benefit > 0 else np.inf
        nnh = 1 / safety_risk if safety_risk > 0 else np.inf

        # The ratio is unbounded only when there is real benefit and no excess
        # risk. Without benefit it must not be reported as infinite, otherwise
        # an ineffective drug would pass the benefit-risk criterion.
        if safety_risk > 0:
            benefit_risk_ratio = efficacy_benefit / safety_risk
        elif efficacy_benefit > 0:
            benefit_risk_ratio = np.inf
        else:
            benefit_risk_ratio = 0.0

        safety_results['risk_benefit'] = {
            'efficacy_benefit': efficacy_benefit,
            'safety_risk': safety_risk,
            'nnt': nnt,
            'nnh': nnh,
            'benefit_risk_ratio': benefit_risk_ratio
        }

        return safety_results

    def regulatory_submission_summary(self, efficacy_results, safety_results, trial_data):
        """Generate regulatory submission summary.

        Prints the study overview, efficacy, safety and risk-benefit sections,
        then scores four approval criteria and prints a recommendation.

        Args:
            efficacy_results: Output of ``efficacy_analysis``.
            safety_results: Output of ``safety_analysis``.
            trial_data: The trial DataFrame (used for enrolment counts).

        Returns:
            dict with ``approval_criteria`` (criterion name -> bool),
            ``criteria_met`` (number of criteria passed) and
            ``recommendation`` (text of the overall recommendation).
        """
        total_patients = len(trial_data)
        active_patients = len(trial_data[trial_data['treatment_arm'] == 'Active'])

        print("=== REGULATORY SUBMISSION SUMMARY ===")
        print("\nSTUDY OVERVIEW:")
        print(f"Total patients enrolled: {total_patients}")
        print(f"Active treatment: {active_patients}")
        print(f"Placebo: {total_patients - active_patients}")

        print("\nPRIMARY EFFICACY ENDPOINT:")
        pe = efficacy_results['primary_endpoint']
        print(f"Active response rate: {pe['active_response_rate']:.1%} (95% CI: {pe['active_ci'][0]:.1%}-{pe['active_ci'][1]:.1%})")
        print(f"Placebo response rate: {pe['placebo_response_rate']:.1%} (95% CI: {pe['placebo_ci'][0]:.1%}-{pe['placebo_ci'][1]:.1%})")
        print(f"Treatment difference: {pe['difference']:.1%}")
        print(f"P-value: {pe['p_value']:.4f}")
        print(f"Statistical significance: {'YES' if pe['significant'] else 'NO'}")

        print("\nSECONDARY EFFICACY ENDPOINT:")
        se = efficacy_results['secondary_endpoint']
        print(f"Mean improvement - Active: {se['active_improvement']:.2f}")
        print(f"Mean improvement - Placebo: {se['placebo_improvement']:.2f}")
        print(f"Treatment difference: {se['difference']:.2f}")
        print(f"P-value: {se['p_value']:.4f}")

        print("\nSAFETY PROFILE:")
        ae = safety_results['adverse_events']
        sae = safety_results['serious_adverse_events']
        disc = safety_results['discontinuations']

        print(f"Adverse events - Active: {ae['active_rate']:.1%} ({ae['active_count']} patients)")
        print(f"Adverse events - Placebo: {ae['placebo_rate']:.1%} ({ae['placebo_count']} patients)")
        print(f"Serious AEs - Active: {sae['active_rate']:.1%} ({sae['active_count']} patients)")
        print(f"Serious AEs - Placebo: {sae['placebo_rate']:.1%} ({sae['placebo_count']} patients)")
        print(f"Discontinuation rate - Active: {disc['active_rate']:.1%}")
        print(f"Discontinuation rate - Placebo: {disc['placebo_rate']:.1%}")

        print("\nRISK-BENEFIT ASSESSMENT:")
        rb = safety_results['risk_benefit']
        print(f"Number needed to treat (NNT): {rb['nnt']:.1f}")
        print(f"Number needed to harm (NNH): {rb['nnh']:.1f}")
        print(f"Benefit-risk ratio: {rb['benefit_risk_ratio']:.2f}")

        # Regulatory decision framework. Thresholds: response-rate difference
        # above 15 points, active-arm AE rate below 25%, benefit-risk ratio
        # above 2. Values are stored as plain bools so they count and compare
        # predictably.
        approval_criteria = {
            'efficacy_significant': bool(pe['significant']),
            'clinically_meaningful': bool(pe['difference'] > 0.15),
            'acceptable_safety': bool(ae['active_rate'] < 0.25),
            'positive_benefit_risk': bool(rb['benefit_risk_ratio'] > 2.0)
        }

        meets_criteria = sum(approval_criteria.values())
        total_criteria = len(approval_criteria)

        print("\nREGULATORY ASSESSMENT:")
        for criterion, met in approval_criteria.items():
            print(f"{criterion}: {'PASS' if met else 'FAIL'}")

        print("\nOVERALL RECOMMENDATION:")
        if meets_criteria >= 3:
            recommendation = "APPROVAL RECOMMENDED"
        elif meets_criteria >= 2:
            recommendation = "CONDITIONAL APPROVAL - Additional data needed"
        else:
            recommendation = "APPROVAL NOT RECOMMENDED"

        print(f"{recommendation} ({meets_criteria}/{total_criteria} criteria met)")

        return {
            'approval_criteria': approval_criteria,
            'criteria_met': meets_criteria,
            'recommendation': recommendation
        }

# Task 4 Implementation
print("\n=== Task 4: Regulatory Compliance Modeling ===")

# Initialize regulatory modeling
regulatory = RegulatoryModeling()

# Generate clinical trial data
print("\n1. Generating clinical trial dataset...")
trial_data = regulatory.generate_clinical_trial_data(400)
print(f"Generated trial data for {len(trial_data)} patients")
print("\nTrial data preview:")
print(trial_data[['patient_id', 'age', 'treatment_arm', 'baseline_severity', 'response', 'adverse_event']].head())

# Efficacy and safety are analysed independently on the same trial frame;
# both result dicts feed the submission summary below.
print("\n2. Performing efficacy analysis...")
efficacy_results = regulatory.efficacy_analysis(trial_data)

print("\n3. Performing safety analysis...")
safety_results = regulatory.safety_analysis(trial_data)

print("\n4. Generating regulatory submission summary...")
submission = regulatory.regulatory_submission_summary(efficacy_results, safety_results, trial_data)

# Update final progress
task_scores['task_4_regulatory_modeling'] = 25
skills_developed['regulatory_compliance'] = True
# Derive the totals from task_scores instead of incrementing them: re-running
# this cell must not double-count the task or push the score past total_points.
tasks_completed = sum(1 for score in task_scores.values() if score > 0)
current_score = sum(task_scores.values())

print(f"\n✓ Task 4 completed! Score: 25/25")
print(f"Progress: {tasks_completed}/4 tasks completed")
print(f"Current Score: {current_score}/{total_points} points")

# Week 9 Summary
print("\n" + "="*50)
print("WEEK 9 CHECKPOINT COMPLETED!")
print("="*50)
print(f"Final Score: {current_score}/{total_points} points")
print(f"\nTasks Completed:")
for task, score in task_scores.items():
    print(f"  {task}: {score}/25 points")
    
print(f"\nSkills Developed:")
for skill, developed in skills_developed.items():
    print(f"  {skill}: {'✓' if developed else '✗'}")
    
print(f"\nNext Steps: Proceed to Week 10 - Integration and Validation")