In [None]:
# ChemML Integration Setup
# (The original cell had lost its line breaks, which turned the import and the
# print below into part of this comment so neither ever executed.)
import chemml

print(f'🧪 ChemML {chemml.__version__} loaded for this notebook')

# Week 10 Checkpoint: Integration and Validation

## Learning Objectives
- Integrate multiple computational approaches into unified workflows
- Validate computational predictions against experimental data
- Develop comprehensive model evaluation frameworks
- Implement cross-validation and uncertainty quantification

## Progress Tracking Variables

In [None]:
# Week 10 progress tracking: checkpoint metadata plus per-task / per-skill state.
week_number, week_topic = 10, "Integration and Validation"
total_points = 100

# Running totals; later cells in this notebook update them as tasks finish.
tasks_completed = current_score = 0

# Points earned per task -- every task starts at zero.
task_scores = dict.fromkeys(
    (
        'task_1_workflow_integration',
        'task_2_experimental_validation',
        'task_3_uncertainty_quantification',
        'task_4_model_evaluation',
    ),
    0,
)

# Skills assessment -- every skill starts as "not yet demonstrated".
skills_developed = dict.fromkeys(
    (
        'workflow_design',
        'validation_strategies',
        'uncertainty_analysis',
        'model_evaluation',
    ),
    False,
)

# Initial status banner (one line per item).
print(
    f"Week {week_number}: {week_topic}",
    f"Progress: {tasks_completed}/4 tasks completed",
    f"Current Score: {current_score}/{total_points} points",
    sep="\n",
)

## Task 1: Workflow Integration (25 points)

Develop integrated computational workflows combining multiple methodologies.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from rdkit import Chem
from rdkit.Chem import Descriptors
import warnings
warnings.filterwarnings('ignore')

class IntegratedWorkflow:
    """Integrated computational drug discovery pipeline.

    Simulates a table of compounds with basic, QM-style and MD-style
    descriptors, trains one random forest per descriptor subset, and compares
    the resulting hold-out scores.
    """

    # Column order of the frame returned by ``generate_dataset``.
    DATASET_COLUMNS = (
        'compound_id',
        'mw', 'logp', 'hbd', 'hba',
        'homo_energy', 'lumo_energy', 'dipole',
        'sasa', 'rg',
        'target_activity',
    )

    # Floor applied to the simulated molecular weight. The weight is drawn from
    # an unbounded normal distribution, and a negative draw would make
    # ``np.sqrt(mw / 100)`` (used for ``rg``) evaluate to NaN.
    MIN_MOLECULAR_WEIGHT = 50.0

    def __init__(self):
        # feature-set name -> fitted RandomForestRegressor (filled by build_models)
        self.models = {}
        # feature-set name -> {feature name: importance} (filled by build_models)
        self.feature_importance = {}

    def generate_dataset(self, n_compounds=600, random_state=None):
        """Generate a simulated molecular dataset.

        Args:
            n_compounds: Number of rows to simulate.
            random_state: Optional integer seed. When ``None`` (the default)
                the global NumPy random state is used, exactly as before;
                when given, a private ``RandomState`` makes the result
                reproducible.

        Returns:
            ``pandas.DataFrame`` with one row per compound and the columns
            listed in ``DATASET_COLUMNS`` (present even when
            ``n_compounds`` is 0).
        """
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        compounds = []
        for i in range(n_compounds):
            # Simulate molecular properties (weight is floored, see class constant)
            mw = max(self.MIN_MOLECULAR_WEIGHT, rng.normal(350, 100))
            logp = rng.normal(2.5, 1.5)
            hbd = rng.randint(0, 6)
            hba = rng.randint(1, 11)

            # QM properties
            homo_energy = rng.normal(-6.0, 1.5)
            lumo_energy = rng.normal(-1.0, 1.0)
            dipole = rng.gamma(2, 1.5)

            # MD properties
            sasa = mw * 0.5 + rng.normal(0, 50)
            rg = np.sqrt(mw / 100) + rng.normal(0, 0.3)

            # Target activity: a baseline penalised for distance from
            # mw = 400 and logp = 3, shifted by the HOMO energy, plus noise.
            activity_base = 5.0
            mw_effect = -abs(mw - 400) * 0.002
            logp_effect = -abs(logp - 3) * 0.3
            homo_effect = (homo_energy + 6) * 0.1

            activity = activity_base + mw_effect + logp_effect + homo_effect + rng.normal(0, 0.8)

            compounds.append({
                'compound_id': f'COMP_{i:05d}',
                'mw': mw, 'logp': logp, 'hbd': hbd, 'hba': hba,
                'homo_energy': homo_energy, 'lumo_energy': lumo_energy, 'dipole': dipole,
                'sasa': sasa, 'rg': rg,
                'target_activity': max(0, activity)  # activity is clipped at zero
            })

        # Passing the column list keeps the schema stable for an empty dataset.
        return pd.DataFrame(compounds, columns=list(self.DATASET_COLUMNS))

    def build_models(self, data):
        """Build one random forest per feature combination.

        Each feature set is split 80/20 (``random_state=42``), standardized
        with statistics from the training rows, and scored on the held-out
        rows.

        Args:
            data: DataFrame containing the descriptor columns and
                ``target_activity``.

        Returns:
            dict mapping feature-set name to a dict with keys ``r2``,
            ``rmse``, ``model``, ``features`` and ``scaler`` (the fitted
            ``StandardScaler``; needed to feed new rows to ``model``).
            The fitted models and their feature importances are also stored on
            ``self.models`` and ``self.feature_importance``.
        """
        feature_sets = {
            'basic': ['mw', 'logp', 'hbd', 'hba'],
            'qm': ['mw', 'logp', 'hbd', 'hba', 'homo_energy', 'lumo_energy', 'dipole'],
            'md': ['mw', 'logp', 'hbd', 'hba', 'sasa', 'rg'],
            'integrated': ['mw', 'logp', 'hbd', 'hba', 'homo_energy', 'lumo_energy', 'dipole', 'sasa', 'rg']
        }

        y = data['target_activity'].values
        results = {}

        for name, features in feature_sets.items():
            X = data[features].values
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Fit the scaler on training rows only, then reuse it for the test rows.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train_scaled, y_train)

            y_pred = model.predict(X_test_scaled)
            r2 = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            results[name] = {
                'r2': r2, 'rmse': rmse, 'model': model,
                'features': features, 'scaler': scaler,
            }
            self.models[name] = model
            self.feature_importance[name] = dict(zip(features, model.feature_importances_))

        return results

    def analyze_integration(self, results):
        """Compare the feature sets' hold-out performance.

        Args:
            results: Mapping as returned by ``build_models``.

        Returns:
            DataFrame with columns ``model``, ``r2`` and ``rmse`` (one row per
            feature set). Also shows side-by-side bar charts of both metrics.
        """
        # Performance comparison
        perf_df = pd.DataFrame([
            {'model': name, 'r2': result['r2'], 'rmse': result['rmse']}
            for name, result in results.items()
        ])

        # Visualization
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        perf_df.plot(x='model', y='r2', kind='bar', ax=axes[0], color='skyblue')
        axes[0].set_title('Model Performance (R²)')
        axes[0].set_ylabel('R² Score')

        perf_df.plot(x='model', y='rmse', kind='bar', ax=axes[1], color='lightcoral')
        axes[1].set_title('Model Performance (RMSE)')
        axes[1].set_ylabel('RMSE')

        plt.tight_layout()
        plt.show()

        return perf_df

# Task 1 Implementation
print("=== Task 1: Workflow Integration ===")

workflow = IntegratedWorkflow()

print("\n1. Generating comprehensive dataset...")
data = workflow.generate_dataset(500)
print(f"Generated {len(data)} compounds")
print("\nDataset preview:")
print(data.head())

print("\n2. Building integrated models...")
model_results = workflow.build_models(data)

print("\n3. Analyzing integration benefits...")
performance = workflow.analyze_integration(model_results)

print("\nPerformance Summary:")
print(performance.round(3))

# Hold-out R² of the full descriptor set versus the four basic descriptors.
integrated_r2 = model_results['integrated']['r2']
basic_r2 = model_results['basic']['r2']
improvement = integrated_r2 - basic_r2

print(f"\nIntegration benefit: {improvement:.3f} R² improvement")
# R² can be zero or negative on a hold-out set; a percentage relative to such
# a baseline is undefined or has a misleading sign, so only report it when the
# baseline is positive.
if basic_r2 > 0:
    print(f"Relative improvement: {(improvement/basic_r2)*100:.1f}%")
else:
    print(f"Relative improvement: n/a (baseline R² = {basic_r2:.3f} is not positive)")

In [None]:
# Update progress for Task 1
task_scores['task_1_workflow_integration'] = 25
skills_developed['workflow_design'] = True

# Derive the totals from task_scores instead of incrementing them, so that
# re-running this cell cannot count the task twice.
tasks_completed = sum(1 for points in task_scores.values() if points > 0)
current_score = sum(task_scores.values())

print(f"\n✓ Task 1 completed! Score: 25/25")
print(f"Progress: {tasks_completed}/4 tasks completed")
print(f"Current Score: {current_score}/{total_points} points")

## Task 2: Experimental Validation (25 points)

Validate computational predictions against experimental data and develop validation frameworks.

In [None]:
class ExperimentalValidation:
    """Framework for validating computational predictions.

    Provides a simulator for "experimental" measurements derived from
    predictions, and a metrics/plot routine comparing the two.
    """

    def __init__(self):
        # Most recent simulated measurements: {'values': ndarray, 'noise_level': float}
        self.experimental_data = {}
        # Metrics dict from the most recent validate_predictions call
        self.validation_metrics = {}

    def generate_experimental_data(self, computational_predictions, noise_level=0.3, random_state=None):
        """Simulate experimental validation data.

        Each prediction is shifted by a fixed bias of -0.2, perturbed with
        Gaussian noise, and with probability 0.1 additionally displaced by an
        outlier term of magnitude 1-3 in either direction. Values are clipped
        at zero.

        Args:
            computational_predictions: Iterable of predicted values.
            noise_level: Standard deviation of the Gaussian noise.
            random_state: Optional integer seed. ``None`` (default) uses the
                global NumPy random state, as before.

        Returns:
            1-D ``numpy.ndarray`` with one simulated measurement per prediction.
        """
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        bias = -0.2  # systematic offset applied to every measurement

        experimental_values = []
        for pred in computational_predictions:
            noise = rng.normal(0, noise_level)
            exp_value = pred + bias + noise

            # Occasional outliers
            if rng.random_sample() < 0.1:
                exp_value += rng.choice([-2, 2]) * rng.uniform(0.5, 1.5)

            experimental_values.append(max(0, exp_value))

        values = np.array(experimental_values)
        self.experimental_data = {'values': values, 'noise_level': noise_level}
        return values

    def validate_predictions(self, predictions, experimental):
        """Compare predictions with experimental values.

        Args:
            predictions: Array-like of predicted values.
            experimental: Array-like of measured values, same shape.

        Returns:
            dict with ``r2_score``, ``pearson_correlation``, ``mae``, ``rmse``
            and ``outlier_percentage`` (share of residuals beyond two residual
            standard deviations). The dict is also kept on
            ``self.validation_metrics``. A parity plot and a residual plot are
            shown as a side effect.

        Raises:
            ValueError: If the inputs differ in shape or hold fewer than two
                values.
        """
        from scipy.stats import pearsonr

        # Accept lists/Series as well as arrays; the residual arithmetic below
        # needs element-wise subtraction.
        predictions = np.asarray(predictions, dtype=float)
        experimental = np.asarray(experimental, dtype=float)
        if predictions.shape != experimental.shape:
            raise ValueError(
                f"predictions and experimental must have the same shape, "
                f"got {predictions.shape} and {experimental.shape}"
            )
        if predictions.size < 2:
            raise ValueError("At least two data points are required for validation")

        # Basic metrics
        r2 = r2_score(experimental, predictions)
        pearson_r, _p_value = pearsonr(predictions, experimental)
        mae = mean_absolute_error(experimental, predictions)
        rmse = np.sqrt(mean_squared_error(experimental, predictions))

        # Outlier detection
        residuals = experimental - predictions
        outlier_threshold = 2 * np.std(residuals)
        outliers = np.abs(residuals) > outlier_threshold

        validation_metrics = {
            'r2_score': r2,
            'pearson_correlation': pearson_r,
            'mae': mae,
            'rmse': rmse,
            'outlier_percentage': (outliers.sum() / len(outliers)) * 100
        }
        self.validation_metrics = validation_metrics

        # Visualization
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        # Prediction vs Experimental; the identity line spans both axes' data
        # so it stays visible when the two value ranges differ.
        axis_min = min(predictions.min(), experimental.min())
        axis_max = max(predictions.max(), experimental.max())
        axes[0].scatter(predictions, experimental, alpha=0.6)
        axes[0].plot([axis_min, axis_max], [axis_min, axis_max], 'r--')
        axes[0].set_xlabel('Predicted Activity')
        axes[0].set_ylabel('Experimental Activity')
        axes[0].set_title(f'Predicted vs Experimental (R² = {r2:.3f})')

        # Residuals plot
        axes[1].scatter(predictions, residuals, alpha=0.6)
        axes[1].axhline(y=0, color='r', linestyle='--')
        axes[1].set_xlabel('Predicted Activity')
        axes[1].set_ylabel('Residuals')
        axes[1].set_title('Residuals Plot')

        plt.tight_layout()
        plt.show()

        return validation_metrics

# Task 2 Implementation
print("\n=== Task 2: Experimental Validation ===")

validator = ExperimentalValidation()

# Use predictions from integrated model
integrated_model = model_results['integrated']['model']
features = model_results['integrated']['features']

# NOTE: despite its name, X_test holds every compound in `data`, including
# the rows the model was trained on.
X_test = data[features].values

# The model was trained on features standardized with statistics from its
# training split only. Re-create that split (same test_size / random_state as
# IntegratedWorkflow.build_models) and fit the scaler on it, so the inputs are
# scaled exactly as they were during training. Fitting a new scaler on all
# rows would shift every feature relative to what the model learned.
X_train_reference = train_test_split(X_test, test_size=0.2, random_state=42)[0]
scaler = StandardScaler().fit(X_train_reference)
X_test_scaled = scaler.transform(X_test)
predictions = integrated_model.predict(X_test_scaled)

print("\n1. Generating experimental validation data...")
experimental_data = validator.generate_experimental_data(predictions, noise_level=0.4)
print(f"Generated experimental data for {len(experimental_data)} compounds")

print("\n2. Validating computational predictions...")
validation_results = validator.validate_predictions(predictions, experimental_data)

print("\nValidation Results:")
for metric, value in validation_results.items():
    if isinstance(value, float):
        print(f"{metric}: {value:.3f}")
    else:
        print(f"{metric}: {value}")

# Update progress for Task 2
task_scores['task_2_experimental_validation'] = 25
skills_developed['validation_strategies'] = True

# Derive the totals from task_scores instead of incrementing them, so that
# re-running this cell cannot count the task twice.
tasks_completed = sum(1 for points in task_scores.values() if points > 0)
current_score = sum(task_scores.values())

print(f"\n✓ Task 2 completed! Score: 25/25")
print(f"Progress: {tasks_completed}/4 tasks completed")
print(f"Current Score: {current_score}/{total_points} points")

## Task 3: Uncertainty Quantification (25 points)

Implement uncertainty quantification methods for computational predictions.

In [None]:
class UncertaintyQuantification:
    """Framework for quantifying uncertainty in computational predictions.

    Uncertainty is estimated from the disagreement between random forests
    trained on bootstrap resamples of the training data. All statistics are
    computed over the ensemble members' predictions.
    """

    def __init__(self):
        # Fitted ensemble members (filled by bootstrap_ensemble)
        self.bootstrap_models = []
        # Summary dict from the most recent uncertainty_vs_error_analysis call
        self.uncertainty_metrics = {}

    def bootstrap_ensemble(self, X, y, n_estimators=100, sample_fraction=0.8, random_state=None):
        """Create bootstrap ensemble for uncertainty estimation.

        Args:
            X: Feature matrix (array-like, one row per sample).
            y: Target values (array-like, one per row of ``X``).
            n_estimators: Number of ensemble members to train.
            sample_fraction: Size of each bootstrap sample as a fraction of
                ``len(X)``; rows are drawn with replacement.
            random_state: Optional integer seed for the resampling. ``None``
                (default) uses the global NumPy random state, as before.

        Returns:
            The list of fitted models (also kept on ``self.bootstrap_models``).

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or the bootstrap
                sample would be empty.
        """
        # Positional fancy indexing below requires arrays, not lists/frames.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")

        n_samples = int(len(X) * sample_fraction)
        if n_samples < 1:
            raise ValueError("Bootstrap sample is empty; provide more data or a larger sample_fraction")

        rng = np.random if random_state is None else np.random.RandomState(random_state)

        self.bootstrap_models = []
        for i in range(n_estimators):
            # Bootstrap sampling
            indices = rng.choice(len(X), n_samples, replace=True)

            # Train model on bootstrap sample; each member gets its own seed
            model = RandomForestRegressor(n_estimators=50, random_state=i)
            model.fit(X[indices], y[indices])
            self.bootstrap_models.append(model)

        return self.bootstrap_models

    def predict_with_uncertainty(self, X):
        """Make predictions with uncertainty estimates.

        Args:
            X: Feature matrix to predict on.

        Returns:
            dict with ``mean``, ``std``, ``ci_lower`` and ``ci_upper``
            (2.5th / 97.5th percentiles across ensemble members), each of
            length ``len(X)``, plus ``all_predictions`` with shape
            ``(n_models, len(X))``.

        Raises:
            ValueError: If no ensemble has been trained.
        """
        if not self.bootstrap_models:
            raise ValueError("Bootstrap ensemble not trained")

        # Get predictions from all models: rows = models, columns = samples
        predictions = np.array([model.predict(X) for model in self.bootstrap_models])

        return {
            'mean': np.mean(predictions, axis=0),
            'std': np.std(predictions, axis=0),
            # Confidence intervals (95%)
            'ci_lower': np.percentile(predictions, 2.5, axis=0),
            'ci_upper': np.percentile(predictions, 97.5, axis=0),
            'all_predictions': predictions
        }

    def calculate_prediction_intervals(self, X, confidence_level=0.95):
        """Calculate prediction intervals.

        The bounds are symmetric percentiles of the ensemble members'
        predictions, so they describe ensemble spread only.

        Args:
            X: Feature matrix to predict on.
            confidence_level: Central mass of the interval, in ``(0, 1]``.

        Returns:
            dict with ``lower_bound``, ``upper_bound``, ``interval_width`` and
            the ``confidence_level`` that was used.

        Raises:
            ValueError: If ``confidence_level`` is outside ``(0, 1]`` or no
                ensemble has been trained.
        """
        if not 0 < confidence_level <= 1:
            raise ValueError(f"confidence_level must be in (0, 1], got {confidence_level}")

        results = self.predict_with_uncertainty(X)

        alpha = 1 - confidence_level
        lower_percentile = (alpha/2) * 100
        upper_percentile = (1 - alpha/2) * 100

        lower_bound = np.percentile(results['all_predictions'], lower_percentile, axis=0)
        upper_bound = np.percentile(results['all_predictions'], upper_percentile, axis=0)

        return {
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'interval_width': upper_bound - lower_bound,
            'confidence_level': confidence_level
        }

    def uncertainty_vs_error_analysis(self, X, y_true):
        """Analyze relationship between uncertainty and prediction error.

        Args:
            X: Feature matrix to predict on.
            y_true: True target values (array-like, one per row of ``X``).

        Returns:
            dict with ``uncertainty_error_correlation``, ``calibration_data``
            (list of per-bin dicts, bins ordered by increasing uncertainty),
            ``mean_uncertainty`` and ``mean_absolute_error``. The dict is also
            kept on ``self.uncertainty_metrics``. A 2x2 diagnostic figure is
            shown as a side effect.

        Raises:
            ValueError: If ``y_true`` does not match the predictions in shape,
                fewer than two samples are given, or no ensemble is trained.
        """
        results = self.predict_with_uncertainty(X)

        predictions = results['mean']
        uncertainties = results['std']

        # Accept lists/Series: positional indexing is used further down.
        y_true = np.asarray(y_true, dtype=float)
        if y_true.shape != predictions.shape:
            raise ValueError(
                f"y_true has shape {y_true.shape} but predictions have shape {predictions.shape}"
            )
        n_points = len(predictions)
        if n_points < 2:
            raise ValueError("At least two samples are required for the analysis")

        # Calculate absolute errors
        abs_errors = np.abs(predictions - y_true)

        # Correlation between uncertainty and error
        correlation = np.corrcoef(uncertainties, abs_errors)[0, 1]

        # Calibration: bin samples by increasing uncertainty and compare the
        # mean uncertainty with the mean error in each bin. The bin count is
        # capped at the sample count so no bin is ever empty (an empty bin
        # would yield NaN means).
        sorted_indices = np.argsort(uncertainties)
        n_bins = min(10, n_points)
        bin_size = n_points // n_bins

        calibration_data = []
        for i in range(n_bins):
            start_idx = i * bin_size
            # The last bin absorbs the remainder.
            end_idx = (i + 1) * bin_size if i < n_bins - 1 else n_points
            bin_indices = sorted_indices[start_idx:end_idx]

            calibration_data.append({
                'bin': i + 1,
                'mean_uncertainty': np.mean(uncertainties[bin_indices]),
                'mean_error': np.mean(abs_errors[bin_indices])
            })

        # Visualization
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Uncertainty vs Error scatter plot
        axes[0, 0].scatter(uncertainties, abs_errors, alpha=0.6)
        axes[0, 0].set_xlabel('Prediction Uncertainty (Std Dev)')
        axes[0, 0].set_ylabel('Absolute Error')
        axes[0, 0].set_title(f'Uncertainty vs Error (r = {correlation:.3f})')

        # Prediction intervals visualization (random subset of at most 50 samples)
        sample_indices = np.random.choice(n_points, min(50, n_points), replace=False)
        x_pos = range(len(sample_indices))

        axes[0, 1].errorbar(x_pos, predictions[sample_indices],
                           yerr=1.96*uncertainties[sample_indices],
                           fmt='o', alpha=0.7, capsize=3)
        axes[0, 1].scatter(x_pos, y_true[sample_indices], color='red', alpha=0.7, label='True Values')
        axes[0, 1].set_xlabel('Sample Index')
        axes[0, 1].set_ylabel('Activity Value')
        axes[0, 1].set_title('Prediction Intervals (95% CI)')
        axes[0, 1].legend()

        # Calibration plot with a y = x reference line
        cal_df = pd.DataFrame(calibration_data)
        max_uncertainty = cal_df['mean_uncertainty'].max()
        axes[1, 0].plot(cal_df['mean_uncertainty'], cal_df['mean_error'], 'bo-')
        axes[1, 0].plot([0, max_uncertainty], [0, max_uncertainty], 'r--')
        axes[1, 0].set_xlabel('Mean Predicted Uncertainty')
        axes[1, 0].set_ylabel('Mean Absolute Error')
        axes[1, 0].set_title('Uncertainty Calibration')

        # Uncertainty distribution
        axes[1, 1].hist(uncertainties, bins=30, alpha=0.7, edgecolor='black')
        axes[1, 1].set_xlabel('Prediction Uncertainty')
        axes[1, 1].set_ylabel('Frequency')
        axes[1, 1].set_title('Distribution of Uncertainties')

        plt.tight_layout()
        plt.show()

        self.uncertainty_metrics = {
            'uncertainty_error_correlation': correlation,
            'calibration_data': calibration_data,
            'mean_uncertainty': np.mean(uncertainties),
            'mean_absolute_error': np.mean(abs_errors)
        }
        return self.uncertainty_metrics

# Task 3 Implementation
print("\n=== Task 3: Uncertainty Quantification ===")

uq = UncertaintyQuantification()

# Use the same data from previous tasks.
# The target column produced by IntegratedWorkflow.generate_dataset is
# 'target_activity'; indexing data['activity'] raises a KeyError.
X = data[features].values
y = data['target_activity'].values

# Split data for uncertainty analysis
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features (statistics come from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n1. Training bootstrap ensemble...")
bootstrap_models = uq.bootstrap_ensemble(X_train_scaled, y_train, n_estimators=50)
print(f"Trained {len(bootstrap_models)} bootstrap models")

print("\n2. Making predictions with uncertainty...")
uncertainty_results = uq.predict_with_uncertainty(X_test_scaled)
print(f"Mean prediction uncertainty: {np.mean(uncertainty_results['std']):.3f}")
print(f"Range of uncertainties: {np.min(uncertainty_results['std']):.3f} - {np.max(uncertainty_results['std']):.3f}")

print("\n3. Calculating prediction intervals...")
intervals = uq.calculate_prediction_intervals(X_test_scaled, confidence_level=0.95)
print(f"Mean 95% prediction interval width: {np.mean(intervals['interval_width']):.3f}")

print("\n4. Analyzing uncertainty vs error relationship...")
analysis_results = uq.uncertainty_vs_error_analysis(X_test_scaled, y_test)

print("\nUncertainty Analysis Results:")
for key, value in analysis_results.items():
    # Only scalar entries are printed; the per-bin calibration list is skipped.
    if isinstance(value, (int, float)):
        print(f"{key}: {value:.3f}")

# Update progress for Task 3
task_scores['task_3_uncertainty_quantification'] = 25
skills_developed['uncertainty_analysis'] = True

# Derive the totals from task_scores instead of incrementing them, so that
# re-running this cell cannot count the task twice.
tasks_completed = sum(1 for points in task_scores.values() if points > 0)
current_score = sum(task_scores.values())

print(f"\n✓ Task 3 completed! Score: 25/25")
print(f"Progress: {tasks_completed}/4 tasks completed")
print(f"Current Score: {current_score}/{total_points} points")

## Task 4: Comprehensive Model Evaluation (25 points)

Develop comprehensive frameworks for evaluating model performance across multiple dimensions.

In [None]:
class ComprehensiveModelEvaluation:
    """Framework for comprehensive model evaluation and comparison.

    Bundles cross-validation, learning curves, feature importance,
    interpretability and stability analyses, plus a text report that
    summarises them.
    """

    def __init__(self):
        # Results dict passed to the most recent generate_evaluation_report call
        self.evaluation_results = {}
        # Not written by any method of this class
        self.comparison_metrics = {}

    def cross_validation_analysis(self, models, X, y, cv_folds=5):
        """Perform comprehensive cross-validation analysis.

        Args:
            models: Mapping of display name to an unfitted estimator.
            X: Feature matrix.
            y: Target values.
            cv_folds: Number of folds (or any value accepted as ``cv``).

        Returns:
            dict ``{model name: {metric: stats}}`` where metric is one of
            ``r2``, ``mae``, ``rmse`` and stats holds ``mean``, ``std``,
            ``min``, ``max`` and the per-fold ``scores``. Error metrics are
            reported as positive numbers.
        """
        from sklearn.model_selection import cross_validate

        # Reported metric name -> sklearn scorer. A single cross_validate call
        # evaluates all three scorers on the same fitted folds, instead of
        # refitting every fold once per metric.
        scorers = {
            'r2': 'r2',
            'mae': 'neg_mean_absolute_error',
            'rmse': 'neg_root_mean_squared_error',
        }

        results = {}
        for name, model in models.items():
            print(f"\nEvaluating {name}...")

            fold_scores = cross_validate(model, X, y, cv=cv_folds, scoring=scorers)

            stats = {}
            for metric_name, scorer in scorers.items():
                scores = fold_scores[f'test_{metric_name}']
                if scorer.startswith('neg_'):
                    scores = -scores  # Convert negative scores to positive

                stats[metric_name] = {
                    'mean': np.mean(scores),
                    'std': np.std(scores),
                    'min': np.min(scores),
                    'max': np.max(scores),
                    'scores': scores
                }

            results[name] = stats

        return results

    def learning_curve_analysis(self, model, X, y, train_sizes=None):
        """Analyze learning curves to assess model performance vs training size.

        Args:
            model: Unfitted estimator.
            X: Feature matrix.
            y: Target values.
            train_sizes: Training-set sizes to evaluate; defaults to ten
                fractions evenly spaced from 0.1 to 1.0.

        Returns:
            dict with ``train_sizes`` (absolute sizes) and ``train_scores`` /
            ``val_scores``, each a dict of per-size ``mean`` and ``std`` of
            the R² score over 5 folds. The curves are plotted as a side
            effect.
        """
        from sklearn.model_selection import learning_curve

        if train_sizes is None:
            train_sizes = np.linspace(0.1, 1.0, 10)

        train_sizes_abs, train_scores, val_scores = learning_curve(
            model, X, y, train_sizes=train_sizes, cv=5, scoring='r2'
        )

        # Calculate statistics across folds (axis 1)
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)

        # Plot learning curves with a +/- one standard deviation band
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(train_sizes_abs, train_mean, 'o-', color='blue', label='Training Score')
        ax.fill_between(train_sizes_abs, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')

        ax.plot(train_sizes_abs, val_mean, 'o-', color='red', label='Validation Score')
        ax.fill_between(train_sizes_abs, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')

        ax.set_xlabel('Training Set Size')
        ax.set_ylabel('R² Score')
        ax.set_title('Learning Curves')
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.show()

        return {
            'train_sizes': train_sizes_abs,
            'train_scores': {'mean': train_mean, 'std': train_std},
            'val_scores': {'mean': val_mean, 'std': val_std}
        }

    def feature_importance_analysis(self, models, feature_names):
        """Analyze feature importance across different models.

        Args:
            models: Mapping of display name to a fitted model. Models exposing
                ``feature_importances_`` use it directly; models exposing
                ``coef_`` use its absolute value; other models are skipped.
            feature_names: Names used as x-axis tick labels, one per feature.

        Returns:
            dict ``{model name: importance array}``, or ``None`` when no model
            provides importance information. A grouped bar chart is shown as a
            side effect.
        """
        importance_data = {}

        for name, model in models.items():
            if hasattr(model, 'feature_importances_'):
                importance_data[name] = model.feature_importances_
            elif hasattr(model, 'coef_'):
                importance_data[name] = np.abs(model.coef_)

        if not importance_data:
            print("No feature importance data available for provided models")
            return None

        # Create comparison plot: one bar group per feature, one bar per model
        fig, ax = plt.subplots(figsize=(12, 8))

        x = np.arange(len(feature_names))
        width = 0.8 / len(importance_data)

        for i, (model_name, importances) in enumerate(importance_data.items()):
            ax.bar(x + i * width, importances, width, label=model_name, alpha=0.8)

        ax.set_xlabel('Features')
        ax.set_ylabel('Importance')
        ax.set_title('Feature Importance Comparison')
        # Centre each tick under its group of bars
        ax.set_xticks(x + width * (len(importance_data) - 1) / 2)
        ax.set_xticklabels(feature_names, rotation=45, ha='right')
        ax.legend()

        plt.tight_layout()
        plt.show()

        return importance_data

    def model_interpretability_analysis(self, models, X_sample, feature_names):
        """Analyze model interpretability using various methods.

        Args:
            models: Mapping of display name to a fitted model.
            X_sample: Feature rows; the first five are predicted.
            feature_names: Feature names aligned with the model's inputs.

        Returns:
            dict ``{model name: info}``. ``info`` always contains
            ``sample_predictions``; for models exposing
            ``feature_importances_`` it also contains ``top_features`` and
            ``top_importances`` (up to ten entries, in ascending order of
            importance).
        """
        interpretability_results = {}

        for name, model in models.items():
            print(f"\nAnalyzing interpretability for {name}...")

            # Start from an empty entry so models without feature importances
            # still get their sample predictions recorded (previously this
            # raised a KeyError).
            entry = {}

            # Feature importance (if available)
            if hasattr(model, 'feature_importances_'):
                top_features = np.argsort(model.feature_importances_)[-10:]
                entry['top_features'] = [feature_names[i] for i in top_features]
                entry['top_importances'] = model.feature_importances_[top_features]

            # Prediction analysis on sample
            entry['sample_predictions'] = model.predict(X_sample[:5])  # First 5 samples
            interpretability_results[name] = entry

        return interpretability_results

    def performance_stability_analysis(self, models, X, y, n_runs=10):
        """Analyze performance stability across multiple runs.

        Each run uses a different 70/30 train/test split (``random_state`` =
        run index) and a fresh clone of the model.

        Args:
            models: Mapping of display name to an estimator.
            X: Feature matrix.
            y: Target values.
            n_runs: Number of random splits per model.

        Returns:
            dict ``{model name: stats}`` with the per-run ``scores`` (from the
            estimator's ``score`` method), their ``mean`` and ``std``, and the
            ``coefficient_of_variation`` (``std / |mean|``; ``inf`` when the
            mean is exactly zero). A bar chart with error bars is shown as a
            side effect.
        """
        from sklearn.base import clone

        stability_results = {}

        for name, model in models.items():
            print(f"Analyzing stability for {name}...")

            scores = []
            for run in range(n_runs):
                # Random train/test split for each run
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.3, random_state=run
                )

                # Clone and train model so the caller's instance is untouched
                model_clone = clone(model)
                model_clone.fit(X_train, y_train)

                # Evaluate
                scores.append(model_clone.score(X_test, y_test))

            mean_score = np.mean(scores)
            std_score = np.std(scores)
            # Scores can be zero or negative; dividing by the signed mean
            # would flip the sign of the ratio (making an unstable model look
            # like the "most stable" one) or divide by zero.
            if mean_score != 0:
                variation = std_score / abs(mean_score)
            else:
                variation = np.inf

            stability_results[name] = {
                'scores': scores,
                'mean': mean_score,
                'std': std_score,
                'coefficient_of_variation': variation
            }

        # Visualization
        fig, ax = plt.subplots(figsize=(10, 6))

        model_names = list(stability_results.keys())
        means = [stability_results[name]['mean'] for name in model_names]
        stds = [stability_results[name]['std'] for name in model_names]

        x_pos = np.arange(len(model_names))
        ax.bar(x_pos, means, yerr=stds, capsize=5, alpha=0.7)
        ax.set_xlabel('Models')
        ax.set_ylabel('R² Score')
        ax.set_title('Model Performance Stability')
        ax.set_xticks(x_pos)
        ax.set_xticklabels(model_names, rotation=45)

        plt.tight_layout()
        plt.show()

        return stability_results

    def generate_evaluation_report(self, all_results):
        """Generate comprehensive evaluation report.

        Args:
            all_results: dict that may contain the keys ``cross_validation``,
                ``stability`` and ``feature_importance``; only the sections
                that are present are printed.

        Returns:
            ``all_results`` unchanged (also kept on
            ``self.evaluation_results``).
        """
        self.evaluation_results = all_results

        print("\n" + "="*60)
        print("COMPREHENSIVE MODEL EVALUATION REPORT")
        print("="*60)

        # Summary statistics
        if 'cross_validation' in all_results:
            print("\n1. CROSS-VALIDATION RESULTS:")
            cv_results = all_results['cross_validation']

            for model_name, metrics in cv_results.items():
                print(f"\n{model_name}:")
                for metric_name, stats in metrics.items():
                    print(f"  {metric_name}: {stats['mean']:.3f} ± {stats['std']:.3f}")

        # Stability analysis
        if 'stability' in all_results:
            print("\n2. PERFORMANCE STABILITY:")
            stability_results = all_results['stability']

            for model_name, stats in stability_results.items():
                cv = stats['coefficient_of_variation']
                print(f"  {model_name}: CV = {cv:.3f} (lower is more stable)")

        # Feature importance
        if 'feature_importance' in all_results:
            print("\n3. FEATURE IMPORTANCE ANALYSIS:")
            print("  Feature importance data available for model comparison")

        # Model recommendations
        print("\n4. RECOMMENDATIONS:")

        if all_results.get('cross_validation'):
            # Find best performing model by mean cross-validated R²
            # (the first model wins ties).
            best_model, best_metrics = max(
                all_results['cross_validation'].items(),
                key=lambda item: item[1]['r2']['mean'],
            )
            best_score = best_metrics['r2']['mean']
            print(f"  - Best overall performance: {best_model} (R² = {best_score:.3f})")

        if all_results.get('stability'):
            # Find most stable model (smallest coefficient of variation)
            most_stable = min(all_results['stability'].items(),
                            key=lambda x: x[1]['coefficient_of_variation'])
            print(f"  - Most stable model: {most_stable[0]} (CV = {most_stable[1]['coefficient_of_variation']:.3f})")

        print("\n" + "="*60)

        return all_results

# Task 4 Implementation
print("\n=== Task 4: Comprehensive Model Evaluation ===")

evaluator = ComprehensiveModelEvaluation()

# Prepare models for evaluation
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Standardize the full feature matrix (X and y come from the Task 3 cell).
# A fresh scaler is used on purpose: calling fit_transform on the existing
# `scaler` would silently replace the training-split statistics it was fitted
# with in Task 3.
X_scaled = StandardScaler().fit_transform(X)

print("\n1. Cross-validation analysis...")
cv_results = evaluator.cross_validation_analysis(models, X_scaled, y)

print("\n2. Learning curve analysis...")
learning_results = evaluator.learning_curve_analysis(
    models['Random Forest'], X_scaled, y
)

print("\n3. Feature importance analysis...")
# Train models first for feature importance
for model in models.values():
    model.fit(X_scaled, y)

feature_importance = evaluator.feature_importance_analysis(models, features)

print("\n4. Performance stability analysis...")
stability_results = evaluator.performance_stability_analysis(models, X_scaled, y)

print("\n5. Model interpretability analysis...")
interpretability_results = evaluator.model_interpretability_analysis(
    models, X_scaled[:10], features
)

# Compile all results
all_evaluation_results = {
    'cross_validation': cv_results,
    'learning_curves': learning_results,
    'feature_importance': feature_importance,
    'stability': stability_results,
    'interpretability': interpretability_results
}

print("\n6. Generating comprehensive evaluation report...")
final_report = evaluator.generate_evaluation_report(all_evaluation_results)

# Update progress for Task 4
task_scores['task_4_model_evaluation'] = 25
skills_developed['model_evaluation'] = True

# Derive the totals from task_scores instead of incrementing them, so that
# re-running this cell cannot count the task twice.
tasks_completed = sum(1 for points in task_scores.values() if points > 0)
current_score = sum(task_scores.values())

print(f"\n✓ Task 4 completed! Score: 25/25")
print(f"Progress: {tasks_completed}/4 tasks completed")
print(f"Current Score: {current_score}/{total_points} points")

## Week 10 Summary and Assessment

### Learning Outcomes Achieved
- ✓ Integrated multiple computational approaches into unified workflows
- ✓ Validated computational predictions against experimental data
- ✓ Developed comprehensive model evaluation frameworks
- ✓ Implemented cross-validation and uncertainty quantification

### Skills Developed
- Workflow design and integration
- Experimental validation strategies
- Uncertainty quantification methods
- Comprehensive model evaluation

### Key Achievements
1. **Workflow Integration**: Successfully combined QM, MD, and ML approaches
2. **Experimental Validation**: Developed validation frameworks with statistical analysis
3. **Uncertainty Quantification**: Implemented bootstrap ensembles and prediction intervals
4. **Model Evaluation**: Created comprehensive evaluation frameworks with stability analysis

### Next Steps
- Week 11: Final project preparation and portfolio development
- Week 12: Portfolio completion and peer assessment

In [None]:
# Final Week 10 Progress Update
# Prints the checkpoint summary from the tracking variables maintained by the
# task cells (task_scores, skills_developed, tasks_completed, current_score).
print("\n" + "="*60)
print("WEEK 10 CHECKPOINT COMPLETION SUMMARY")
print("="*60)

print(f"\nWeek {week_number}: {week_topic}")
# Count the tracked tasks instead of hard-coding the number.
print(f"Total Tasks: {len(task_scores)}")
print(f"Tasks Completed: {tasks_completed}")
print(f"Final Score: {current_score}/{total_points} points")
print(f"Completion Rate: {(current_score/total_points)*100:.1f}%")

print("\nTask Breakdown:")
for task, score in task_scores.items():
    status = "✓" if score > 0 else "✗"
    print(f"  {status} {task.replace('_', ' ').title()}: {score}/25 points")

print("\nSkills Developed:")
for skill, achieved in skills_developed.items():
    status = "✓" if achieved else "✗"
    print(f"  {status} {skill.replace('_', ' ').title()}")

if current_score == total_points:
    print("\n🎉 WEEK 10 CHECKPOINT COMPLETED SUCCESSFULLY! 🎉")
    print("Ready to proceed to Week 11: Final Project Preparation")
else:
    print(f"\n⚠️  Week 10 partially completed: {current_score}/{total_points} points")
    print("Review incomplete tasks before proceeding to Week 11")

print("\n" + "="*60)