In [None]:
# ChemML Integration Setup
# The three statements of this cell had been fused onto a single comment
# line, so nothing was imported or printed; they are restored here.
import chemml
print(f'🧪 ChemML {chemml.__version__} loaded for this notebook')

# Week 5 Checkpoint: Machine Learning Fundamentals for Drug Discovery

## Learning Objectives Verification
By the end of this week, you should be able to:
- Implement regression and classification models for QSAR
- Apply proper model validation and evaluation techniques
- Perform hyperparameter tuning and model selection
- Interpret model performance metrics in a drug discovery context
- Handle imbalanced datasets common in bioactivity prediction

## Progress Tracking Dashboard
**Week:** 5/12  
**Module:** Machine Learning Fundamentals for Drug Discovery  
**Estimated Time:** 10-14 hours  
**Prerequisites:** Weeks 1-4 completed  

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from rdkit import Chem
from rdkit.Chem import Descriptors
import warnings
warnings.filterwarnings('ignore')

# Checkpoint state for this notebook. Later cells append finished task names
# to 'completed_tasks' and store their points under 'scores'.
progress_tracker = dict(
    week=5,
    completed_tasks=[],
    scores={},
    time_spent=0,
    challenges_faced=[],
    next_steps=[],
)

# Banner: title followed by a 70-character rule.
print(
    "Week 5 Checkpoint: Machine Learning Fundamentals for Drug Discovery",
    "=" * 70,
    sep="\n",
)

## Task 1: QSAR Regression Model Development (25 points)

Build and evaluate regression models to predict molecular properties.

In [None]:
# Task 1: QSAR Regression Model Development
print("Task 1: Building QSAR regression models...")

# Synthetic LogP dataset. The global NumPy seed is fixed once, and the draws
# below always happen in the same order, so the table is reproducible when the
# notebook is run top to bottom.
np.random.seed(42)
n_samples = 200

# Descriptor columns (features), each sampled independently.
molecular_weight = np.random.normal(350, 100, n_samples)
n_rotatable_bonds = np.random.poisson(5, n_samples)
n_aromatic_rings = np.random.poisson(2, n_samples)
tpsa = np.random.normal(80, 30, n_samples)
n_hbd = np.random.poisson(2, n_samples)
n_hba = np.random.poisson(4, n_samples)

# Target: a linear combination of the descriptors plus Gaussian noise.
# The noise is drawn last, after all descriptor draws.
logp_noise = np.random.normal(0, 0.5, n_samples)
logp_base = (
    molecular_weight * 0.01
    + n_aromatic_rings * 0.5
    - tpsa * 0.02
    - n_hbd * 0.3
    - n_hba * 0.2
    + logp_noise
)

# Assemble the table; column order is features first, target last.
qsar_data = pd.DataFrame(dict([
    ('MW', molecular_weight),
    ('RotBonds', n_rotatable_bonds),
    ('AromaticRings', n_aromatic_rings),
    ('TPSA', tpsa),
    ('HBD', n_hbd),
    ('HBA', n_hba),
    ('LogP', logp_base),
]))

# Remove outliers: keep -2 <= LogP <= 8 and 100 < MW < 800.
logp_in_range = qsar_data['LogP'].between(-2, 8)
mw_in_range = (qsar_data['MW'] > 100) & (qsar_data['MW'] < 800)
qsar_data = qsar_data[logp_in_range & mw_in_range]

print(f"Dataset created with {len(qsar_data)} compounds")
print("\nDataset statistics:")
print(qsar_data.describe().round(2))

# Prepare features and target: every column except LogP is a feature.
X = qsar_data.drop('LogP', axis=1)
y = qsar_data['LogP']

# Split data: 30% is held out for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define regression models. The linear and SVM models are wrapped in a
# Pipeline so the scaler is fitted only on the data passed to fit().
models = {
    'Linear Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ]),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVR(kernel='rbf', C=1.0, gamma='scale'))
    ])
}

# Train and evaluate models. Each entry keeps the scalar metrics and the raw
# test-set predictions, which the plotting code reads.
model_results = {}

for name, model in models.items():
    # Fit on the training split
    model.fit(X_train, y_train)

    # Predictions on both splits, to compare train and test performance
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # R² and RMSE on train and test
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    # 5-fold cross-validated R² on the training split only
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

    model_results[name] = {
        'train_r2': train_r2,
        'test_r2': test_r2,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_pred_test': y_pred_test
    }

# Display results.
# results_df mixes scalars with prediction arrays, so every column has object
# dtype and DataFrame.round() leaves object columns untouched. Cast the scalar
# metric columns to float so the 3-decimal rounding is actually applied.
results_df = pd.DataFrame(model_results).T
regression_metric_cols = ['train_r2', 'test_r2', 'test_rmse', 'cv_mean', 'cv_std']
print("\nModel Performance Comparison:")
print(results_df[regression_metric_cols].astype(float).round(3))

# Predicted-vs-actual scatter, one panel per model, with the y = x line drawn
# across the observed test range as the "perfect prediction" reference.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
actual_lo, actual_hi = y_test.min(), y_test.max()

for i, (name, results) in enumerate(model_results.items()):
    panel = axes[i]
    panel.scatter(y_test, results['y_pred_test'], alpha=0.6)
    panel.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--', lw=2)
    panel.set_xlabel('Actual LogP')
    panel.set_ylabel('Predicted LogP')
    panel.set_title(f'{name}\nR² = {results["test_r2"]:.3f}')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Record completion of Task 1 and its points.
progress_tracker['scores']['task_1'] = 25
progress_tracker['completed_tasks'].append('Task 1: QSAR Regression Model Development')

## Task 2: Binary Classification for Bioactivity Prediction (25 points)

Build classification models to predict compound bioactivity (active/inactive).

In [None]:
# Task 2: Binary Classification for Bioactivity Prediction
print("Task 2: Building bioactivity classification models...")

# Create binary classification dataset
# Labels are synthesised from drug-likeness rules over the descriptor table
# (simplified example). In reality, this would be experimental bioactivity data.

def create_bioactivity_labels(df, threshold=0.6, noise_std=0.1, random_state=None):
    """
    Create synthetic bioactivity labels based on molecular properties.

    This is a simplified example - real data would come from assays.
    Each satisfied rule (MW <= 500, LogP <= 5, HBD <= 5, TPSA <= 140,
    RotBonds <= 10) adds 0.2 to a score; Gaussian noise is added and the
    score is compared with ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'MW', 'LogP', 'HBD', 'TPSA' and 'RotBonds'.
    threshold : float, default 0.6
        A compound is labelled active (1) when its noisy score is strictly
        greater than this value.
    noise_std : float, default 0.1
        Standard deviation of the Gaussian noise added to the score.
        Use 0 for fully deterministic labels.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global NumPy random state is used, as in the original implementation.

    Returns
    -------
    Integer 0/1 labels, one per row of ``df``.
    """
    # A private generator makes the labels reproducible regardless of how
    # often, or in which order, notebook cells are re-run.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Create a compound bioactivity score based on drug-like properties
    activity_score = (
        (df['MW'] <= 500).astype(int) * 0.2 +
        (df['LogP'] <= 5).astype(int) * 0.2 +
        (df['HBD'] <= 5).astype(int) * 0.2 +
        (df['TPSA'] <= 140).astype(int) * 0.2 +
        (df['RotBonds'] <= 10).astype(int) * 0.2 +
        rng.normal(0, noise_std, len(df))  # Add some noise
    )

    # Convert to binary labels: active if score > threshold, inactive otherwise
    return (activity_score > threshold).astype(int)

# Create classification dataset: same descriptor features as Task 1,
# with synthetic active/inactive labels as the target.
y_class = create_bioactivity_labels(qsar_data)
X_class = qsar_data.drop('LogP', axis=1)

n_compounds = len(y_class)
n_active = int(y_class.sum())
n_inactive = n_compounds - n_active

print("Classification dataset created:")
print(f"  Total compounds: {n_compounds}")
print(f"  Active compounds: {n_active} ({n_active/n_compounds*100:.1f}%)")
print(f"  Inactive compounds: {n_inactive} ({n_inactive/n_compounds*100:.1f}%)")

# Split data; stratify so both splits keep the overall class proportions.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.3, random_state=42, stratify=y_class
)

# Define classification models. SVC needs probability=True so that
# predict_proba is available for the ROC/AUC computation below.
clf_models = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(random_state=42))
    ]),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(probability=True, random_state=42))
    ])
}

# Train and evaluate classification models. Each entry keeps the scalar
# metrics plus the hard predictions and class-1 probabilities, which the
# ROC and confusion-matrix plots read.
clf_results = {}

for name, model in clf_models.items():
    # Fit on the training split
    model.fit(X_train_c, y_train_c)

    # Hard labels and probability of the positive (active) class
    y_pred_c = model.predict(X_test_c)
    y_prob_c = model.predict_proba(X_test_c)[:, 1]

    # Test-set accuracy and ROC AUC
    accuracy = model.score(X_test_c, y_test_c)
    auc_score = roc_auc_score(y_test_c, y_prob_c)

    # 5-fold cross-validated ROC AUC on the training split only
    cv_scores = cross_val_score(model, X_train_c, y_train_c, cv=5, scoring='roc_auc')

    clf_results[name] = {
        'accuracy': accuracy,
        'auc': auc_score,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_pred': y_pred_c,
        'y_prob': y_prob_c
    }

# Display classification results.
# clf_results_df mixes scalars with prediction arrays, so its columns are
# object dtype and DataFrame.round() would leave them untouched. Cast the
# scalar metric columns to float so the rounding is actually applied.
clf_results_df = pd.DataFrame(clf_results).T
clf_metric_cols = ['accuracy', 'auc', 'cv_mean', 'cv_std']
print("\nClassification Model Performance:")
print(clf_results_df[clf_metric_cols].astype(float).round(3))

# ROC curves: one line per classifier, plus the chance diagonal.
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))

for name, results in clf_results.items():
    fpr, tpr, _ = roc_curve(y_test_c, results['y_prob'])
    roc_ax.plot(fpr, tpr, label=f'{name} (AUC = {results["auc"]:.3f})')

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves for Bioactivity Classification')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)
plt.show()

# Confusion matrices: one heatmap per classifier, built from the hard
# test-set predictions stored in clf_results.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for i, (name, results) in enumerate(clf_results.items()):
    panel = axes[i]
    cm = confusion_matrix(y_test_c, results['y_pred'])
    sns.heatmap(cm, annot=True, fmt='d', ax=panel, cmap='Blues')
    panel.set_title(f'{name} Confusion Matrix')
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')

plt.tight_layout()
plt.show()

# Record completion of Task 2 and its points.
progress_tracker['scores']['task_2'] = 25
progress_tracker['completed_tasks'].append('Task 2: Binary Classification for Bioactivity Prediction')

## Task 3: Hyperparameter Tuning and Model Selection (25 points)

Optimize model performance through systematic hyperparameter tuning.

In [None]:
# Task 3: Hyperparameter Tuning and Model Selection
print("Task 3: Hyperparameter tuning and model optimization...")

# Search spaces. The SVM keys carry the 'model__' prefix because the SVM is
# tuned inside a Pipeline whose final step is named 'model'.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
svm_param_grid = {
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': ['scale', 'auto', 0.001, 0.01, 0.1]
}
param_grids = {'Random Forest': rf_param_grid, 'SVM': svm_param_grid}

# Exhaustive 5-fold grid search for the LogP regressor, scored by R²,
# using all available cores.
print("\nTuning Random Forest for LogP prediction...")

rf_reg = RandomForestRegressor(random_state=42)
grid_search_reg = GridSearchCV(
    estimator=rf_reg,
    param_grid=param_grids['Random Forest'],
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search_reg.fit(X_train, y_train)

print(f"Best parameters: {grid_search_reg.best_params_}")
print(f"Best CV score: {grid_search_reg.best_score_:.3f}")

# Score the best estimator found by the search on the held-out test split.
best_rf_reg = grid_search_reg.best_estimator_
y_pred_tuned = best_rf_reg.predict(X_test)
tuned_r2 = r2_score(y_test, y_pred_tuned)
tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred_tuned))

print(f"Tuned model test R²: {tuned_r2:.3f}")
print(f"Tuned model test RMSE: {tuned_rmse:.3f}")

# Exhaustive 5-fold grid search for the bioactivity classifier, scored by
# ROC AUC. Scaling lives inside the pipeline so it is re-fitted per fold.
print("\nTuning SVM for bioactivity classification...")

svm_steps = [
    ('scaler', StandardScaler()),
    ('model', SVC(probability=True, random_state=42)),
]
svm_clf = Pipeline(svm_steps)

grid_search_clf = GridSearchCV(
    estimator=svm_clf,
    param_grid=param_grids['SVM'],
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_search_clf.fit(X_train_c, y_train_c)

print(f"Best parameters: {grid_search_clf.best_params_}")
print(f"Best CV score: {grid_search_clf.best_score_:.3f}")

# Score the best pipeline found by the search on the held-out test split.
best_svm_clf = grid_search_clf.best_estimator_
y_pred_tuned_c = best_svm_clf.predict(X_test_c)
y_prob_tuned_c = best_svm_clf.predict_proba(X_test_c)[:, 1]  # P(active)
tuned_accuracy = best_svm_clf.score(X_test_c, y_test_c)
tuned_auc = roc_auc_score(y_test_c, y_prob_tuned_c)

print(f"Tuned classifier accuracy: {tuned_accuracy:.3f}")
print(f"Tuned classifier AUC: {tuned_auc:.3f}")

# Rank the descriptors by the tuned Random Forest's feature importances,
# most important first.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': best_rf_reg.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Horizontal bar chart of the ranked importances.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Feature Importance - Tuned Random Forest (LogP Prediction)')
importance_ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.show()

# Learning curves to check for overfitting
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, title, cv=5, scoring=None):
    """
    Plot learning curves to diagnose overfitting.

    Training and cross-validated scores are computed at 10 evenly spaced
    training-set fractions from 10% to 100% and drawn as mean lines with
    shaded bands of one standard deviation across folds.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model (or Pipeline) to evaluate.
    X, y : array-like
        Features and target passed to ``learning_curve``.
    title : str
        Appended to 'Learning Curve - ' as the figure title.
    cv : int or CV splitter, default 5
        Cross-validation strategy passed to ``learning_curve``.
    scoring : str or None, default None
        Scoring name passed to ``learning_curve``. When ``None`` it is
        'r2' for regressors and 'roc_auc' otherwise.
    """
    # Local import: is_regressor follows the estimator's declared type, so it
    # also recognises SVR, LinearRegression and Pipelines ending in a
    # regressor. The former check looked for the substring 'Regressor' in the
    # class name and sent all of those to 'roc_auc'.
    from sklearn.base import is_regressor

    if scoring is None:
        scoring = 'r2' if is_regressor(estimator) else 'roc_auc'

    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring=scoring
    )

    # Aggregate across folds (axis 1); axis 0 is the training-set size.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, 'o-', label='Training Score')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
    plt.plot(train_sizes, val_mean, 'o-', label='Validation Score')
    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1)

    plt.xlabel('Training Set Size')
    plt.ylabel('Score')
    plt.title(f'Learning Curve - {title}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Learning curves for both tuned models, each on its own training split.
for curve_model, curve_X, curve_y, curve_title in (
    (best_rf_reg, X_train, y_train, 'Tuned Random Forest (Regression)'),
    (best_svm_clf, X_train_c, y_train_c, 'Tuned SVM (Classification)'),
):
    plot_learning_curve(curve_model, curve_X, curve_y, curve_title)

# Record completion of Task 3 and its points.
progress_tracker['scores']['task_3'] = 25
progress_tracker['completed_tasks'].append('Task 3: Hyperparameter Tuning and Model Selection')

## Task 4: Handling Imbalanced Data in Drug Discovery (25 points)

Address class imbalance issues common in bioactivity datasets.

In [None]:
# Task 4: Handling Imbalanced Data in Drug Discovery
print("Task 4: Handling imbalanced bioactivity data...")

# Create a more realistic imbalanced dataset
# In drug discovery, active compounds are often much rarer
def create_imbalanced_dataset(df, active_ratio=0.1, random_state=None):
    """
    Create imbalanced synthetic activity labels mimicking drug discovery data.

    A score is built from four rules (MW <= 450, 1 <= LogP <= 4, HBD <= 3,
    TPSA <= 120), Gaussian noise (std 0.1) is added, and the compounds whose
    score reaches the (100 - active_ratio * 100)-th percentile are labelled
    active.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'MW', 'LogP', 'HBD' and 'TPSA'.
    active_ratio : float, default 0.1
        Target fraction of compounds labelled active (1).
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global NumPy random state is used, as in the original implementation.

    Returns
    -------
    Integer 0/1 labels, one per row of ``df``.
    """
    # A private generator makes the labels reproducible regardless of how
    # often, or in which order, notebook cells are re-run.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Create labels with strong bias toward drug-like properties for actives
    activity_prob = (
        (df['MW'] <= 450).astype(int) * 0.3 +
        (df['LogP'] >= 1).astype(int) * (df['LogP'] <= 4).astype(int) * 0.3 +
        (df['HBD'] <= 3).astype(int) * 0.2 +
        (df['TPSA'] <= 120).astype(int) * 0.2
    )

    # Add noise; this also breaks ties between compounds that satisfy the
    # same set of rules, so the percentile cut below is selective.
    activity_prob = activity_prob + rng.normal(0, 0.1, len(df))

    # Select top compounds as active
    active_threshold = np.percentile(activity_prob, 100 - active_ratio * 100)
    labels = (activity_prob >= active_threshold).astype(int)

    return labels

# Imbalanced labels (15% actives requested) over the same descriptor features.
y_imbal = create_imbalanced_dataset(qsar_data, active_ratio=0.15)
X_imbal = qsar_data.drop('LogP', axis=1)

n_imbal_total = len(y_imbal)
n_imbal_active = sum(y_imbal)
n_imbal_inactive = n_imbal_total - n_imbal_active

print(f"Imbalanced dataset created:")
print(f"  Total compounds: {n_imbal_total}")
print(f"  Active compounds: {n_imbal_active} ({n_imbal_active/n_imbal_total*100:.1f}%)")
print(f"  Inactive compounds: {n_imbal_inactive} ({n_imbal_inactive/n_imbal_total*100:.1f}%)")

# Stratified 70/30 split so both splits keep the class proportions.
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_imbal, y_imbal, test_size=0.3, random_state=42, stratify=y_imbal
)

# SMOTE oversampling is fitted on the training split only; the test split
# is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_i, y_train_i)

# Three strategies, each an unfitted Random Forest plus the training data it
# will see: untouched data, untouched data with balanced class weights, and
# SMOTE-resampled data.
approaches = {
    'Baseline': {
        'model': RandomForestClassifier(n_estimators=100, random_state=42),
        'X_train': X_train_i,
        'y_train': y_train_i
    },
    'Class Weight': {
        'model': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
        'X_train': X_train_i,
        'y_train': y_train_i
    },
    'SMOTE': {
        'model': RandomForestClassifier(n_estimators=100, random_state=42),
        'X_train': X_train_smote,
        'y_train': y_train_smote
    }
}

n_smote_total = len(y_train_smote)
n_smote_active = sum(y_train_smote)
n_smote_inactive = n_smote_total - n_smote_active

print(f"\nAfter SMOTE:")
print(f"  Training set size: {n_smote_total}")
print(f"  Active: {n_smote_active} ({n_smote_active/n_smote_total*100:.1f}%)")
print(f"  Inactive: {n_smote_inactive} ({n_smote_inactive/n_smote_total*100:.1f}%)")

# Evaluate all approaches on the same untouched test split.
# The metric functions are imported once here; previously the import
# statement sat inside the loop body and was re-executed on every iteration.
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

imbalanced_results = {}

for name, config in approaches.items():
    # Train the approach's model on the training data it is configured with
    model = config['model']
    model.fit(config['X_train'], config['y_train'])

    # Hard labels and probability of the positive (active) class
    y_pred = model.predict(X_test_i)
    y_prob = model.predict_proba(X_test_i)[:, 1]

    # Evaluation metrics
    accuracy = model.score(X_test_i, y_test_i)
    balanced_acc = balanced_accuracy_score(y_test_i, y_pred)
    precision = precision_score(y_test_i, y_pred)
    recall = recall_score(y_test_i, y_pred)
    f1 = f1_score(y_test_i, y_pred)
    auc = roc_auc_score(y_test_i, y_prob)

    # 'y_prob' is kept alongside the scalars for the ROC plot.
    imbalanced_results[name] = {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'y_prob': y_prob
    }

# Display results.
# imbal_results_df mixes scalars with probability arrays, so its columns are
# object dtype and DataFrame.round() would leave them untouched. Cast the
# scalar metric columns to float so the rounding is actually applied.
imbal_results_df = pd.DataFrame(imbalanced_results).T
imbal_metric_cols = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'auc']
print("\nImbalanced Data Handling Results:")
print(imbal_results_df[imbal_metric_cols].astype(float).round(3))

# Grouped bar chart: one group per metric, one bar per approach.
metrics = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'auc']
x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))

# Each approach's bars are shifted right by one bar width per position.
for i, approach in enumerate(imbalanced_results):
    results = imbalanced_results[approach]
    values = [results[metric] for metric in metrics]
    ax.bar(x + i*width, values, width, label=approach)

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Performance Comparison: Handling Imbalanced Data')
# Ticks sit one bar width in, i.e. under the middle bar of three.
ax.set_xticks(x + width)
ax.set_xticklabels(metrics, rotation=45)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# ROC curves: one line per imbalance-handling approach, plus the chance
# diagonal, all evaluated on the same test split.
imbal_roc_fig, imbal_roc_ax = plt.subplots(figsize=(10, 6))

for name, results in imbalanced_results.items():
    fpr, tpr, _ = roc_curve(y_test_i, results['y_prob'])
    imbal_roc_ax.plot(fpr, tpr, label=f'{name} (AUC = {results["auc"]:.3f})')

imbal_roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
imbal_roc_ax.set_xlabel('False Positive Rate')
imbal_roc_ax.set_ylabel('True Positive Rate')
imbal_roc_ax.set_title('ROC Curves: Imbalanced Data Handling Approaches')
imbal_roc_ax.legend()
imbal_roc_ax.grid(True, alpha=0.3)
plt.show()

# Report which approach scored highest on F1, recall and AUC.
def _top_approach(metric):
    """Return the (name, metrics) pair with the highest value for `metric`.

    Ties go to the approach that appears first in imbalanced_results.
    """
    return max(imbalanced_results.items(), key=lambda entry: entry[1][metric])

print("\nAnalysis and Recommendations:")
best_f1 = _top_approach('f1')
best_recall = _top_approach('recall')
best_auc = _top_approach('auc')

print(f"  Best F1-score: {best_f1[0]} ({best_f1[1]['f1']:.3f})")
print(f"  Best Recall: {best_recall[0]} ({best_recall[1]['recall']:.3f})")
print(f"  Best AUC: {best_auc[0]} ({best_auc[1]['auc']:.3f})")

print("\n  Key Insights:")
for insight in (
    "  - SMOTE often improves recall for minority class (active compounds)",
    "  - Class weighting is computationally efficient alternative",
    "  - Consider domain-specific metrics (precision vs recall trade-off)",
    "  - AUC-ROC is less sensitive to class imbalance than accuracy",
):
    print(insight)

# Record completion of Task 4 and its points.
progress_tracker['scores']['task_4'] = 25
progress_tracker['completed_tasks'].append('Task 4: Handling Imbalanced Data')

## Self-Assessment and Reflection

Complete this self-assessment to evaluate your understanding of machine learning fundamentals for drug discovery.

In [None]:
# Self-Assessment Questions
print("SELF-ASSESSMENT: Machine Learning Fundamentals for Drug Discovery")
print("=" * 70)


def _make_question(question, options, correct, explanation):
    """Bundle one multiple-choice item into the dict layout used below."""
    return {
        'question': question,
        'options': options,
        'correct': correct,
        'explanation': explanation,
    }


assessment_questions = [
    _make_question(
        'Which metric is most appropriate for evaluating a highly imbalanced bioactivity dataset?',
        ['A) Accuracy', 'B) AUC-ROC', 'C) Mean Squared Error', 'D) R-squared'],
        'B',
        'AUC-ROC is less sensitive to class imbalance and better evaluates model discrimination.',
    ),
    _make_question(
        'What is the primary risk of overfitting in drug discovery models?',
        ['A) Poor training performance', 'B) High computational cost',
         'C) Poor generalization to new compounds', 'D) Slow prediction speed'],
        'C',
        'Overfitted models perform well on training data but fail on new, unseen compounds.',
    ),
    _make_question(
        'Why is cross-validation particularly important in QSAR modeling?',
        ['A) To speed up training', 'B) To assess model stability and generalizability',
         'C) To reduce memory usage', 'D) To improve feature selection'],
        'B',
        'Cross-validation provides robust estimates of model performance on unseen data.',
    ),
    _make_question(
        'When would you prefer high recall over high precision in drug discovery?',
        ['A) When screening for toxic compounds', 'B) When identifying drug candidates',
         'C) When optimizing lead compounds', 'D) Never'],
        'B',
        "High recall ensures you don't miss potential drug candidates (minimize false negatives).",
    ),
    _make_question(
        'What is the main advantage of Random Forest over linear models for molecular data?',
        ['A) Faster training', 'B) Better interpretability',
         'C) Handling non-linear relationships', 'D) Lower memory usage'],
        'C',
        'Random Forest can capture complex non-linear relationships between molecular features.',
    ),
]

# Walk through the quiz. No answer is collected: the key and explanation are
# shown for each item and every item is counted as correct, so the recorded
# score is always the full percentage.
score = 0
for number, item in enumerate(assessment_questions, start=1):
    print(f"\nQuestion {number}: {item['question']}")
    for choice in item['options']:
        print(f"  {choice}")

    # For demonstration, the correct answer is shown straight away
    print(f"\nCorrect Answer: {item['correct']}")
    print(f"Explanation: {item['explanation']}")
    score = score + 1  # Assuming correct for progress tracking

# Stored as a percentage (0-100), not as points.
assessment_score = (score / len(assessment_questions)) * 100
progress_tracker['scores']['self_assessment'] = assessment_score

print(f"\nSelf-Assessment Score: {assessment_score:.0f}%")

## Week 5 Progress Summary and Next Steps

In [None]:
# Calculate overall progress.
# The task entries in progress_tracker['scores'] are points (out of 25 each),
# while 'self_assessment' is stored as a percentage (0-100). Summing them
# directly gave 200/125 = 160% for a perfect run, so the assessment is
# converted to its 25-point weight before being added.
ASSESSMENT_KEY = 'self_assessment'
ASSESSMENT_POINTS = 25

recorded_scores = progress_tracker['scores']
task_points = sum(
    points for key, points in recorded_scores.items() if key != ASSESSMENT_KEY
)
assessment_points = recorded_scores.get(ASSESSMENT_KEY, 0) / 100 * ASSESSMENT_POINTS

total_score = task_points + assessment_points
max_score = 125  # 4 tasks × 25 points + 25 points assessment
overall_percentage = (total_score / max_score) * 100

progress_tracker['overall_score'] = overall_percentage
progress_tracker['time_spent'] = 12  # Estimated hours

print("WEEK 5 PROGRESS SUMMARY")
print("=" * 50)
print(f"Overall Score: {overall_percentage:.1f}%")
print(f"Time Spent: {progress_tracker['time_spent']} hours")
print(f"Tasks Completed: {len(progress_tracker['completed_tasks'])}/4")

# Per-entry breakdown: tasks are shown as points, the assessment as a percentage.
print("\nTask Breakdown:")
for task, score in progress_tracker['scores'].items():
    if task != 'self_assessment':
        print(f"  {task}: {score}/25 points")
    else:
        print(f"  {task}: {score:.0f}%")

def _print_section(heading, lines):
    """Print a heading line followed by each entry of `lines` on its own line."""
    print(heading)
    for line in lines:
        print(line)


outcomes = [
    "✓ Built and evaluated QSAR regression models",
    "✓ Implemented classification for bioactivity prediction",
    "✓ Performed systematic hyperparameter tuning",
    "✓ Addressed class imbalance in drug discovery data",
    "✓ Applied proper validation and evaluation techniques",
    "✓ Interpreted model performance in drug discovery context"
]
# Outcomes are stored without indentation; indent them on output.
_print_section("\nKey Learning Outcomes Achieved:", [f"  {outcome}" for outcome in outcomes])

_print_section("\nNext Week (Week 6) Preview:", [
    "  📚 Topic: Deep Learning Applications in Drug Discovery",
    "  🎯 Focus: Neural networks, CNNs for molecular data",
    "  💡 Skills: Deep learning architectures, model interpretation",
    "  🔬 Practice: Build neural networks for ADMET prediction",
])

# Portfolio development checkpoint
_print_section("\nPortfolio Development Checkpoint:", [
    "  📊 Implement ML pipeline in your multi-target project",
    "  🔧 Add model comparison and validation framework",
    "  📈 Document hyperparameter tuning results",
    "  🎯 Optimize models for your specific targets",
    "  ⚖️ Address any class imbalance in your datasets",
])

# Advanced challenges for strong students
_print_section("\nAdvanced Challenges (Optional):", [
    "  🚀 Implement ensemble methods (voting, stacking)",
    "  📊 Explore Bayesian optimization for hyperparameter tuning",
    "  🔬 Try different evaluation strategies (time-series, group-based CV)",
    "  💡 Implement cost-sensitive learning for different error types",
])

# Save progress
import json


def _json_default(value):
    """Fallback used by json for objects it cannot serialise natively.

    NumPy scalars become plain Python numbers and NumPy arrays become lists;
    anything else is rejected with the usual TypeError.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Serialise before opening the file, so a serialisation error cannot leave a
# truncated week_05_progress.json behind. The earlier comment promised a
# numpy-to-list conversion that was never performed; _json_default does it.
clean_tracker = progress_tracker.copy()
tracker_json = json.dumps(clean_tracker, indent=2, default=_json_default)

with open('week_05_progress.json', 'w', encoding='utf-8') as f:
    f.write(tracker_json)

print("\n✅ Week 5 checkpoint completed! Progress saved to week_05_progress.json")
print("📝 Remember to update your learning journal with ML insights")
print("🚀 Ready to dive into Deep Learning in Week 6!")