# End-to-End Machine Learning Project Demo

This notebook demonstrates a complete ML pipeline including:
- Data loading and exploration
- Feature preprocessing
- Model training and comparison
- Model evaluation
- Deployment simulation
- Comprehensive visualizations

In [None]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, r2_score

from data.data_loader import DataLoader
from features.preprocessing import FeaturePreprocessor
from models.model_trainer import ModelTrainer
from models.model_deployment import ModelDeployment
from visualization.visualizer import MLVisualizer

## 1. Data Loading and Exploration

In [None]:
# Load dataset
# NOTE(review): if DataLoader resolves 'boston' through sklearn.datasets.load_boston,
# that loader was removed in scikit-learn 1.2 — confirm this name still works in the
# target environment, or switch to one of the alternatives named below.
loader = DataLoader('boston')  # Try 'wine' or 'breast_cancer' for different problems
# load_and_split() is unpacked into the train/test features, the train/test targets,
# and `df` (presumably the full dataset as a DataFrame — it is used with .info() and
# .describe() in the next cell).
X_train, X_test, y_train, y_test, df = loader.load_and_split()

# Quick sanity summary of what was loaded and how it was split.
print(f"Dataset: {loader.dataset_name}")
print(f"Problem Type: {loader.problem_type}")
print(f"Dataset Shape: {df.shape}")
print(f"Features: {X_train.shape[1]}")
print(f"Training Samples: {X_train.shape[0]}")
print(f"Test Samples: {X_test.shape[0]}")

In [None]:
# Display basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nDataset Description:")
# Last expression of the cell, so the notebook renders it as a rich table.
df.describe()

## 2. Data Visualization

In [None]:
# Initialize visualizer
# This single MLVisualizer instance is reused by every plotting cell below.
visualizer = MLVisualizer()

# Plot data distribution
# 'target' is passed as the name of the label column in `df`
# (assumes DataLoader names the label column 'target' — TODO confirm).
visualizer.plot_data_distribution(df, target_col='target')

## 3. Feature Preprocessing

In [None]:
# Initialize preprocessor
# Configured with the problem type reported by the loader.
preprocessor = FeaturePreprocessor(loader.problem_type)

# Apply preprocessing pipeline
# Receives the training features/targets and the test features, and returns the
# processed train and test matrices. Presumably the transforms are fitted on the
# training data only and then applied to the test data — confirm in FeaturePreprocessor.
X_train_processed, X_test_processed = preprocessor.full_preprocessing_pipeline(
    X_train, y_train, X_test,
    scaler_type='standard',
    feature_selection=True,
    # Ask for at most 10 features, or all of them when the dataset has fewer than 10.
    k_features=min(10, X_train.shape[1])
)

# Compare the feature count before and after preprocessing.
print(f"Original Features: {X_train.shape[1]}")
print(f"Processed Features: {X_train_processed.shape[1]}")
print("Applied: Scaling + Feature Selection")

## 4. Model Training and Comparison

In [None]:
# Initialize model trainer
# Configured with the problem type reported by the loader.
trainer = ModelTrainer(loader.problem_type)

# Train all models
# `model_results` is indexed by model name in later cells; each entry is read for
# keys such as 'model', 'test_predictions' and 'test_r2' / 'test_accuracy'.
model_results = trainer.train_all_models(X_train_processed, y_train, X_test_processed, y_test)

# Display results
print(trainer.generate_model_summary())

In [None]:
# Visualize model comparison
# The problem type is passed alongside the results of all trained models
# (presumably so the visualizer can pick matching metrics — confirm in MLVisualizer).
visualizer.plot_model_comparison(model_results, loader.problem_type)

## 5. Best Model Analysis

In [None]:
# Get best model
# get_best_model() returns a mapping read here for 'name', 'metric' and 'score'.
best_model_info = trainer.get_best_model()
best_model_name = best_model_info['name']
# Test-set predictions stored for that model during training.
best_predictions = model_results[best_model_name]['test_predictions']

print(f"Best Model: {best_model_name}")
print(f"Best {best_model_info['metric']}: {best_model_info['score']:.4f}")

# Plot predictions vs actual for best model
# Regression gets a predicted-vs-actual plot; any other problem type gets a
# confusion matrix.
if loader.problem_type == 'regression':
    visualizer.plot_predictions_vs_actual(y_test, best_predictions, best_model_name)
else:
    visualizer.plot_confusion_matrix(y_test, best_predictions, best_model_name)

## 6. Feature Importance Analysis

In [None]:
# Plot feature importance if available
# Only estimators exposing `feature_importances_` can be plotted; any other model
# gets a short message instead.
best_model = model_results[best_model_name]['model']
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_

    # Label the bars with the real column names of the matrix passed to
    # train_all_models when it carries them (e.g. a DataFrame). Fall back to
    # positional placeholders when the processed data is a plain array or the
    # column count does not line up with the importances.
    processed_columns = getattr(X_train_processed, 'columns', None)
    if processed_columns is not None and len(processed_columns) == len(importances):
        feature_names = [str(column) for column in processed_columns]
    else:
        feature_names = [f'feature_{i}' for i in range(len(importances))]

    visualizer.plot_feature_importance(
        importances,
        feature_names,
        best_model_name
    )
else:
    print(f"{best_model_name} does not provide feature importance.")

## 7. Model Deployment Simulation

In [None]:
# Prepare for deployment
deployment = ModelDeployment()

# Package preprocessors
# Bundle the preprocessor's transform attributes so they are saved with the model.
# NOTE(review): the pipeline call above does not request PCA, so `preprocessor.pca`
# may be unset/None here — confirm ModelDeployment tolerates that.
preprocessors = {
    'scaler': preprocessor.fitted_scaler,
    'feature_selector': preprocessor.feature_selector,
    'pca': preprocessor.pca
}

# Create metadata
# 'features' records the raw input feature count (X_train), not the number of
# features left after preprocessing (X_train_processed).
metadata = {
    'dataset': loader.dataset_name,
    'problem_type': loader.problem_type,
    'best_model': best_model_info['name'],
    'performance': best_model_info['score'],
    'features': X_train.shape[1],
    'training_samples': X_train.shape[0]
}

# Save model for deployment
# The returned path is reused by the next cell to load the model back.
model_path = deployment.save_model_for_deployment(best_model, preprocessors, metadata)
print(f"Model saved for deployment: {model_path}")

In [None]:
# Load model and test API simulation
deployment.load_model_for_inference(model_path)

# Test with a sample
# Slice 0:1 (rather than index 0) so the sample stays a one-row 2-D object, for
# both pandas (`.iloc`) and array inputs. The sample comes from the raw,
# unprocessed X_test; presumably the deployment applies the saved preprocessors
# itself — confirm in ModelDeployment.
test_sample = X_test.iloc[0:1] if hasattr(X_test, 'iloc') else X_test[0:1]
api_response = deployment.simulate_api_endpoint(test_sample)

# The response is a mapping; print each field on its own line.
print("API Simulation Results:")
for key, value in api_response.items():
    print(f"  {key}: {value}")

# Health check
health_status = deployment.health_check()
print(f"\nHealth Check: {health_status}")

## 8. Cross-Validation Analysis

In [None]:
# Perform cross-validation on best model
# Recombine the processed train and test rows (and their targets) into one dataset
# for 5-fold cross-validation.
# NOTE(review): the preprocessing transforms were fitted before this step, outside
# the CV folds (presumably on the training split only), and the held-out test rows
# are folded back in. The resulting scores may therefore be optimistic — consider
# cross-validating a pipeline on the raw training data instead.
X_full = np.vstack([X_train_processed, X_test_processed])
y_full = np.concatenate([y_train, y_test])

cv_results = trainer.cross_validate_model(X_full, y_full, best_model_name, cv=5)

# cv_results is read for 'mean_score', 'std_score' and 'all_scores'.
print(f"Cross-Validation Results for {best_model_name}:")
print(f"  Mean Score: {cv_results['mean_score']:.4f} ± {cv_results['std_score']:.4f}")
print(f"  Individual Scores: {cv_results['all_scores']}")

## 9. Hyperparameter Tuning Example

In [None]:
# Perform hyperparameter tuning on Random Forest
# Runs only when a 'random_forest' entry exists in model_results; otherwise the cell
# reports that tuning was skipped instead of finishing with no output.
if 'random_forest' in model_results:
    print("Performing hyperparameter tuning on Random Forest...")

    # Tuning uses the processed training data only; the test set is kept for the
    # comparison below.
    tuned_model, best_params, best_score = trainer.hyperparameter_tuning(
        X_train_processed, y_train, 'random_forest'
    )

    print(f"Best Parameters: {best_params}")
    print(f"Best CV Score: {best_score:.4f}")

    # Test tuned model
    tuned_predictions = tuned_model.predict(X_test_processed)

    # Select the metric once (scoring function, key of the stored baseline score,
    # display label) so the comparison is written a single time for both problem
    # types. r2_score / accuracy_score come from the notebook's import cell.
    if loader.problem_type == 'regression':
        score_fn, baseline_key, metric_label = r2_score, 'test_r2', 'R²'
    else:
        score_fn, baseline_key, metric_label = accuracy_score, 'test_accuracy', 'Accuracy'

    tuned_score = score_fn(y_test, tuned_predictions)
    original_score = model_results['random_forest'][baseline_key]
    print(f"Tuned Model Test {metric_label}: {tuned_score:.4f}")
    print(f"Original Model Test {metric_label}: {original_score:.4f}")
    # Positive means the tuned model beat the untuned baseline on the test set.
    print(f"Improvement: {tuned_score - original_score:.4f}")
else:
    print("Skipping hyperparameter tuning: no 'random_forest' entry in model_results.")

## 10. Project Summary

In [None]:
# Final recap: print a fixed-width banner summarising the dataset, the feature
# counts before/after preprocessing, the split sizes, and the best model found.
# Every value comes from variables created in earlier cells, so this cell only
# works after the notebook has been run from the top.
print("\n" + "="*60)
print("                PROJECT SUMMARY")
print("="*60)
print(f"Dataset: {loader.dataset_name} ({loader.problem_type})")
print(f"Original Features: {X_train.shape[1]}")
print(f"Processed Features: {X_train_processed.shape[1]}")
print(f"Training Samples: {X_train.shape[0]}")
print(f"Test Samples: {X_test.shape[0]}")
print(f"Models Trained: {len(model_results)}")
print(f"Best Model: {best_model_info['name']}")
print(f"Best {best_model_info['metric']}: {best_model_info['score']:.4f}")
# These two lines are static text; they are printed regardless of whether the
# deployment and plotting cells actually ran successfully.
print("Deployment: Model packaged and API simulation tested")
print("Visualizations: Generated comprehensive plots")
print("="*60)