### Scaling & Feature Selection in a Pipeline
**Description**: Create a pipeline that includes feature scaling, variance threshold selection, and a classification model.

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset and wrap it in pandas containers
data = load_breast_cancer()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------
# Step 1: Data Integrity and Error Handling
# ---------------------------------------

def validate_data(X, y):
    """Check that a feature matrix and target vector are safe to train on.

    Args:
        X: Feature matrix. Must be a non-empty ``pandas.DataFrame`` with no
            missing values.
        y: Target labels. Must be a non-empty ``pandas.Series`` with no
            missing values, one entry per row of ``X``, and at least two
            distinct classes.

    Raises:
        ValueError: If any of the conditions above is violated. ``ValueError``
            is used for the type checks as well, so callers only need to
            catch a single exception type.
    """
    if not isinstance(X, pd.DataFrame):
        raise ValueError("X must be a pandas DataFrame.")
    if not isinstance(y, pd.Series):
        raise ValueError("y must be a pandas Series.")
    if X.empty or y.empty:
        raise ValueError("Input data is empty.")
    # A row-count mismatch would otherwise only surface later, inside fit().
    if len(X) != len(y):
        raise ValueError(
            f"X and y have inconsistent lengths: {len(X)} != {len(y)}."
        )
    # The target is checked too: nunique() below ignores NaN, so missing
    # labels would otherwise pass validation unnoticed.
    if X.isnull().values.any() or y.isnull().any():
        raise ValueError("Input data contains missing values.")
    if y.nunique() < 2:
        raise ValueError("Target variable must have at least two classes.")

# Fail fast on malformed training data before any estimator is fitted
validate_data(X=X_train, y=y_train)

# ---------------------------------------
# Step 2: Pipeline Definition
# ---------------------------------------

# Three named stages, applied in order: standardize -> variance filter -> classify.
# NOTE(review): StandardScaler rescales every non-constant feature to unit
# variance on the training data, so a 0.01 variance threshold placed after it
# can only remove constant columns. If variance-based selection on the raw
# features is intended, the selector would need to run before the scaler
# (and the component tests that call the steps individually would need to be
# adapted to the new order) -- confirm intent.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("feature_selector", VarianceThreshold(threshold=0.01)),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# ---------------------------------------
# Step 3: Train and Predict
# ---------------------------------------

# Only fitting and predicting are guarded, so the failure message below is
# accurate; scoring happens in the else-branch once predictions exist.
try:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
except Exception as e:
    print(f"Pipeline training or prediction failed: {e}")
    # Re-raise instead of swallowing: the code that follows needs a fitted
    # pipeline, so continuing would only fail later with a less helpful error.
    raise
else:
    acc = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {acc:.2f}")

# ---------------------------------------
# Step 4: Unit Tests for Robustness
# ---------------------------------------

def test_pipeline_components():
    """Sanity-check the individual stages of the fitted module-level pipeline.

    Uses the module-level ``pipeline`` (which must already be fitted),
    ``X_test`` and ``y_test``. Prints a confirmation line when every check
    passes.

    Raises:
        AssertionError: If the scaler changes the data shape, the feature
            selector changes the row count or adds columns, or the number of
            predictions differs from the number of test labels.
    """
    scaler = pipeline.named_steps['scaler']
    selector = pipeline.named_steps['feature_selector']

    # Test 1: Scaling must preserve the shape of the input
    X_scaled = scaler.transform(X_test)
    assert X_scaled.shape == X_test.shape, "Scaler output shape mismatch"

    # Test 2: Feature selection keeps every row and may only keep or drop
    # columns. Equality is allowed because the selector is not required to
    # remove anything.
    X_selected = selector.transform(X_scaled)
    assert X_selected.shape[0] == X_scaled.shape[0], (
        "Feature selector changed the number of samples"
    )
    assert X_selected.shape[1] <= X_scaled.shape[1], (
        "Feature selector increased the number of features"
    )

    # Test 3: One prediction per test label
    preds = pipeline.predict(X_test)
    assert len(preds) == len(y_test), "Prediction length mismatch"

    print("✅ All unit tests passed!")

# Run the component checks against the pipeline fitted above; any failed
# check raises AssertionError and stops execution here.
test_pipeline_components()

Model Accuracy: 0.96
✅ All unit tests passed!
