In [2]:
from statistics import NormalDist

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score,KFold,StratifiedKFold,LeaveOneOut,learning_curve,validation_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [10]:
def load_data():
    """Load the scikit-learn breast cancer dataset.

    Returns
    -------
    tuple
        ``(features, labels)`` as returned by ``load_breast_cancer``.
    """
    features, labels = load_breast_cancer(return_X_y=True)
    return features, labels

In [11]:
def get_models():
    """Build the classifiers that the notebook compares.

    Returns
    -------
    dict
        Maps a display name to an unfitted estimator. Logistic regression and
        the RBF SVM are wrapped in a pipeline that standardises the features
        first; the random forest is used on the raw features.
    """

    def with_scaling(classifier):
        # Same two-step layout for every scaled model: "scaler" then "clf".
        return Pipeline([("scaler", StandardScaler()), ("clf", classifier)])

    logistic = with_scaling(LogisticRegression(max_iter=500))
    forest = RandomForestClassifier(n_estimators=200, random_state=42)
    rbf_svm = with_scaling(SVC(kernel="rbf", C=1, gamma="scale"))

    # Insertion order is the order in which callers iterate over the models.
    return {
        "Logistic Regression": logistic,
        "Random Forest": forest,
        "SVM (RBF)": rbf_svm,
    }

In [12]:
def evaluate_cross_validation(X, y, models):
    """Score every model under three cross-validation schemes and print a summary.

    Parameters
    ----------
    X, y
        Features and labels passed straight to ``cross_val_score``.
    models : dict
        Maps a display name to an estimator.

    Returns
    -------
    dict
        ``{model name: {"KFold": scores, "StratifiedKFold": scores,
        "LOOCV": scores}}`` where each ``scores`` is the per-fold result of
        ``cross_val_score``.
    """
    # Keys double as the result keys; order is the order of evaluation.
    splitters = {
        "KFold": KFold(n_splits=5, shuffle=True, random_state=42),
        "StratifiedKFold": StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        "LOOCV": LeaveOneOut(),
    }

    print("\nCross-Validation Results")

    results = {}
    for model_name, estimator in models.items():
        print(f"\n{model_name}")

        per_scheme = {
            scheme: cross_val_score(estimator, X, y, cv=splitter)
            for scheme, splitter in splitters.items()
        }
        results[model_name] = per_scheme

        plain = per_scheme["KFold"]
        strat = per_scheme["StratifiedKFold"]
        loo = per_scheme["LOOCV"]
        print(f"KFold Mean: {plain.mean():.4f} ± {plain.std():.4f}")
        print(f"Stratified Mean: {strat.mean():.4f} ± {strat.std():.4f}")
        print(f"LOOCV Mean: {loo.mean():.4f} ± {loo.std():.4f}")

    return results

In [13]:
def confidence_interval(scores, confidence=0.95):
    """Return the mean of ``scores`` with a normal-approximation confidence interval.

    The half-width is ``z * std / sqrt(n)``, where ``z`` is the two-sided
    standard-normal critical value for ``confidence`` (about 1.96 for 0.95).

    Parameters
    ----------
    scores : array-like of float
        Scores to summarise, e.g. per-fold cross-validation scores.
    confidence : float, default=0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(mean, lower, upper)``.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.
    """
    if not 0.0 < confidence < 1.0:
        raise ValueError(f"confidence must be in (0, 1), got {confidence!r}")

    mean = np.mean(scores)
    std = np.std(scores)  # population standard deviation (ddof=0)
    n = len(scores)

    # Derive the critical value from the requested level instead of assuming
    # 95%: the upper-tail quantile sits at 0.5 + confidence / 2.
    z = NormalDist().inv_cdf(0.5 + confidence / 2.0)
    margin = z * std / np.sqrt(n)
    return mean, mean - margin, mean + margin


In [14]:
def plot_learning_curves(X, y, models):
    """Plot the mean cross-validated test score of each model against training size.

    Each model is evaluated with ``learning_curve`` using ``cv=5`` at five
    training-set fractions evenly spaced from 10% to 100%. Only the test
    scores are drawn; the training scores returned by ``learning_curve`` are
    not plotted.

    Parameters
    ----------
    X, y
        Features and labels passed straight to ``learning_curve``.
    models : dict
        Maps a display name (used as the legend label) to an estimator.
    """
    plt.figure(figsize=(10, 6))

    for name, model in models.items():
        train_sizes, _, test_scores = learning_curve(
            model, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 5)
        )

        # Average over the CV folds (axis 1) to get one point per training size.
        plt.plot(train_sizes, test_scores.mean(axis=1), label=name)

    plt.title("Learning Curves")
    plt.xlabel("Training Size")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid()
    plt.show()

In [15]:
def plot_validation_curves(X, y):
    """Create validation curves for key hyperparameters.

    Two sweeps are drawn on one figure, each with ``validation_curve`` and
    ``cv=5`` on a scaler + classifier pipeline:

    * logistic regression over ``C``
    * SVC over ``gamma``

    Only the mean test score per parameter value is plotted. The x-axis is
    logarithmic because both ranges span several orders of magnitude; on a
    linear axis the small values would collapse onto the left edge.

    Parameters
    ----------
    X, y
        Features and labels passed straight to ``validation_curve``.
    """
    # (legend label, classifier, pipeline parameter name, values to try).
    # Keeping each range in one place guarantees the x-values plotted are the
    # same values that were evaluated.
    sweeps = [
        ("LogReg C", LogisticRegression(max_iter=500), "clf__C",
         [0.01, 0.1, 1, 10, 100]),
        ("SVM gamma", SVC(), "clf__gamma",
         [0.001, 0.01, 0.1, 1]),
    ]

    plt.figure(figsize=(10, 6))

    for label, classifier, param_name, param_range in sweeps:
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", classifier)
        ])
        _, test_scores = validation_curve(
            pipeline,
            X, y,
            param_name=param_name,
            param_range=param_range,
            cv=5
        )

        # Average over the CV folds (axis 1) to get one point per value.
        plt.plot(param_range, test_scores.mean(axis=1), label=label)

    plt.xscale("log")
    plt.title("Validation Curves")
    plt.xlabel("Hyperparameter Value")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid()
    plt.show()


In [None]:
def main():
    """Run the whole evaluation workflow.

    Loads the data, cross-validates every model, prints a 95% confidence
    interval for each model's K-fold scores, draws the learning and
    validation curves, and finally prints usage recommendations.
    """
    features, labels = load_data()
    candidates = get_models()

    cv_results = evaluate_cross_validation(features, labels, candidates)

    print("\nConfidence Intervals (KFold)")
    print("=" * 40)

    for model_name, per_scheme in cv_results.items():
        center, lower, upper = confidence_interval(per_scheme["KFold"])
        print(f"{model_name}: {center:.4f} (95% CI: {lower:.4f} – {upper:.4f})")

    plot_learning_curves(features, labels, candidates)
    plot_validation_curves(features, labels)

    recommendations = """
Recommendations:

K-Fold Cross-Validation
- Good general-purpose evaluation
- Works well when data is balanced and sufficient

Stratified K-Fold
- Preferred for imbalanced classification problems
- Preserves class distribution in each fold

Leave-One-Out CV
- Useful for very small datasets
- Computationally expensive for large datasets

Learning Curves
- Diagnose bias vs variance
- Detect overfitting or underfitting

Validation Curves
- Identify optimal hyperparameter values
- Reveal sensitivity to parameter changes
"""
    print(recommendations)


# Entry point: run the workflow only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()



Cross-Validation Results

Logistic Regression
KFold Mean: 0.9771 ± 0.0090
Stratified Mean: 0.9737 ± 0.0166
LOOCV Mean: 0.9789 ± 0.1437

Random Forest
