Title: Cross-Validation


Task 1: K-Fold Cross-Validation for House Prices<br>
Apply K-Fold Cross-Validation (K=5) to measure how much the model's performance (RMSE) varies from fold to fold.

In [7]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

# Small synthetic house-price table: two numeric features and the sale price
data = dict(
    size_sqft=[1500, 1800, 2400, 3000, 3500, 2000, 2800, 2300, 1700, 2600],
    bedrooms=[3, 4, 3, 5, 4, 3, 4, 3, 2, 4],
    price=[400000, 500000, 600000, 650000, 700000, 450000, 620000, 580000, 420000, 610000],
)

df = pd.DataFrame(data)

# Features are every column except the target; the target is the price column
X = df.drop(columns='price')
y = df.loc[:, 'price']

# Define model: a LinearRegression estimator with default settings.
# It is the estimator handed to cross_val_score further down in this cell.
model = LinearRegression()

# RMSE metric (plain, positive RMSE; any sign flip happens in the scorer wrapper)
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The value is the square root of ``mean_squared_error(y_true, y_pred)``,
    so it is expressed in the same units as the target.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Wrap rmse as a scorer; greater_is_better=False marks it as a loss,
# so the scores returned by cross-validation come back negated.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# 5-fold splitter, shuffled with a fixed seed so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One (negated) RMSE value per fold
neg_rmse_scores = cross_val_score(estimator=model, X=X, y=y, scoring=rmse_scorer, cv=kf)

# Flip the sign back to get ordinary positive RMSE values
rmse_scores = np.negative(neg_rmse_scores)

mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

print("RMSE scores for each fold:", rmse_scores)
print(f"Mean RMSE: {mean_rmse:.2f}")
print(f"Standard Deviation of RMSE: {std_rmse:.2f}")


RMSE scores for each fold: [40786.38421711 55842.81165457 67152.44062873 35752.38027593
 22972.1335361 ]
Mean RMSE: 44501.23
Standard Deviation of RMSE: 15463.20


Task 2: Stratified K-Fold for Imbalanced Churn Dataset<br>
Use Stratified K-Fold so that every fold preserves the class proportions of the full dataset, ensuring both churn classes are represented in each split.

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Synthetic imbalanced churn dataset (15 customers: 9 stayed, 6 churned)
data = dict(
    age=[25, 45, 35, 50, 23, 40, 60, 48, 33, 36, 30, 28, 42, 55, 29],
    monthly_charges=[70, 90, 80, 75, 60, 85, 95, 77, 66, 73, 68, 72, 88, 92, 65],
    tenure_months=[12, 24, 18, 36, 8, 15, 40, 22, 10, 13, 20, 14, 26, 38, 16],
    # 0 = no churn, 1 = churn (imbalanced)
    churn=[0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0],
)

df = pd.DataFrame(data)

# Plain NumPy arrays so the fold indices can be used for positional indexing
X = df.drop(columns='churn').to_numpy()
y = df['churn'].to_numpy()

# Initialize StratifiedKFold: shuffled with a fixed seed for reproducible folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold test accuracies, in fold order
accuracies = []

# enumerate(start=1) supplies the 1-based fold label, so no counter has to be
# maintained by hand inside the loop.
for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a fresh classifier on this fold's training split
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict and evaluate on the held-out split
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    print(f"Fold {fold} Accuracy: {acc:.2f}")

# Report the number of folds actually evaluated instead of a hard-coded literal,
# so the message stays correct if n_splits is changed above.
print(f"\nMean Accuracy over {len(accuracies)} folds: {np.mean(accuracies):.2f}")


Fold 1 Accuracy: 0.67
Fold 2 Accuracy: 0.67
Fold 3 Accuracy: 0.67
Fold 4 Accuracy: 1.00
Fold 5 Accuracy: 0.67

Mean Accuracy over 5 folds: 0.73


Task 3: Leave-One-Out Cross-Validation for Iris<br>
Use Leave-One-Out Cross-Validation (LOOCV) to assess the model's prediction accuracy on the Iris dataset.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import LeaveOneOut
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the Iris feature matrix and class labels
iris = load_iris()
X, y = iris.data, iris.target

# Leave-One-Out: every split holds out exactly one sample for testing
loo = LeaveOneOut()

# Held-out labels and the corresponding predictions, collected split by split
y_true = []
y_pred = []

for train_index, test_index in loo.split(X):
    # Fit a fresh tree on everything except the held-out sample
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X[train_index], y[train_index])

    # test_index has a single entry: the left-out sample
    held_out_label = y[test_index][0]
    held_out_prediction = model.predict(X[test_index])[0]

    y_true.append(held_out_label)
    y_pred.append(held_out_prediction)

# Overall accuracy across all leave-one-out predictions
accuracy = accuracy_score(y_true, y_pred)
print(f"LOOCV Accuracy on Iris dataset: {accuracy:.2f}")


LOOCV Accuracy on Iris dataset: 0.94
