In [None]:
# Perform Bias Variance decompositions for the datasets – California housing, Iris and
# randomly generated data.



import numpy as np
from sklearn.datasets import fetch_california_housing, load_iris
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

# Helper function for bias-variance decomposition (regression)
def bias_variance_decomp_reg(model, X, y, n_rounds=50, test_size=0.3):
    y_preds = []
    y_tests = []
    for _ in range(n_rounds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_preds.append(y_pred)
        y_tests.append(y_test)
    y_preds = np.array(y_preds)
    y_tests = np.array(y_tests)
    avg_pred = np.mean(y_preds, axis=0)
    avg_true = np.mean(y_tests, axis=0)
    bias = np.mean((avg_pred - avg_true) ** 2)
    variance = np.mean(np.var(y_preds, axis=0))
    mse = np.mean((y_preds - y_tests) ** 2)
    return bias, variance, mse

# Helper function for bias-variance decomposition (classification)
def bias_variance_decomp_clf(model, X, y, n_rounds=50, test_size=0.3):
    accs = []
    for _ in range(n_rounds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accs.append(accuracy_score(y_test, y_pred))
    avg_acc = np.mean(accs)
    variance = np.var(accs)
    bias = 1 - avg_acc
    return bias, variance, avg_acc

# 1. California Housing (Regression)
california = fetch_california_housing()
X_cal, y_cal = california.data, california.target
lr = LinearRegression()
bias_cal, var_cal, mse_cal = bias_variance_decomp_reg(lr, X_cal, y_cal)
print(f"California Housing (Linear Regression):\n  Bias: {bias_cal:.4f}\n  Variance: {var_cal:.4f}\n  MSE: {mse_cal:.4f}\n")

# 2. Iris (Classification)
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
clf = LogisticRegression(max_iter=200)
bias_iris, var_iris, acc_iris = bias_variance_decomp_clf(clf, X_iris, y_iris)
print(f"Iris (Logistic Regression):\n  Bias: {bias_iris:.4f}\n  Variance: {var_iris:.4f}\n  Accuracy: {acc_iris:.4f}\n")

# 3. Randomly generated data (Regression)
np.random.seed(0)
X_rand = np.random.rand(200, 3)
y_rand = 5 * X_rand[:, 0] - 2 * X_rand[:, 1] + 3 * X_rand[:, 2] + np.random.randn(200)
dt = DecisionTreeRegressor()
bias_rand, var_rand, mse_rand = bias_variance_decomp_reg(dt, X_rand, y_rand)
print(f"Random Data (Decision Tree Regression):\n  Bias: {bias_rand:.4f}\n  Variance: {var_rand:.4f}\n  MSE: {mse_rand:.4f}\n")

California Housing (Linear Regression):
  Bias: 0.0141
  Variance: 0.9702
  MSE: 0.7045

Iris (Logistic Regression):
  Bias: 0.0396
  Variance: 0.0006
  Accuracy: 0.9604

Random Data (Decision Tree Regression):
  Bias: 0.0537
  Variance: 5.0691
  MSE: 2.5090

