In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_regression

# Function to remove duplicates from the dataset
def remove_duplicates(df):
    """Return a copy of ``df`` without repeated rows.

    Rows are compared across all columns; the first occurrence of each
    distinct row is kept and its original index label is preserved.
    The input dataframe itself is not modified.
    """
    deduplicated = df.drop_duplicates(keep="first")
    return deduplicated

# Helper: make inputs safe for positional (row-number) indexing
def _as_positional(data):
    """Return ``data`` in a form where ``data[int_array]`` selects rows.

    pandas objects index by label with ``[]`` and plain lists/tuples do not
    accept index arrays at all, so both are converted to NumPy arrays.
    Anything else is returned unchanged.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.to_numpy()
    if isinstance(data, (list, tuple)):
        return np.asarray(data)
    return data


# Function for K-Fold Cross Validation
def cross_validate(model, X, y, k=5):
    """Perform K-Fold Cross Validation and return the MSE of each fold.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Refit in place on every fold; after this function returns it holds
        the fit from the last fold only.
    X : array-like
        Feature rows. NumPy arrays, pandas DataFrames and lists are accepted.
    y : array-like
        Target values aligned row-for-row with ``X``.
    k : int, default 5
        Number of folds handed to ``KFold``.

    Returns
    -------
    numpy.ndarray
        One mean squared error per fold, in fold order.
    """
    # Fold indices from KFold are row positions, so index positionally.
    X = _as_positional(X)
    y = _as_positional(y)

    # Shuffled splits with a fixed seed keep the folds reproducible.
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    mse_list = []

    for train_index, test_index in kf.split(X):
        # Train on the k-1 training folds
        model.fit(X[train_index], y[train_index])

        # Score on the held-out fold
        y_pred = model.predict(X[test_index])
        mse_list.append(mean_squared_error(y[test_index], y_pred))

    return np.array(mse_list)

# Function to calculate Bias and Variance
def calculate_bias_variance(model, X, y, mse_list):
    """Return the ``(bias, variance)`` summary pair for a fitted model.

    As computed here:

    * ``bias`` is the mean of the squared differences between
      ``model.predict(X)`` and ``y``, taken over every sample passed in.
    * ``variance`` is ``np.var`` of ``mse_list``, i.e. the spread of the
      per-fold errors.

    ``model`` must already be fitted; it is only used for prediction.
    """
    # Squared residuals of the model's predictions on the supplied samples
    residuals = model.predict(X) - y
    mean_squared_residual = np.mean(residuals ** 2)

    # Spread of the fold-level errors
    fold_error_spread = np.var(mse_list)

    return mean_squared_residual, fold_error_spread

# Build a synthetic regression problem: 1000 samples, 5 features, light noise
X, y = make_regression(n_samples=1000, n_features=5, noise=0.1, random_state=42)

# Keep features and target in one frame so a duplicate row is dropped as a whole
df = remove_duplicates(pd.DataFrame(X).assign(target=y))

# Pull the de-duplicated target and feature matrix back out as NumPy arrays
y_clean = df['target'].to_numpy()
X_clean = df.drop(columns='target').to_numpy()

# Plain least-squares model; cross_validate refits it on every fold
model = LinearRegression()

# Per-fold mean squared errors from 5-fold cross-validation
mse_list = cross_validate(model, X_clean, y_clean, k=5)

# Summary statistics from the model as left fitted by the last fold
bias, variance = calculate_bias_variance(model, X_clean, y_clean, mse_list)

# Report the results
print(f"Mean Squared Error across folds: {mse_list}")
print(f"Bias: {bias}")
print(f"Variance: {variance}")


Mean Squared Error across folds: [0.01109061 0.01092185 0.01058418 0.00967292 0.01158196]
Bias: 0.010705117089238676
Variance: 4.0464547414812193e-07
