In [9]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Load the California housing dataset
data = fetch_california_housing(as_frame=True)
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the number of bins and the binning feature
num_bins = 5
binning_feature = 'AveRooms'  # Average number of rooms

# Handle missing values
imputer = SimpleImputer()
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Calculate bin boundaries based on the feature values
bin_boundaries = np.linspace(X_train[binning_feature].min(), X_train[binning_feature].max(), num_bins+1)

# Perform binning on the training set
X_train_binned = pd.cut(X_train[binning_feature], bins=bin_boundaries, labels=False, include_lowest=True)

# Perform binning on the testing set
X_test_binned = pd.cut(X_test[binning_feature], bins=bin_boundaries, labels=False, include_lowest=True)

# Train a linear regression model on the original data
model_original = LinearRegression()
model_original.fit(X_train, y_train)

# Make predictions on the original test set
y_pred_original = model_original.predict(X_test)

# Train a linear regression model on the binned data
model_binned = LinearRegression()
model_binned.fit(X_train_binned.values.reshape(-1, 1), y_train)

# Make predictions on the binned test set
y_pred_binned = model_binned.predict(X_test_binned.values.reshape(-1, 1))

# Calculate mean squared error for both models
mse_original = mean_squared_error(y_test, y_pred_original)
mse_binned = mean_squared_error(y_test, y_pred_binned)

print("MSE before binning:", mse_original)
print("MSE after binning:", mse_binned)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values