# In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

# --- 1. Use a Real-World Dataset ---
# Fetch the California housing data and wrap it in pandas objects:
# X is the feature table (columns named from the dataset's own feature
# list) and y is the regression target, labelled "MedHouseVal".
print("Loading dataset...")
housing = fetch_california_housing()
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)
y = pd.Series(data=housing.target, name="MedHouseVal")

# --- 2. Perform Data Cleaning ---
print("\n--- Data Cleaning ---")
# No cleaning step is applied (the dataset is treated as pre-cleaned);
# we only report how many missing entries each feature column contains.
missing_per_column = X.isnull().sum()
print(f"Missing values per column:\n{missing_per_column}")

# --- 3. Perform Feature Engineering ---
print("\n--- Feature Engineering ---")
# Append two row-wise ratio features built from existing columns:
#   Rooms_per_Household = AveRooms  / AveOccup
#   Bedrms_per_Room     = AveBedrms / AveRooms
# Both ratios use only values from the same row, so computing them before
# the train/test split does not mix information between rows.
# NOTE(review): AveRooms appears to already be a per-household average, so
# the first ratio looks like "rooms per occupant" -- confirm against the
# dataset description. The column name is kept as-is for compatibility.
X = X.assign(
    Rooms_per_Household=X['AveRooms'] / X['AveOccup'],
    Bedrms_per_Room=X['AveBedrms'] / X['AveRooms'],
)
print("Added new features 'Rooms_per_Household' and 'Bedrms_per_Room'.")

# --- 4. Split Data ---
# Reserve 20% of the rows as a hold-out test set (seeded for repeatability).
# K-fold validation runs on the *training* portion only; the *test*
# portion is kept aside for a single evaluation at the very end.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining/Validation set size: {len(X_train)} samples")
print(f"Final Hold-Out Test set size: {len(X_test)} samples")

# --- 5. Create Model Pipeline ---
# Scaling and regression are bundled into a single estimator so that,
# during cross-validation, the scaler is re-fitted on the training part
# of every fold. Fitting the scaler once on all rows would leak the
# validation fold's statistics into training.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# --- 6. Model Evaluation with K-Fold Cross-Validation ---
print("\n--- K-Fold Cross-Validation ---")
# Define the K-fold strategy. shuffle=True mixes the rows before splitting,
# and the fixed random_state keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score handles the whole loop for us:
# 1. Splits the training data according to `kf`.
# 2. Fits `pipeline` on the training part of each fold.
# 3. Scores `pipeline` on the validation part of that fold.
# 4. Returns one score per fold.
# scikit-learn scorers follow a "higher is better" convention, so the error
# metric is exposed as 'neg_mean_squared_error'; the sign is flipped below.
neg_mse_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=kf,
    scoring='neg_mean_squared_error',
)

# Negated MSE -> positive MSE -> RMSE (same units as the target).
rmse_scores = np.sqrt(-neg_mse_scores)

# Take the fold count from the scores that were actually computed, so the
# message cannot drift out of sync if n_splits above is changed.
n_folds = len(rmse_scores)

print(f"K-Fold RMSE scores for each of the {n_folds} folds: {np.round(rmse_scores, 4)}")
print(f"Average K-Fold RMSE (on training data): {np.mean(rmse_scores):.4f}")
print(f"Std. Deviation of K-Fold RMSE: {np.std(rmse_scores):.4f}")
print("\n(This average score is a robust estimate of our model's performance.)")

# --- 7. Final Model Training and Evaluation on Test Set ---
print("\n--- Final Model Training & Test Set Evaluation ---")
# Cross-validation only *estimated* performance; the final model is the
# pipeline fitted once on the entire training set.
pipeline.fit(X_train, y_train)
print("Final model trained on all training data.")

# Evaluate that single fitted model on the held-back test set, which took
# no part in cross-validation or in this fit.
y_pred = pipeline.predict(X_test)

final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)

print("\nModel Performance on *Final Test Set*:")
print(f"Root Mean Squared Error (RMSE): {final_rmse:.4f}")

# Only claim that the test score confirms the cross-validation estimate when
# the two actually agree. The tolerance is a relative gap, expressed as a
# fraction of the K-fold average; 10% is a deliberately loose sanity bound.
RMSE_TOLERANCE = 0.10
cv_mean_rmse = np.mean(rmse_scores)
rmse_gap = abs(final_rmse - cv_mean_rmse)

if rmse_gap <= RMSE_TOLERANCE * cv_mean_rmse:
    print(f"(This confirms our K-Fold average: {cv_mean_rmse:.4f} is close to our final test score: {final_rmse:.4f})")
else:
    print(
        f"(Warning: our K-Fold average: {cv_mean_rmse:.4f} and our final test score: "
        f"{final_rmse:.4f} differ by {rmse_gap:.4f}, more than {RMSE_TOLERANCE:.0%} of the "
        f"K-Fold average; the cross-validation estimate may not be reliable.)"
    )
