In [2]:
# Objective: Perform regression using Linear Regression with K-Fold Cross Validation

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import warnings

# Install a catch-all "ignore" filter so the printed report stays uncluttered.
# NOTE: this silences every warning category, including deprecation notices;
# remove it while debugging.
warnings.simplefilter("ignore")

# --- 1. Load Dataset ---
# Fetch the California housing data and wrap it in labelled pandas containers:
# X holds the feature columns, y holds the regression target.
print("Loading dataset...")
housing = fetch_california_housing()
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)
y = pd.Series(data=housing.target, name="MedHouseVal")

# Quick sanity report: (rows, columns) and the feature names.
print(f"Dataset loaded successfully with shape: {X.shape}")
print(f"Features: {X.columns.tolist()}")

# --- 2. Data Cleaning ---
# Count the null entries in each feature column before any modelling.
print("\n--- Data Cleaning ---")
missing_per_column = X.isna().sum()
print(f"Missing values per column:\n{missing_per_column}")
# The recorded run reports zero nulls in every column, so nothing is imputed
# or dropped here.

# --- 3. Feature Engineering ---
# Add two ratio features built from existing columns; X is extended in place.
print("\n--- Feature Engineering ---")
# NOTE(review): if AveOccup is occupants per household, AveRooms / AveOccup is
# rooms per occupant rather than per household -- confirm the intended name.
X['Rooms_per_Household'] = X['AveRooms'].div(X['AveOccup'])
# Fraction of a block's rooms that are bedrooms.
X['Bedrms_per_Room'] = X['AveBedrms'].div(X['AveRooms'])
print("Added new features: 'Rooms_per_Household' and 'Bedrms_per_Room'")

# --- 4. Split Data ---
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
print(f"\nTraining samples: {n_train_rows}")
print(f"Testing samples: {n_test_rows}")

# --- 5. Model Pipeline ---
# Two named steps applied in order: standardize the features, then fit a
# linear regression. Because the scaler lives inside the pipeline, it is
# refit on whatever data the pipeline itself is fit on.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# --- 6. K-Fold Cross-Validation ---
print("\n--- K-Fold Cross-Validation ---")
# Five shuffled folds with a fixed seed, drawn from the training split only.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
neg_mse_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=kf,
)

# The scorer returns negated MSE, so flip the sign before taking the root.
rmse_scores = np.sqrt(np.negative(neg_mse_scores))
fold_rmse_rounded = rmse_scores.round(4)
fold_rmse_mean = rmse_scores.mean()
fold_rmse_std = rmse_scores.std()
print(f"RMSE for each fold: {fold_rmse_rounded}")
print(f"Average K-Fold RMSE: {fold_rmse_mean:.4f}")
print(f"Standard Deviation of RMSE: {fold_rmse_std:.4f}")

# --- 7. Final Model Training & Evaluation ---
# Refit the pipeline on the whole training split, score it once on the
# held-out test split, and compare that score with the cross-validation
# estimate computed earlier.
print("\n--- Final Model Training & Test Evaluation ---")
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Final Test RMSE: {final_rmse:.4f}")

# Check the agreement instead of claiming it unconditionally.
# Heuristic: the test RMSE counts as a match when it lies within two
# fold-to-fold standard deviations of the cross-validation mean.
cv_rmse_mean = np.mean(rmse_scores)
cv_rmse_gap = abs(final_rmse - cv_rmse_mean)
if cv_rmse_gap <= 2 * np.std(rmse_scores):
    cv_verdict = "close match ✅"
else:
    cv_verdict = f"differs by {cv_rmse_gap:.4f} ⚠️"
print(f"(K-Fold Avg RMSE ≈ {cv_rmse_mean:.4f}) → {cv_verdict}")

print("\n✅ End-to-End ML Project Completed Successfully!")


Loading dataset...
Dataset loaded successfully with shape: (20640, 8)
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

--- Data Cleaning ---
Missing values per column:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

--- Feature Engineering ---
Added new features: 'Rooms_per_Household' and 'Bedrms_per_Room'

Training samples: 16512
Testing samples: 4128

--- K-Fold Cross-Validation ---
RMSE for each fold: [0.6838 0.6861 0.6451 0.6808 0.713 ]
Average K-Fold RMSE: 0.6817
Standard Deviation of RMSE: 0.0216

--- Final Model Training & Test Evaluation ---
Final Test RMSE: 0.6753
(K-Fold Avg RMSE ≈ 0.6817) → close match ✅

✅ End-to-End ML Project Completed Successfully!
