In [1]:
# ==========================================
# 1. IMPORT LIBRARIES
# ==========================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# ==========================================
# 2. LOAD DATA
# ==========================================
url = "https://github.com/Patilanuj/ML-models/raw/refs/heads/main/Linear_Regression_California_Housing_Prices/housing.csv"
df = pd.read_csv(url)

In [3]:
# ==========================================
# 3. CREATE STRATIFICATION COLUMN
# ==========================================
df['income_cat'] = pd.cut(
    df['median_income'],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5]
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(df, df["income_cat"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

In [4]:
# ==========================================
# 4. SPLIT FEATURES & LABELS
# ==========================================
X_train = strat_train_set.drop(["median_house_value", "income_cat"], axis=1)
y_train = strat_train_set["median_house_value"].copy()

X_test = strat_test_set.drop(["median_house_value", "income_cat"], axis=1)
y_test = strat_test_set["median_house_value"].copy()

In [5]:
# ==========================================
# 5. PIPELINE (NUMERIC + CATEGORICAL)
# ==========================================
num_attribs = list(X_train.drop("ocean_proximity", axis=1))
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ("imputer", KNNImputer(n_neighbors=5)),
    ("scaler", StandardScaler())
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

In [18]:
# ==========================================
# 6. TRAIN MODELS
# ==========================================
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=20),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}


results = {}

for name, model in models.items():
    
    model.fit(X_train_prepared, y_train)
    
    train_pred = model.predict(X_train_prepared)
    test_pred = model.predict(X_test_prepared)
    
    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    
    gap = test_rmse - train_rmse
    
    # Simple logic for fit diagnosis
    if gap > 0.2 * train_rmse:
        status = "Overfitting"
    elif train_rmse > test_rmse * 1.2:
        status = "Underfitting"
    else:
        status = "Good Fit"
    
    print("======================================")
    print(name)
    print("Train RMSE:", round(train_rmse, 2))
    print("Test RMSE:", round(test_rmse, 2))
    print("Gap:", round(gap, 2))
    print("Model Status:", status)
    print("======================================\n")


Linear Regression
Train RMSE: 68941.14
Test RMSE: 67260.26
Gap: -1680.88
Model Status: Good Fit

Decision Tree
Train RMSE: 0.0
Test RMSE: 68604.93
Gap: 68604.93
Model Status: Overfitting

Random Forest
Train RMSE: 18324.32
Test RMSE: 47271.25
Gap: 28946.93
Model Status: Overfitting



In [19]:
# ==========================================
# 8. CROSS VALIDATION (Linear Regression)
# ==========================================
scores = cross_val_score(
    models["Linear Regression"],
    X_train_prepared,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10
)

In [22]:
print(scores)

[-5.19395495e+09 -4.25153879e+09 -4.54702548e+09 -4.75301555e+09
 -4.46122080e+09 -5.33711127e+09 -4.96234191e+09 -4.82060361e+09
 -4.47418682e+09 -4.98797673e+09]
