In [4]:
#Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor




In [5]:
# Read the two engineered training tables (one per feature-engineering variant)
target_col = "SalePrice"
linear_df = pd.read_csv("Engineered/train_linear_ready.csv")
xgb_df = pd.read_csv("Engineered/train_xgb_ready.csv")

# Separate predictors (X) from the target (y) for each table
X1, y1 = linear_df.drop(columns=target_col), linear_df[target_col]
X2, y2 = xgb_df.drop(columns=target_col), xgb_df[target_col]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_options = {"test_size": 0.2, "random_state": 42}
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, **split_options)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, **split_options)

In [11]:
# Hyperparameters shared by both gradient-boosting regressors:
#   n_estimators  - number of boosting rounds (i.e., how many trees are built)
#   learning_rate - shrinks the contribution of each tree
#   random_state  - fixed seed for reproducibility
boosting_params = {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42}

# One estimator per model family being compared
lr_model = LinearRegression()
xgb_model = XGBRegressor(**boosting_params)
lgbm_model = LGBMRegressor(**boosting_params)

# Evaluation function
def evaluate(name, model, X_train, y_train, X_test, y_test, feature_set):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    name : label stored under the "Model" key of the result.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place, so the caller's object is left trained on ``X_train``.
    X_train, y_train : data used for fitting and for the train metrics.
    X_test, y_test : data used only for the test metrics.
    feature_set : label stored under the "FeatureSet" key of the result.

    Returns
    -------
    dict with keys "Model", "FeatureSet", "Train_RMSE", "Test_RMSE",
    "Train_R2" and "Test_R2" (in that order).
    """
    def _rmse(actual, predicted):
        # Root of the mean squared error, i.e. error in the target's own units
        return np.sqrt(mean_squared_error(actual, predicted))

    model.fit(X_train, y_train)
    fit_pred = model.predict(X_train)
    holdout_pred = model.predict(X_test)

    # Keys are inserted in the order the summary table should show its columns
    report = {"Model": name, "FeatureSet": feature_set}
    report["Train_RMSE"] = _rmse(y_train, fit_pred)
    report["Test_RMSE"] = _rmse(y_test, holdout_pred)
    report["Train_R2"] = r2_score(y_train, fit_pred)
    report["Test_R2"] = r2_score(y_test, holdout_pred)
    return report


# Train & collect results: every model is fitted on every feature set,
# producing one metrics row per (model, feature set) pair.
split_by_feature_set = {
    "X1": (X1_train, y1_train, X1_test, y1_test),
    "X2": (X2_train, y2_train, X2_test, y2_test),
}
named_models = [
    ("LinearRegression", lr_model),
    ("XGBRegressor", xgb_model),
    ("LGBMRegressor", lgbm_model),
]

# Model is the outer loop and feature set the inner one, so rows come out as
# LinearRegression/X1, LinearRegression/X2, XGBRegressor/X1, ...
results = [
    evaluate(label, estimator, *split_by_feature_set[tag], tag)
    for label, estimator in named_models
    for tag in ("X1", "X2")
]

# Summary table
results_df = pd.DataFrame(results)
print(results_df)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 965
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 10
[LightGBM] [Info] Start training from score 181441.541952
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1075
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 10
[LightGBM] [Info] Start training from score 181441.541952
              Model FeatureSet    Train_RMSE     Test_RMSE  Train_R2   Test_R2
0  LinearRegression         X1  36942.399110  38556.853438  0.771191  0.806184
1  LinearRegression         X2  37774.633877  39358.543250  0.760766  0.798041
2      XGBRegressor         X1  10385.202550  30136.292008  0.981918  0.

In [14]:
from sklearn.model_selection import cross_val_score, cross_validate, KFold

# 5-fold cross-validation splitter; shuffled with a fixed seed so every
# (model, feature set) pair is scored on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Map the model name stored in results_df to an unfitted estimator.
# scikit-learn clones the estimator for every fold, so these shared
# instances are never fitted and can safely be reused across rows.
model_map = {
    "LinearRegression": LinearRegression(),
    "XGBRegressor": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "LGBMRegressor": LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Compute mean CV metrics for each row of the results table.
# Collected in lists (row order) so the new columns end up with a float dtype
# instead of the object dtype produced by pre-filling them with None.
cv_rmse_means = []
cv_r2_means = []
for model_name, feature_set in zip(results_df["Model"], results_df["FeatureSet"]):
    # Pick the features and target matching this row's feature set
    X, y = (X1, y1) if feature_set == "X1" else (X2, y2)

    # One cross-validation pass yields both metrics, so each model is fitted
    # once per fold rather than once per fold per metric.
    cv_scores = cross_validate(
        model_map[model_name], X, y, cv=kf,
        scoring=["r2", "neg_root_mean_squared_error"],
    )

    cv_r2_means.append(cv_scores["test_r2"].mean())
    # The scorer reports negated RMSE (higher is better); flip the sign back
    cv_rmse_means.append(-cv_scores["test_neg_root_mean_squared_error"].mean())

# Attach the CV columns and order the table as RMSE metrics, then R2 metrics.
# assign() overwrites existing CV columns, so re-running this cell is safe.
results_df = results_df.assign(CV_RMSE=cv_rmse_means, CV_R2=cv_r2_means)[[
    "Model", "FeatureSet",
    "Train_RMSE", "Test_RMSE", "CV_RMSE",
    "Train_R2", "Test_R2", "CV_R2"
]]

# Show table
print(results_df)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 965
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 10
[LightGBM] [Info] Start training from score 181441.541952
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 957
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 10
[LightGBM] [Info] Start training from score 179651.292808
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 950
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 10
[LightGBM] [Info] Start

In [15]:
# Write the complete metrics table to a CSV file (row index omitted);
# the relative path resolves against the current working directory.
results_csv_path = "Model_Evaluation_Results.csv"
results_df.to_csv(results_csv_path, index=False)
