In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error,mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# **Model Building**

In [45]:
from model_collection import LinearRegression, RidgeRegression, XGBoostRegressor

In [46]:
# Read the housing table from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="house_data.csv")

In [47]:
# Target is the "Price" column; every remaining column is a predictor.
y = data["Price"]
X = data.drop("Price", axis=1)

In [48]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Standardise the features with statistics learned from the training rows only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# log1p-transform the target for both splits.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

In [50]:
# Collects one [RMSE, MAE, R²] entry per model, keyed by model name.
results = dict()

## **Linear Regression**

In [52]:
# Fit the project's LinearRegression (imported from model_collection) on the
# standardised training features against the log1p-transformed price.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train_log)

LinearRegression()

In [53]:
# Predictions for the standardised test features; the model was fit on
# y_train_log, so these are on the log1p(price) scale.
y_pred_lr = lr.predict(X_test_scaled)

In [54]:
# Score the linear model's predictions against the log-scale test targets.
rmse, mae, r2 = (
    score_fn(y_test_log, y_pred_lr)
    for score_fn in (root_mean_squared_error, mean_absolute_error, r2_score)
)

In [55]:
# Record the linear-regression scores in the shared comparison dict.
results.update({"Linear Regression": [rmse, mae, r2]})

In [56]:
# One-line summary of the linear-regression test scores.
print(
    f"Linear Regression — RMSE: {rmse:.3f}, "
    f"MAE: {mae:.3f}, R²: {r2:.3f}"
)

Linear Regression — RMSE: 1.273, MAE: 0.748, R²: 0.421


## **Ridge Regression**

In [58]:
# Fit the project's RidgeRegression (imported from model_collection) with
# alpha=1.0 on the same standardised features and log1p price target as the
# linear model.
ridge = RidgeRegression(alpha=1.0)  
ridge.fit(X_train_scaled, y_train_log)


RidgeRegression(alpha=1.0)

In [59]:
# Ridge predictions for the standardised test features; the model was fit on
# y_train_log, so these are on the log1p(price) scale.
y_pred_ridge = ridge.predict(X_test_scaled)

In [60]:
# Score the ridge predictions against the log-scale test targets.
rmse_ridge, mae_ridge, r2_ridge = (
    score_fn(y_test_log, y_pred_ridge)
    for score_fn in (root_mean_squared_error, mean_absolute_error, r2_score)
)

In [61]:
# Record the ridge-regression scores in the shared comparison dict.
results.update({"Ridge Regression": [rmse_ridge, mae_ridge, r2_ridge]})

In [62]:
# One-line summary of the ridge-regression test scores.
print(
    f"Ridge Regression — RMSE: {rmse_ridge:.3f}, "
    f"MAE: {mae_ridge:.3f}, R²: {r2_ridge:.3f}"
)

Ridge Regression — RMSE: 1.272, MAE: 0.751, R²: 0.422


## XGBoost Regression

In [64]:
class MeanSquaredError:
    """Squared-error objective exposing a loss, a gradient and a hessian.

    ``loss`` is the mean of the squared residuals. ``gradient`` and
    ``hessian`` are the element-wise first and second derivatives of the
    squared residual ``(y_true - y_pred) ** 2`` with respect to ``y_pred``
    (they are not divided by the number of samples).
    """

    def loss(self, y_true, y_pred):
        """Return the mean of ``(y_true - y_pred) ** 2`` as a scalar."""
        return np.mean((y_true - y_pred) ** 2)

    def gradient(self, y_true, y_pred):
        """Return the element-wise first derivative, ``2 * (y_pred - y_true)``."""
        return 2 * (y_pred - y_true)

    def hessian(self, y_true, y_pred):
        """Return a float64 array shaped like ``y_true`` filled with 2.0.

        The dtype is forced to float64: without it ``np.full_like`` inherits
        the dtype of ``y_true``, so integer targets would yield an integer
        hessian.
        """
        return np.full_like(y_true, 2.0, dtype=np.float64)


In [65]:
# Hyper-parameters handed to XGBoostRegressor for the first (untuned) fit.
params = dict(
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    reg_lambda=1.0,
    gamma=0.0,
    min_child_weight=1.0,
    colsample_bynode=1.0,
    base_score=0.5,
)


In [66]:
# Initialize model
# XGBoostRegressor is the project's implementation imported from
# model_collection; it receives the `params` dict and a fixed seed.
model = XGBoostRegressor(params=params, random_seed=42)

# Fit the model
# NOTE(review): this fits on the unscaled X_train and the raw y_train, whereas
# the linear and ridge models were fit on X_train_scaled / y_train_log, so the
# resulting error metrics are on a different scale — confirm this is intended.
model.fit(X_train, y_train, objective=MeanSquaredError(), num_boost_round=30, verbose=True)

# Predict
# Predictions for the unscaled test features (raw price scale, given the fit above).
y_pred = model.predict(X_test)

  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


[0] train loss = 1849418601045118.2
[1] train loss = 1541324430690183.8
[2] train loss = 1293849179045754.0
[3] train loss = 1090870865458471.9
[4] train loss = 928041661873180.4
[5] train loss = 795055826344439.1
[6] train loss = 684839239336238.0
[7] train loss = 593887986867274.0
[8] train loss = 519589397929973.4
[9] train loss = 459952682652591.94
[10] train loss = 409923397689845.2
[11] train loss = 368137671009084.94
[12] train loss = 334961595709049.2
[13] train loss = 308052102690052.3
[14] train loss = 284829259885125.56
[15] train loss = 263133573931749.53
[16] train loss = 247341143549512.94
[17] train loss = 233439817195526.22
[18] train loss = 222551745424720.94
[19] train loss = 212145528524863.22
[20] train loss = 204932270163526.4
[21] train loss = 198220346491703.44
[22] train loss = 192430961847823.47
[23] train loss = 187226745150254.97
[24] train loss = 182735087027094.44
[25] train loss = 178309803539712.2
[26] train loss = 174491648560483.16
[27] train loss = 171

In [67]:
# Score the first XGBoost fit on the held-out test set (raw price scale).
# root_mean_squared_error replaces mean_squared_error(..., squared=False):
# the `squared` flag is deprecated since scikit-learn 1.4 and removed in 1.6,
# and root_mean_squared_error is what the linear and ridge cells already use.
rmse_xg = root_mean_squared_error(y_test, y_pred)
mae_xg = mean_absolute_error(y_test, y_pred)
r2_xg = r2_score(y_test, y_pred)

print(f"RMSE: {rmse_xg:.3f}, MAE: {mae_xg:.3f}, R²: {r2_xg:.3f}")


RMSE: 12997664.760, MAE: 6340463.189, R²: 0.652




## Hyperparameter Tuning (Grid Search on a Hold-out Validation Split)

In [69]:
from itertools import product

# Candidate values for each tuned hyper-parameter (2 x 2 x 2 x 2 = 16 combinations).
param_grid = dict(
    learning_rate=[0.05, 0.15],
    max_depth=[4, 5],
    reg_lambda=[1.0, 1.5],
    min_child_weight=[2, 3],
)

In [70]:
# Cartesian product of the candidate lists, as
# (learning_rate, max_depth, reg_lambda, min_child_weight) tuples.
grid = list(
    product(
        *(
            param_grid[name]
            for name in ("learning_rate", "max_depth", "reg_lambda", "min_child_weight")
        )
    )
)


In [71]:
# Carve a 20% validation fold out of the training data; X_test / y_test are not touched here.
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [72]:
# Exhaustive search over `grid`: fit one booster per combination on the
# training fold and keep the configuration with the highest R² on the
# validation fold.
best_r2 = float('-inf')
best_params = None
best_feature_importance = None

# Loop-local names are kept distinct from earlier notebook globals. The loop
# previously reused `lr` (the fitted LinearRegression), `params`, `model` and
# `r2`, so re-running an earlier cell after this one operated on the wrong objects.
for learning_rate, depth, reg, child_weight in grid:
    candidate_params = {
        'learning_rate': learning_rate,
        'max_depth': depth,
        'reg_lambda': reg,
        'min_child_weight': child_weight,
        'colsample_bynode': 1.0,
        'gamma': 0.0,
        'subsample': 1.0,
        'base_score': 0.5
    }

    candidate_model = XGBoostRegressor(candidate_params, random_seed=42)
    candidate_model.fit(X_train_split, y_train_split, objective=MeanSquaredError(), num_boost_round=50)
    y_val_pred = candidate_model.predict(X_val_split)

    val_r2 = r2_score(y_val_split, y_val_pred)
    print(f"lr={learning_rate}, depth={depth}, lambda={reg}, child={child_weight} => R² = {val_r2:.4f}")

    # Strict comparison: on ties the earlier combination is kept.
    if val_r2 > best_r2:
        best_r2 = val_r2
        best_params = candidate_params.copy()
        best_feature_importance = candidate_model.feature_importance_.copy()


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.0, child=2 => R² = 0.4439


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.0, child=3 => R² = 0.4821


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.5, child=2 => R² = 0.4486


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


KeyboardInterrupt: 

In [None]:
# Train best model on full training set
# Fail early with a clear message if the grid search never produced a winner.
if best_params is None:
    raise RuntimeError("best_params is None - run the grid-search cell before fitting the final model.")

final_model = XGBoostRegressor(params=best_params, random_seed=42)
final_model.fit(X_train, y_train, objective=MeanSquaredError(), num_boost_round=50)

# Predict on test set
y_pred = final_model.predict(X_test)

# Evaluate
# The metric functions are already imported in the notebook's import cell, so
# the mid-notebook re-import is dropped. root_mean_squared_error replaces
# mean_squared_error(..., squared=False), whose `squared` flag is deprecated
# since scikit-learn 1.4 and removed in 1.6.
rmse_xgb = root_mean_squared_error(y_test, y_pred)
mae_xgb = mean_absolute_error(y_test, y_pred)
r2_xgb = r2_score(y_test, y_pred)

print("\nFinal Test Results:")
print(f"Best Params: {best_params}")
print(f"RMSE: {rmse_xgb:.3f}, MAE: {mae_xgb:.3f}, R²: {r2_xgb:.3f}")


In [None]:
# The key drops the stray trailing colon so the row label matches the style of
# the "Linear Regression" and "Ridge Regression" entries.
# NOTE(review): these scores are computed against the raw y_test, while the
# linear and ridge entries are computed against y_test_log, so the RMSE and
# MAE rows of the comparison table are not on the same scale.
results["XGBoost (Tuned)"] = [rmse_xgb, mae_xgb, r2_xgb]

In [None]:
# One row per model, one column per metric, rounded to three decimals.
df_results = (
    pd.DataFrame(results, index=["RMSE", "MAE", "R²"])
    .transpose()
    .round(3)
)

print("\nModel Comparison:", df_results, sep="\n")

In [None]:
# Column labels of the training frame, used to translate importance keys.
feature_names = X_train.columns  # Adjust if needed

# Re-key the importance mapping by column name.
# Assumes the keys of best_feature_importance are positional column indices — TODO confirm.
importance_named = dict(
    (feature_names[idx], gain) for idx, gain in best_feature_importance.items()
)

# Largest importance first.
sorted_importance = sorted(importance_named.items(), key=lambda pair: pair[1], reverse=True)

# Ranked listing, numbered from 1.
print("\nTop Features by Importance (Total Gain):")
for i, (feature, score) in enumerate(sorted_importance, start=1):
    print(f"{i}. {feature:<25} : {score:.4f}")


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

def permutation_importance(model, X, y, metric=mean_squared_error, n_repeats=5, random_state=None):
    """Estimate feature importance by shuffling one column at a time.

    For every column of ``X`` the column's values are randomly permuted
    ``n_repeats`` times; each time the model is re-scored and the change
    relative to the unshuffled baseline, ``metric(shuffled) - metric(baseline)``,
    is recorded. With an error metric, a larger value therefore means the
    model relies more on that column.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method.
    X : pandas.DataFrame
        Feature frame; it is not modified (shuffling happens on a copy).
    y : array-like
        Targets on the same scale as ``model.predict`` output.
    metric : callable, default ``mean_squared_error``
        Called as ``metric(y, predictions)``.
    n_repeats : int, default 5
        Number of shuffles per column.
    random_state : int or None, default None
        Seed for a dedicated ``numpy.random.default_rng`` generator. ``None``
        keeps the previous behaviour of drawing from NumPy's global random
        state.

    Returns
    -------
    pandas.Series
        Mean score change per column, sorted in descending order.
    """
    # Both the np.random module and a Generator expose `.permutation`.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    baseline_score = metric(y, model.predict(X))

    importances = {}
    for col in X.columns:
        original_values = X[col].to_numpy()
        # One copy per column is enough: only `col` is overwritten, and every
        # repeat permutes the original values again.
        X_shuffled = X.copy()
        deltas = []
        for _ in range(n_repeats):
            X_shuffled[col] = rng.permutation(original_values)
            deltas.append(metric(y, model.predict(X_shuffled)) - baseline_score)
        importances[col] = np.mean(deltas)

    return pd.Series(importances).sort_values(ascending=False)

# Example
# final_model was fit on the raw prices (y_train), so its predictions must be
# scored against the raw y_test. Passing y_test_log compared raw-scale
# predictions with log-scale targets.
importances = permutation_importance(final_model, X_test, y_test)
print(importances)


# **Saving Model**

In [None]:
# import pickle

# # Bundle everything into a dictionary
# model_package = {
#     "model": final_model,
#     "best_params": best_params,
#     "feature_importance": best_feature_importance
# }

# # Save to file with your desired name
# with open("house_price.pkl", "wb") as f:
#     pickle.dump(model_package, f)

# print("Model, parameters, and feature importance saved as house_price.pkl.")
