In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error,mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV

# **Model Building**

In [4]:
from model_collection import LinearRegression, RidgeRegression, XGBoostRegressor

In [5]:
# Load the housing table from the CSV that sits next to the notebook.
data = pd.read_csv("house_data.csv")

In [6]:
# Split the frame into the regression target ("Price") and the remaining
# columns, which serve as predictors.
y = data["Price"]
X = data.drop(columns="Price")

In [7]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Accumulator for the comparison table: model name -> [RMSE, MAE, R2].
results = dict()

## **Linear Regression**

In [11]:
# Linear baseline using the project's LinearRegression (imported from
# model_collection), trained on the standardized training features.
lr = LinearRegression()
# Left as the last expression on purpose: the value returned by fit() is what
# the cell displays.
lr.fit(X_train_scaled, y_train)

LinearRegression()

In [12]:
# Predict on the standardized test features (same scaler as used for training).
y_pred_lr = lr.predict(X_test_scaled)

In [13]:
# Score the linear baseline on the held-out test split.
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)
r2 = r2_score(y_true=y_test, y_pred=y_pred_lr)

In [14]:
# Store the metrics in the fixed order [RMSE, MAE, R2]; the comparison table
# built later labels its columns in that same order.
results["Linear Regression"] = [rmse, mae, r2]

In [15]:
# Report the test-set metrics of the linear baseline to three decimals.
# The dash and the superscript two are written as real Unicode characters;
# they had been stored as mis-decoded byte sequences.
print(f"Linear Regression — RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")

Linear Regression — RMSE: 0.856, MAE: 0.439, R²: 0.306


## **Ridge Regression**

In [17]:
# Ridge baseline using the project's RidgeRegression (imported from
# model_collection) with alpha=1.0, trained on the standardized features.
ridge = RidgeRegression(alpha=1.0)  
# Left as the last expression on purpose: the value returned by fit() is what
# the cell displays.
ridge.fit(X_train_scaled, y_train)


RidgeRegression(alpha=1.0)

In [18]:
# Predict on the standardized test features (same scaler as used for training).
y_pred_ridge = ridge.predict(X_test_scaled)

In [19]:
# Score the ridge model on the held-out test split.
rmse_ridge = root_mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)
mae_ridge = mean_absolute_error(y_true=y_test, y_pred=y_pred_ridge)
r2_ridge = r2_score(y_true=y_test, y_pred=y_pred_ridge)

In [20]:
# Store the metrics in the fixed order [RMSE, MAE, R2]; the comparison table
# built later labels its columns in that same order.
results["Ridge Regression"] = [rmse_ridge, mae_ridge, r2_ridge]

In [21]:
# Report the test-set metrics of the ridge model to three decimals.
# The dash and the superscript two are written as real Unicode characters;
# they had been stored as mis-decoded byte sequences.
print(f"Ridge Regression — RMSE: {rmse_ridge:.3f}, MAE: {mae_ridge:.3f}, R²: {r2_ridge:.3f}")

Ridge Regression — RMSE: 0.856, MAE: 0.439, R²: 0.306


## XGBoost Regression

In [23]:
class MeanSquaredError:
    """Squared-error objective: loss value plus first and second derivatives.

    ``gradient`` and ``hessian`` are the per-sample derivatives of
    ``(y_pred - y_true) ** 2`` with respect to ``y_pred``; ``loss`` is the
    mean of that squared residual over all samples.
    """

    def loss(self, y_true, y_pred):
        """Return the mean of the squared residuals ``(y_true - y_pred) ** 2``."""
        return np.mean((y_true - y_pred) ** 2)

    def gradient(self, y_true, y_pred):
        """Return the per-sample first derivative, ``2 * (y_pred - y_true)``."""
        return 2 * (y_pred - y_true)

    def hessian(self, y_true, y_pred):
        """Return the per-sample second derivative: a constant 2.0 per target.

        The result has the shape of ``y_true`` and is always floating point.
        Without the explicit dtype, ``np.full_like`` would inherit the dtype
        of ``y_true``, so integer targets would produce an integer Hessian.
        ``y_pred`` is unused because the second derivative of the squared
        error does not depend on the prediction.
        """
        return np.full_like(y_true, 2.0, dtype=float)


In [24]:
# Hand-picked hyperparameters for the untuned (baseline) booster.
params = dict(
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    reg_lambda=1.0,
    gamma=0.0,
    min_child_weight=1.0,
    colsample_bynode=1.0,
    base_score=0.5,
)


In [25]:
# Baseline booster: the project's XGBoostRegressor (from model_collection)
# configured with the hand-picked `params` and a fixed random_seed.
model = XGBoostRegressor(params=params, random_seed=42)

# Train for 30 boosting rounds on the UNSCALED training features, using the
# MeanSquaredError objective defined in this notebook. With verbose=True the
# cell output lists a "train loss" line per round.
model.fit(X_train, y_train, objective=MeanSquaredError(), num_boost_round=30, verbose=True)

# Predict on the held-out test split; `y_pred` is scored in the next cell.
# NOTE(review): the cell output echoes a source line from the tree-traversal
# code (`row[self.split_feature_idx]`), which looks like a warning raised
# inside model_collection during prediction - confirm and fix it there.
y_pred = model.predict(X_test)

  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


[0] train loss = 1.0869986270508472
[1] train loss = 0.9708924941629765
[2] train loss = 0.8764897978237663
[3] train loss = 0.7986736751426822
[4] train loss = 0.7320713650907539
[5] train loss = 0.6773578186816456
[6] train loss = 0.6313764047931072
[7] train loss = 0.5941535498509722
[8] train loss = 0.5538591159689683
[9] train loss = 0.5280059479928686
[10] train loss = 0.5057025194036813
[11] train loss = 0.4855051689198135
[12] train loss = 0.4632457260447055
[13] train loss = 0.44527049547099595
[14] train loss = 0.43394779584599236
[15] train loss = 0.41955151947473074
[16] train loss = 0.4104480310279021
[17] train loss = 0.3977522825541541
[18] train loss = 0.3889146050028861
[19] train loss = 0.3818287594005502
[20] train loss = 0.37379641717814865
[21] train loss = 0.36727058124289297
[22] train loss = 0.36234460721691164
[23] train loss = 0.3582083993597897
[24] train loss = 0.35269867366133983
[25] train loss = 0.3478352990821348
[26] train loss = 0.3431677683935812
[27]

In [26]:
# Score the baseline booster on the held-out test split.
# RMSE uses root_mean_squared_error, the same helper as the linear and ridge
# sections. The earlier form, mean_squared_error(..., squared=False), relies
# on a keyword that scikit-learn has deprecated and later removed.
rmse_xg = root_mean_squared_error(y_test, y_pred)
mae_xg = mean_absolute_error(y_test, y_pred)
r2_xg = r2_score(y_test, y_pred)

print(f"RMSE: {rmse_xg:.3f}, MAE: {mae_xg:.3f}, R²: {r2_xg:.3f}")


RMSE: 0.731, MAE: 0.352, R²: 0.494




## Cross Validation

In [28]:
from itertools import product

# Candidate values per hyperparameter; the search evaluates every combination.
param_grid = dict(
    learning_rate=[0.3, 0.2],
    max_depth=[7, 9],
    reg_lambda=[0.5, 0.1],
    min_child_weight=[1, 5],
)


In [29]:
# Expand the grid into every (learning_rate, max_depth, reg_lambda,
# min_child_weight) tuple. The key order is spelled out because the search
# loop unpacks each tuple positionally in exactly this order.
grid_keys = ('learning_rate', 'max_depth', 'reg_lambda', 'min_child_weight')
grid = list(product(*(param_grid[key] for key in grid_keys)))


In [30]:
# Carve a 20% validation subset out of the training data for hyperparameter
# selection, so the test split is never used to pick a configuration.
inner_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train_split, X_val_split, y_train_split, y_val_split = inner_split


In [31]:
# Exhaustive search over `grid`. Each candidate is trained on the inner
# training split and scored by R² on the validation split. This is a single
# hold-out split, not k-fold cross-validation.
best_r2 = float('-inf')
best_params = None
best_feature_importance = None

# The loop variables are deliberately not named `lr` or `r2`. Those names
# already hold the fitted LinearRegression model and its test R² from the
# earlier section, and rebinding them here would break any re-run of those
# cells (`lr.predict` would be called on a float).
for learning_rate, depth, reg, child_weight in grid:
    params = {
        'learning_rate': learning_rate,
        'max_depth': depth,
        'reg_lambda': reg,
        'min_child_weight': child_weight,
        'colsample_bynode': 1.0,  # Optional
    }

    model = XGBoostRegressor(params, random_seed=42)
    model.fit(X_train_split, y_train_split, objective=MeanSquaredError(), num_boost_round=50)
    y_val_pred = model.predict(X_val_split)

    val_r2 = r2_score(y_val_split, y_val_pred)
    print(f"lr={learning_rate}, depth={depth}, lambda={reg}, child={child_weight} => R² = {val_r2:.4f}")

    # Strict ">" keeps the first configuration encountered when scores tie.
    if val_r2 > best_r2:
        best_r2 = val_r2
        best_params = params.copy()
        # Importances come from this candidate, i.e. a model fitted on the
        # inner training split only.
        best_feature_importance = model.feature_importance_.copy()


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.3, depth=7, lambda=0.5, child=1 => R² = 0.7205


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.3, depth=7, lambda=0.5, child=5 => R² = 0.7091


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.3, depth=7, lambda=0.1, child=1 => R² = 0.7054


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.3, depth=7, lambda=0.1, child=5 => R² = 0.7028


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.3, depth=9, lambda=0.5, child=1 => R² = 0.7234


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.3, depth=9, lambda=0.5, child=5 => R² = 0.7730


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.3, depth=9, lambda=0.1, child=1 => R² = 0.7237


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.3, depth=9, lambda=0.1, child=5 => R² = 0.7418


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.2, depth=7, lambda=0.5, child=1 => R² = 0.7168


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.2, depth=7, lambda=0.5, child=5 => R² = 0.7130


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.2, depth=7, lambda=0.1, child=1 => R² = 0.6922


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.2, depth=7, lambda=0.1, child=5 => R² = 0.7209


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.2, depth=9, lambda=0.5, child=1 => R² = 0.7342


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.2, depth=9, lambda=0.5, child=5 => R² = 0.7353


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.2, depth=9, lambda=0.1, child=1 => R² = 0.7327


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.2, depth=9, lambda=0.1, child=5 => R² = 0.7314


In [32]:
# Refit the winning configuration on the full training set (inner training
# plus validation rows), with the same 50 boosting rounds used in the search.
final_model = XGBoostRegressor(params=best_params, random_seed=42)
final_model.fit(X_train, y_train, objective=MeanSquaredError(), num_boost_round=50)

# Predict on the test split, which played no part in hyperparameter selection.
y_pred = final_model.predict(X_test)

# Evaluate with the metric helpers imported at the top of the notebook.
# RMSE uses root_mean_squared_error, the same helper as the linear and ridge
# sections. The earlier form, mean_squared_error(..., squared=False), relies
# on a keyword that scikit-learn has deprecated and later removed.
rmse_xgb = root_mean_squared_error(y_test, y_pred)
mae_xgb = mean_absolute_error(y_test, y_pred)
r2_xgb = r2_score(y_test, y_pred)

print("\nFinal Test Results:")
print(f"Best Params: {best_params}")
print(f"RMSE: {rmse_xgb:.3f}, MAE: {mae_xgb:.3f}, R²: {r2_xgb:.3f}")


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right



🎯 Final Test Results:
Best Params: {'learning_rate': 0.3, 'max_depth': 9, 'reg_lambda': 0.5, 'min_child_weight': 5, 'colsample_bynode': 1.0}
RMSE: 0.517, MAE: 0.228, R²: 0.747




In [33]:
# Record the tuned booster as [RMSE, MAE, R2]. The label carries no trailing
# colon so it matches the other row names in the comparison table.
results["XGBoost (Tuned)"] = [rmse_xgb, mae_xgb, r2_xgb]

In [34]:
# Build the comparison table. `results` maps model name -> [RMSE, MAE, R²],
# so the frame is created with metrics as rows and then transposed to give
# one row per model. The third label is a real superscript-two character; it
# had been stored as a mis-decoded byte sequence.
df_results = pd.DataFrame(results, index=["RMSE", "MAE", "R²"]).T.round(3)

print("\nModel Comparison:")
print(df_results)


Model Comparison:
                    RMSE    MAE     R²
Linear Regression  0.856  0.439  0.306
Ridge Regression   0.856  0.439  0.306
XGBoost (Tuned):   0.517  0.228  0.747


In [35]:
# Column labels of the training frame, used to translate the positional
# feature indices that key `best_feature_importance`.
feature_names = X_train.columns  # Adjust if needed

# Re-key the importance scores by column name instead of column position.
importance_named = {}
for feature_idx, gain in best_feature_importance.items():
    importance_named[feature_names[feature_idx]] = gain

# Rank the features from the largest score to the smallest.
sorted_importance = sorted(
    importance_named.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Numbered listing, one feature per line, scores shown to four decimals.
print("\nTop Features by Importance (Total Gain):")
for rank, (feature, score) in enumerate(sorted_importance, start=1):
    print(f"{rank}. {feature:<25} : {score:.4f}")



📊 Top Features by Importance (Total Gain):
1. Area                      : 108378.9497
2. Bathroom                  : 27620.6181
3. Bedroom                   : 25223.7733
4. Floors                    : 18324.5101
5. Road_Width                : 17253.8549
6. Distance_mainroad_km      : 16942.1747
7. Property_Age              : 13748.2888
8. Amenities_Count           : 11153.3681
9. City_Pokhara              : 3192.9232
10. City_Lalitpur             : 1393.5485
11. Road_Type_Paved           : 975.5454
12. City_Kathmandu            : 546.3616
13. Road_Type_Soil Stabilized : 536.6800
14. Neighborhood_freq         : 467.3946
15. Furnishing_Unfurnished    : 375.3582
16. Furnishing_Fully furnished : 284.5736
17. Road_Type_Gravelled       : 225.1904
18. Furnishing_Semi-furnished : 217.7249
19. City_Bhaktapur            : 64.9131
20. City_Chitwan              : 47.9111


# **Saving Model**

In [75]:
import pickle

# Bundle everything into a dictionary
model_package = {
    "model": final_model,
    "best_params": best_params,
    # NOTE(review): these importances were captured during the grid search,
    # from the candidate fitted on the inner training split. They do not come
    # from `final_model`. Confirm this is what consumers of the bundle expect.
    "feature_importance": best_feature_importance
}

# Serialize in memory BEFORE touching the file. If pickling raises, nothing is
# written, so an existing house_price.pkl is not replaced by a truncated file.
# The recorded run of this cell fails with "Can't pickle local object
# 'XGBoostRegressor.__init__.<locals>.<lambda>'": the model holds a lambda
# created inside __init__, which pickle cannot serialize. That has to be
# fixed in model_collection (e.g. by using a module-level function instead)
# before this cell can succeed.
payload = pickle.dumps(model_package)

# Save to file with your desired name
with open("house_price.pkl", "wb") as f:
    f.write(payload)

print("Model, parameters, and feature importance saved as house_price.pkl.")


AttributeError: Can't pickle local object 'XGBoostRegressor.__init__.<locals>.<lambda>'