In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error,mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# **Model Building**

In [4]:
from model_collection import LinearRegression, RidgeRegression, XGBoostRegressor

In [5]:
data=pd.read_csv("house_data.csv")

In [95]:
import pandas as pd

# Suppose your dataset is in a DataFrame called `data`

# Show min and max for each numeric column
print(data.describe().T[["min", "max"]])


                                   min           max
Price                      15000.00000  2.350000e+08
Bedroom                        0.00000  9.600000e+01
Bathroom                       0.00000  4.000000e+01
Floors                         1.00000  1.300000e+01
Area                           2.00000  2.000000e+02
Road_Width                     0.00000  1.640420e+03
Amenities_Count                0.00000  3.000000e+01
Property_Age                   4.00000  8.800000e+01
City_Bhaktapur                 0.00000  1.000000e+00
City_Chitwan                   0.00000  1.000000e+00
City_Dharan                    0.00000  1.000000e+00
City_Jhapa                     0.00000  1.000000e+00
City_Kathmandu                 0.00000  1.000000e+00
City_Lalitpur                  0.00000  1.000000e+00
City_Nawalparasi               0.00000  1.000000e+00
City_Pokhara                   0.00000  1.000000e+00
Road_Type_Gravelled            0.00000  1.000000e+00
Road_Type_Paved                0.00000  1.0000

In [6]:
X = data.drop(columns=["Price"])
y = data["Price"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [9]:
results = {}

## **Linear Regression**

In [11]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train_log)

LinearRegression()

In [12]:
y_pred_lr = lr.predict(X_test_scaled)

In [13]:
rmse = root_mean_squared_error(y_test_log, y_pred_lr)
mae = mean_absolute_error(y_test_log, y_pred_lr)
r2 = r2_score(y_test_log, y_pred_lr)

In [14]:
results["Linear Regression"] = [rmse, mae, r2]

In [15]:
print(f"Linear Regression — RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")

Linear Regression — RMSE: 1.273, MAE: 0.748, R²: 0.421


## **Ridge Regression**

In [17]:
ridge = RidgeRegression(alpha=1.0)  
ridge.fit(X_train_scaled, y_train_log)


RidgeRegression(alpha=1.0)

In [18]:
y_pred_ridge = ridge.predict(X_test_scaled)

In [19]:
rmse_ridge = root_mean_squared_error(y_test_log, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test_log, y_pred_ridge)
r2_ridge = r2_score(y_test_log, y_pred_ridge)

In [20]:
results["Ridge Regression"] = [rmse_ridge, mae_ridge, r2_ridge]

In [21]:
print(f"Ridge Regression — RMSE: {rmse_ridge:.3f}, MAE: {mae_ridge:.3f}, R²: {r2_ridge:.3f}")

Ridge Regression — RMSE: 1.272, MAE: 0.751, R²: 0.422


## XGBoost Regression

In [23]:
class MeanSquaredError:
    def loss(self, y_true, y_pred):
        return np.mean((y_true - y_pred) ** 2)

    def gradient(self, y_true, y_pred):
        return 2 * (y_pred - y_true)

    def hessian(self, y_true, y_pred):
        return np.full_like(y_true, 2.0)


In [24]:
params = {
    "learning_rate": 0.1,
    "max_depth": 5,
    "subsample": 0.8,
    "reg_lambda": 1.0,
    "gamma": 0.0,
    "min_child_weight": 1.0,
    "colsample_bynode": 1.0,
    "base_score": 0.5
}


In [25]:
# Initialize model
model = XGBoostRegressor(params=params, random_seed=42)

# Fit the model
model.fit(X_train, y_train, objective=MeanSquaredError(), num_boost_round=30, verbose=True)

# Predict
y_pred = model.predict(X_test)

  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


[0] train loss = 1849418601045118.2
[1] train loss = 1541324430690183.8
[2] train loss = 1293849179045754.0
[3] train loss = 1090870865458471.9
[4] train loss = 928041661873180.4
[5] train loss = 795055826344439.1
[6] train loss = 684839239336238.0
[7] train loss = 593887986867274.0
[8] train loss = 519589397929973.4
[9] train loss = 459952682652591.94
[10] train loss = 409923397689845.2
[11] train loss = 368137671009084.94
[12] train loss = 334961595709049.2
[13] train loss = 308052102690052.3
[14] train loss = 284829259885125.56
[15] train loss = 263133573931749.53
[16] train loss = 247341143549512.94
[17] train loss = 233439817195526.22
[18] train loss = 222551745424720.94
[19] train loss = 212145528524863.22
[20] train loss = 204932270163526.4
[21] train loss = 198220346491703.44
[22] train loss = 192430961847823.47
[23] train loss = 187226745150254.97
[24] train loss = 182735087027094.44
[25] train loss = 178309803539712.2
[26] train loss = 174491648560483.16
[27] train loss = 171

In [26]:
rmse_xg = mean_squared_error(y_test, y_pred, squared=False)
mae_xg = mean_absolute_error(y_test, y_pred)
r2_xg = r2_score(y_test, y_pred)

print(f"RMSE: {rmse_xg:.3f}, MAE: {mae_xg:.3f}, R²: {r2_xg:.3f}")


RMSE: 12997664.760, MAE: 6340463.189, R²: 0.652




## Cross Validation

In [28]:
from itertools import product

param_grid = {
    'learning_rate': [0.05, 0.15],  
    'max_depth': [4, 5],            
    'reg_lambda': [1.0, 1.5],       
    'min_child_weight': [2, 3],     
}

In [29]:
# Create all combinations of parameters
grid = list(product(
    param_grid['learning_rate'],
    param_grid['max_depth'],
    param_grid['reg_lambda'],
    param_grid['min_child_weight']
))


In [30]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


In [31]:
best_r2 = float('-inf')
best_params = None
best_feature_importance = None

for lr, depth, reg, child_weight in grid:
    params = {
        'learning_rate': lr,
        'max_depth': depth,
        'reg_lambda': reg,
        'min_child_weight': child_weight,
        'colsample_bynode': 1.0,
        'gamma': 0.0,
        'subsample': 1.0,
        'base_score': 0.5
    }

    model =  XGBoostRegressor(params, random_seed=42)
    model.fit(X_train_split, y_train_split, objective=MeanSquaredError(), num_boost_round=50)
    y_val_pred = model.predict(X_val_split)

    r2 = r2_score(y_val_split, y_val_pred)
    print(f"lr={lr}, depth={depth}, lambda={reg}, child={child_weight} => R² = {r2:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_params = params.copy()
        best_feature_importance = model.feature_importance_.copy()


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.0, child=2 => R² = 0.4439


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.0, child=3 => R² = 0.4821


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.5, child=2 => R² = 0.4486


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.5, child=3 => R² = 0.4771


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=5, lambda=1.0, child=2 => R² = 0.4172


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=5, lambda=1.0, child=3 => R² = 0.4850


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=5, lambda=1.5, child=2 => R² = 0.4318


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=5, lambda=1.5, child=3 => R² = 0.4862


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.15, depth=4, lambda=1.0, child=2 => R² = 0.4416


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.15, depth=4, lambda=1.0, child=3 => R² = 0.5108


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.15, depth=4, lambda=1.5, child=2 => R² = 0.4672


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.15, depth=4, lambda=1.5, child=3 => R² = 0.5102


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.15, depth=5, lambda=1.0, child=2 => R² = 0.4125


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.15, depth=5, lambda=1.0, child=3 => R² = 0.4946


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.15, depth=5, lambda=1.5, child=2 => R² = 0.4397


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.15, depth=5, lambda=1.5, child=3 => R² = 0.5046


In [32]:
# Train best model on full training set
final_model = XGBoostRegressor(params=best_params, random_seed=42)
final_model.fit(X_train, y_train, objective=MeanSquaredError(), num_boost_round=50)

# Predict on test set
y_pred = final_model.predict(X_test)

# Evaluate
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse_xgb = mean_squared_error(y_test, y_pred, squared=False)
mae_xgb = mean_absolute_error(y_test, y_pred)
r2_xgb = r2_score(y_test, y_pred)

print("\nFinal Test Results:")
print(f"Best Params: {best_params}")
print(f"RMSE: {rmse_xgb:.3f}, MAE: {mae_xgb:.3f}, R²: {r2_xgb:.3f}")


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right



Final Test Results:
Best Params: {'learning_rate': 0.15, 'max_depth': 4, 'reg_lambda': 1.0, 'min_child_weight': 3, 'colsample_bynode': 1.0, 'gamma': 0.0, 'subsample': 1.0, 'base_score': 0.5}
RMSE: 13018590.674, MAE: 6196917.828, R²: 0.651




In [33]:
results["XGBoost (Tuned):"] = [rmse_xgb, mae_xgb, r2_xgb]

In [34]:
df_results = pd.DataFrame(results, index=["RMSE", "MAE", "R²"]).T.round(3)

print("\nModel Comparison:")
print(df_results)


Model Comparison:
                           RMSE          MAE     R²
Linear Regression  1.273000e+00        0.748  0.421
Ridge Regression   1.272000e+00        0.751  0.422
XGBoost (Tuned):   1.301859e+07  6196917.828  0.651


In [35]:
# Ensure feature names are known
feature_names = X_train.columns  # Adjust if needed

# Convert importance dict (index-based) to feature name mapping
importance_named = {
    feature_names[k]: v for k, v in best_feature_importance.items()
}

# Sort by importance (highest first)
sorted_importance = sorted(importance_named.items(), key=lambda item: item[1], reverse=True)

# Print
print("\nTop Features by Importance (Total Gain):")
for i, (feature, score) in enumerate(sorted_importance, 1):
    print(f"{i}. {feature:<25} : {score:.4f}")



Top Features by Importance (Total Gain):
1. Area                      : 57773188509211197440.0000
2. Bathroom                  : 7706257251157779456.0000
3. Bedroom                   : 3538731300522487296.0000
4. Amenities_Count           : 3036293758224513024.0000
5. Road_Width                : 1937648923279048960.0000
6. Property_Age              : 1726229441208729600.0000
7. Floors                    : 1546046703076264704.0000
8. City_Kathmandu            : 238957176760806752.0000
9. City_Pokhara              : 233478952410439584.0000
10. Neighborhood_encoded      : 222816639704925792.0000
11. City_Lalitpur             : 171469687973035296.0000
12. Road_Type_Paved           : 80397551192648832.0000
13. Road_Type_Soil Stabilized : 45065921817683512.0000
14. Road_Type_Gravelled       : 20308879592509440.0000
15. City_Chitwan              : 10674767057460482.0000
16. City_Bhaktapur            : 9317400925287636.0000


In [83]:
X_train.columns


Index(['Bedroom', 'Bathroom', 'Floors', 'Area', 'Road_Width',
       'Amenities_Count', 'Property_Age', 'City_Bhaktapur', 'City_Chitwan',
       'City_Dharan', 'City_Jhapa', 'City_Kathmandu', 'City_Lalitpur',
       'City_Nawalparasi', 'City_Pokhara', 'Road_Type_Gravelled',
       'Road_Type_Paved', 'Road_Type_Soil Stabilized', 'Neighborhood_encoded'],
      dtype='object')

In [81]:
importance = final_model.feature_importance_

# Total sum of importance
total = sum(importance.values())

# Print normalized percentages with feature indices
for idx, score in sorted(importance.items(), key=lambda x: x[1], reverse=True):
    percent = (score / total) * 100
    print(f"Feature {idx}: {percent:.2f}%")


Feature 3: 71.86%
Feature 1: 9.32%
Feature 0: 6.58%
Feature 5: 4.47%
Feature 6: 2.28%
Feature 4: 1.96%
Feature 2: 1.84%
Feature 14: 0.42%
Feature 11: 0.31%
Feature 7: 0.28%
Feature 12: 0.23%
Feature 18: 0.22%
Feature 15: 0.12%
Feature 17: 0.07%
Feature 16: 0.05%
Feature 8: 0.01%


# **Saving Model**

In [38]:
# import pickle

# # Bundle everything into a dictionary
# model_package = {
#     "model": final_model,
#     "best_params": best_params,
#     "feature_importance": best_feature_importance
# }

# # Save to file with your desired name
# with open("house_price.pkl", "wb") as f:
#     pickle.dump(model_package, f)

# print("Model, parameters, and feature importance saved as house_price.pkl.")
