In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error,mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# **Model Building**

In [8]:
from model_collection import LinearRegression, RidgeRegression, XGBoostRegressor

In [9]:
# Load the housing dataset from CSV into the working DataFrame.
data = pd.read_csv("house_data.csv")

In [10]:
# # data is your training DataFrame used to build the final dataset
# print(data['Area'].dtype)
# print(data['Area'].isnull().sum(), "nulls")
# print("unique count:", data['Area'].nunique())
# print(data['Area'].describe())
# # check top repeated values (synthetic data might produce many identical area)
# print(data['Area'].value_counts().head(20))

In [11]:
# data.corr()['Price'].sort_values(ascending=False)


In [12]:
# import pandas as pd

# # Suppose your dataset is in a DataFrame called `data`

# # Show min and max for each numeric column
# print(data.describe().T[["min", "max"]])


In [13]:
# Feature engineering: add a log1p-compressed area column and an
# area-by-road-width interaction column to the working frame.
data["Log_Area"] = np.log1p(data["Area"])
data["Area_x_RoadWidth"] = data["Area"].mul(data["Road_Width"])

In [14]:
# Separate the regression target ("Price") from the predictor columns.
y = data["Price"]
X = data.drop("Price", axis=1)

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Standardize the features with statistics learned from the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Log-transform the target (log1p) for the linear models.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

In [17]:
# Collects one [RMSE, MAE, R²] list per model name for the final comparison table.
results = dict()

## **Linear Regression**

In [19]:
# Fit the project-local LinearRegression (imported from model_collection) on the
# standardized features against the log1p-transformed price.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train_log)

LinearRegression()

In [20]:
# The model was fit on y_train_log, so its predictions are in log1p space.
y_pred_lr_log = lr.predict(X_test_scaled)
y_pred_lr = np.expm1(y_pred_lr_log)   # invert log1p to get prices on the original scale

In [21]:
# Evaluate on the original price scale (predictions were back-transformed with expm1).
# root_mean_squared_error replaces mean_squared_error(..., squared=False): the
# `squared` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)



In [22]:
# Store the metrics in the order [RMSE, MAE, R²], matching the row labels used
# when the comparison table is built.
results["Linear Regression"] = [rmse_lr, mae_lr, r2_lr]

In [23]:
# Report the errors rounded to whole numbers and R² to three decimals.
print(f"Linear Regression — RMSE: {rmse_lr:.0f}, MAE: {mae_lr:.0f}, R²: {r2_lr:.3f}")

Linear Regression — RMSE: 50092831, MAE: 25320447, R²: 0.867


## **Ridge Regression**

In [25]:
# Fit the project-local RidgeRegression (imported from model_collection) with
# alpha=10.0 on the same standardized features and log1p-transformed target
# used for the linear model.
ridge = RidgeRegression(alpha=10.0)
ridge.fit(X_train_scaled, y_train_log)


RidgeRegression(alpha=10.0)

In [26]:
# Predictions are in log1p space (the model was fit on y_train_log);
# expm1 maps them back to the original price scale.
y_pred_ridge_log = ridge.predict(X_test_scaled)
y_pred_ridge = np.expm1(y_pred_ridge_log)

In [27]:
# Evaluate on the original price scale (predictions were back-transformed with expm1).
# root_mean_squared_error replaces mean_squared_error(..., squared=False): the
# `squared` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_ridge = root_mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)



In [28]:
# Store the metrics in the order [RMSE, MAE, R²], matching the row labels used
# when the comparison table is built.
results["Ridge Regression"] = [rmse_ridge, mae_ridge, r2_ridge]

In [29]:
# Report the errors rounded to whole numbers and R² to three decimals.
print(f"Ridge Regression — RMSE: {rmse_ridge:.0f}, MAE: {mae_ridge:.0f}, R²: {r2_ridge:.3f}")

Ridge Regression — RMSE: 50147459, MAE: 25247862, R²: 0.866


## XGBoost Regression

In [31]:
class MeanSquaredError:
    """Squared-error objective exposing a loss, a gradient and a hessian.

    The derivatives are element-wise with respect to ``y_pred`` for the
    per-sample term ``(y_true - y_pred) ** 2``.
    """

    def loss(self, y_true, y_pred):
        """Return the mean of the squared residuals."""
        residual = y_true - y_pred
        return np.mean(residual ** 2)

    def gradient(self, y_true, y_pred):
        """Return the first derivative, ``2 * (y_pred - y_true)``."""
        error = y_pred - y_true
        return error * 2

    def hessian(self, y_true, y_pred):
        """Return the constant second derivative (2), shaped and typed like ``y_true``."""
        return np.full_like(y_true, fill_value=2.0)

In [32]:
# Hyperparameters for the baseline (untuned) XGBoostRegressor run.
# NOTE(review): the first logged training loss is ~2.5e16, so if base_score is
# the starting prediction it sits far below the price scale — confirm against
# model_collection and consider starting near the mean price.
params = dict(
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    reg_lambda=1.0,
    gamma=0.0,
    min_child_weight=1.0,
    colsample_bynode=1.0,
    base_score=0.5,
)


In [33]:
# Baseline run of the project-local XGBoostRegressor (imported from model_collection).
# Unlike the linear models, it is trained on the unscaled features and the raw
# (not log-transformed) price.
model = XGBoostRegressor(params=params, random_seed=42)

# Fit for 20 boosting rounds with the squared-error objective defined in this
# notebook; verbose=True is what produces the per-round "train loss" log.
model.fit(X_train, y_train, objective=MeanSquaredError(), num_boost_round=20, verbose=True)

# Predict prices for the held-out test set (already on the original scale).
y_pred = model.predict(X_test)

  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


[0] train loss = 2.4803195900889464e+16
[1] train loss = 2.2444424387437664e+16
[2] train loss = 2.0317812241430944e+16
[3] train loss = 1.84006747497944e+16
[4] train loss = 1.6663752633800996e+16
[5] train loss = 1.5090256878582052e+16
[6] train loss = 1.3672558085131888e+16
[7] train loss = 1.2394154326179258e+16
[8] train loss = 1.123659255777702e+16
[9] train loss = 1.0187985544439922e+16
[10] train loss = 9241792511273654.0
[11] train loss = 8384485150210890.0
[12] train loss = 7608675456068638.0
[13] train loss = 6909909149811160.0
[14] train loss = 6275994275587313.0
[15] train loss = 5702091237225196.0
[16] train loss = 5181685222082487.0
[17] train loss = 4713575782939263.0
[18] train loss = 4288094853346708.0
[19] train loss = 3904539842641983.5


In [34]:
# Score the baseline XGBoost predictions on the held-out test set.
# root_mean_squared_error replaces mean_squared_error(..., squared=False): the
# `squared` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_xg = root_mean_squared_error(y_test, y_pred)
mae_xg = mean_absolute_error(y_test, y_pred)
r2_xg = r2_score(y_test, y_pred)

print(f"RMSE: {rmse_xg:.3f}, MAE: {mae_xg:.3f}, R²: {r2_xg:.3f}")


RMSE: 65174496.069, MAE: 41103943.079, R²: 0.774




## Hyperparameter Search (Hold-out Validation)

In [36]:
from itertools import product

# Candidate values for each hyperparameter explored by the manual grid search.
param_grid = dict(
    learning_rate=[0.05, 0.1],
    max_depth=[4, 5],
    reg_lambda=[1.0, 1.2],
    min_child_weight=[2, 3],
)

# param_grid = {
#     'learning_rate': [0.1, 0.2],  
#     'max_depth': [6, 7],            
#     'reg_lambda': [1.5, 2],       
#     'min_child_weight': [3, 4],     
# }


In [37]:
# Expand the search space into explicit
# (learning_rate, max_depth, reg_lambda, min_child_weight) tuples; the key order
# below fixes the tuple layout that the search loop unpacks.
grid = list(product(*(
    param_grid[name]
    for name in ('learning_rate', 'max_depth', 'reg_lambda', 'min_child_weight')
)))


In [38]:
# Carve a validation set (20%) out of the training data so hyperparameters are
# selected without touching the test set; the fixed seed keeps it reproducible.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


In [39]:
# Track the best configuration seen so far, judged by R² on the validation split.
best_r2 = float('-inf')
best_params = None
best_feature_importance = None

# The loop deliberately avoids the names `lr`, `params` and `model`: reusing them
# would overwrite the fitted LinearRegression (`lr`) and the baseline XGBoost
# `params`/`model` from earlier cells, so re-running those cells would break.
for learning_rate, depth, reg, child_weight in grid:
    # Only the four searched values vary; the rest are held fixed.
    # Note subsample is 1.0 here, whereas the baseline run used 0.8.
    candidate_params = {
        'learning_rate': learning_rate,
        'max_depth': depth,
        'reg_lambda': reg,
        'min_child_weight': child_weight,
        'colsample_bynode': 1.0,
        'gamma': 0.0,
        'subsample': 1.0,
        'base_score': 0.5
    }

    candidate_model = XGBoostRegressor(candidate_params, random_seed=42)
    candidate_model.fit(X_train_split, y_train_split, objective=MeanSquaredError(), num_boost_round=20)
    y_val_pred = candidate_model.predict(X_val_split)

    r2 = r2_score(y_val_split, y_val_pred)
    print(f"lr={learning_rate}, depth={depth}, lambda={reg}, child={child_weight} => R² = {r2:.4f}")

    # Strict '>' keeps the earliest configuration when scores tie.
    if r2 > best_r2:
        best_r2 = r2
        best_params = candidate_params.copy()
        # Importances of the model fitted on the reduced training split.
        best_feature_importance = candidate_model.feature_importance_.copy()


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.0, child=2 => R² = 0.7689


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.0, child=3 => R² = 0.7690


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.2, child=2 => R² = 0.7683


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=4, lambda=1.2, child=3 => R² = 0.7684


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=5, lambda=1.0, child=2 => R² = 0.7693


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=5, lambda=1.0, child=3 => R² = 0.7691


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=5, lambda=1.2, child=2 => R² = 0.7686


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.05, depth=5, lambda=1.2, child=3 => R² = 0.7683


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.1, depth=4, lambda=1.0, child=2 => R² = 0.9598


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.1, depth=4, lambda=1.0, child=3 => R² = 0.9599


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.1, depth=4, lambda=1.2, child=2 => R² = 0.9598


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.1, depth=4, lambda=1.2, child=3 => R² = 0.9597


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.1, depth=5, lambda=1.0, child=2 => R² = 0.9613


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.1, depth=5, lambda=1.0, child=3 => R² = 0.9612


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.1, depth=5, lambda=1.2, child=2 => R² = 0.9611


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right


lr=0.1, depth=5, lambda=1.2, child=3 => R² = 0.9613


In [40]:
# Refit the best validation configuration on the full training set.
final_model = XGBoostRegressor(params=best_params, random_seed=42)
final_model.fit(X_train, y_train, objective=MeanSquaredError(), num_boost_round=20)

# Predict on the held-out test set.
y_pred = final_model.predict(X_test)

# Evaluate. The metric functions are already imported at the top of the notebook,
# so no re-import is needed here. root_mean_squared_error replaces
# mean_squared_error(..., squared=False): the `squared` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_xgb = root_mean_squared_error(y_test, y_pred)
mae_xgb = mean_absolute_error(y_test, y_pred)
r2_xgb = r2_score(y_test, y_pred)

print("\nFinal Test Results:")
print(f"Best Params: {best_params}")
print(f"XGBoost Regressor - RMSE: {rmse_xgb:.3f}, MAE: {mae_xgb:.3f}, R²: {r2_xgb:.3f}")


  child = self.left if row[self.split_feature_idx] <= self.threshold else self.right



Final Test Results:
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'reg_lambda': 1.2, 'min_child_weight': 3, 'colsample_bynode': 1.0, 'gamma': 0.0, 'subsample': 1.0, 'base_score': 0.5}
XGBoost Regressor - RMSE: 25178048.738, MAE: 17253162.830, R²: 0.966




In [41]:
# Store the tuned XGBoost metrics as [RMSE, MAE, R²]. The key has no trailing
# colon so its row label is formatted like the other models' entries.
results["XGBoost (Tuned)"] = [rmse_xgb, mae_xgb, r2_xgb]

In [42]:
# Build the comparison table: the dict gives one column per model, so transpose
# to get one row per model with RMSE / MAE / R² columns, rounded to 3 decimals.
df_results = (
    pd.DataFrame(results, index=["RMSE", "MAE", "R²"])
    .T
    .round(3)
)

print("\nModel Comparison:", df_results, sep="\n")


Model Comparison:
                           RMSE           MAE     R²
Linear Regression  5.009283e+07  2.532045e+07  0.867
Ridge Regression   5.014746e+07  2.524786e+07  0.866
XGBoost (Tuned):   2.517805e+07  1.725316e+07  0.966


In [43]:
# Column labels of the training frame; the importance dict is keyed by column position.
feature_names = X_train.columns  # Adjust if needed

# Translate positional keys into column names.
importance_named = {}
for position, gain in best_feature_importance.items():
    importance_named[feature_names[position]] = gain

# Rank features from highest to lowest importance.
sorted_importance = sorted(
    importance_named.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the ranking, numbered from 1.
print("\nTop Features by Importance (Total Gain):")
for rank, (name, gain) in enumerate(sorted_importance, start=1):
    print(f"{rank}. {name:<25} : {gain:.4f}")



Top Features by Importance (Total Gain):
1. Area                      : 5031605421459315359744.0000
2. Bathroom                  : 418459816166330335232.0000
3. Bedroom                   : 203675922946572419072.0000
4. City_Kathmandu            : 22188303994556739584.0000
5. Amenities_Count           : 13531976502862102528.0000
6. City_Chitwan              : 12350879330081271808.0000
7. City_Pokhara              : 1943943761839124992.0000
8. City_Lalitpur             : 1803584047500391936.0000
9. Property_Age              : 1678998986161899264.0000
10. Neighborhood_encoded      : 1069275006463179904.0000
11. Road_Width                : 954748393351346432.0000
12. Floors                    : 352856079705731008.0000
13. Area_x_RoadWidth          : 184188790098869696.0000
14. Road_Type_Soil Stabilized : 36154507302892048.0000
15. City_Bhaktapur            : 27182091177963008.0000
16. Road_Type_Gravelled       : 7754771057348688.0000
17. Road_Type_Paved           : 2367309744477624.0000


In [44]:
# Display the ordered feature columns; the notebook maps the integer keys of the
# feature-importance dicts to names by position in this index.
X_train.columns


Index(['Bedroom', 'Bathroom', 'Floors', 'Area', 'Road_Width',
       'Amenities_Count', 'Property_Age', 'City_Bhaktapur', 'City_Chitwan',
       'City_Dharan', 'City_Jhapa', 'City_Kathmandu', 'City_Lalitpur',
       'City_Nawalparasi', 'City_Pokhara', 'Road_Type_Gravelled',
       'Road_Type_Paved', 'Road_Type_Soil Stabilized', 'Neighborhood_encoded',
       'Log_Area', 'Area_x_RoadWidth'],
      dtype='object')

In [45]:
# Importances of the final model, keyed by feature index.
importance = final_model.feature_importance_

# Denominator for converting each score into a share of the total.
total = sum(importance.values())

# Rank from largest to smallest, then print each feature's share as a percentage.
ranked = sorted(importance.items(), key=lambda entry: entry[1], reverse=True)
for idx, score in ranked:
    share = score / total * 100
    print(f"Feature {idx}: {share:.2f}%")


Feature 3: 88.00%
Feature 1: 7.23%
Feature 0: 3.66%
Feature 11: 0.43%
Feature 5: 0.27%
Feature 8: 0.26%
Feature 12: 0.05%
Feature 14: 0.03%
Feature 4: 0.02%
Feature 6: 0.02%
Feature 18: 0.02%
Feature 2: 0.00%
Feature 7: 0.00%
Feature 20: 0.00%
Feature 15: 0.00%


# **Saving Model**

In [47]:
import pickle

# Bundle the fitted model with the metadata needed to use it later.
model_package = {
    "model": final_model,
    "best_params": best_params,
    # Save the importances of the model being saved. best_feature_importance
    # came from the grid-search model fitted on the reduced training split, so it
    # does not describe final_model.
    "feature_importance": dict(final_model.feature_importance_),
    "feature_names": list(X_train.columns)
}

# Serialize the bundle to disk.
# NOTE: pickle files execute code on load — only unpickle this file from a trusted source.
with open("house_price.pkl", "wb") as f:
    pickle.dump(model_package, f)

print("Model, parameters, and feature importance saved as house_price.pkl.")


Model, parameters, and feature importance saved as house_price.pkl.
