## House Price Prediction

### Import Libraries

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score


### Load the data

In [3]:
# load the dataset
df = pd.read_csv(r'https://raw.githubusercontent.com/Venkatalakshmikottapalli/House-Price-Prediction/refs/heads/main/data/processed/house_price_prediction.csv')

# print the head of the data
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city_encoded,zip_encoded
0,-0.766375,-0.40473,-0.829364,-0.850818,-0.186052,0.008976,0,0,3,-0.5337,-0.672124,-0.538426,1.219118,36,62
1,-0.63485,-0.40473,-0.139546,-0.132879,-0.074128,-0.922823,0,0,4,0.228405,-0.672124,-0.165236,-0.827385,18,26
2,-0.281094,-0.40473,0.205363,-0.0477,-0.182779,-0.922823,0,0,4,-0.972878,1.677763,-0.267015,-0.827385,3,7
3,0.3085,0.723545,0.550272,-0.12071,-0.114265,-0.922823,0,0,4,-0.79204,1.207785,0.174028,1.205849,31,31
4,0.03638,-1.533005,-1.519181,-1.410566,-0.228546,-0.922823,0,0,3,-1.127883,-0.672124,-1.115175,1.20789,35,54


In [25]:
df.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'city_encoded', 'zip_encoded'],
      dtype='object')

### Split the dataset

In [4]:
# Define features and target
X = df.drop(columns=['price'])  
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Building

### Random Forest

In [19]:
# Initialize the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Predict on the test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest MSE: {mse_rf}")
print(f"Random Forest RMSE: {rmse_rf}")
print(f"Random Forest R²: {r2_rf}")


Random Forest MSE: 0.34107006281997493
Random Forest RMSE: 0.5840120399614848
Random Forest R²: 0.6653417940966102


### Gradient Boosting

In [20]:
# Initialize the model
gb_model = GradientBoostingRegressor(random_state=42)

# Fit the model on the training data
gb_model.fit(X_train, y_train)

# Predict on the test data
y_pred_gb = gb_model.predict(X_test)

# Evaluate the model
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting MSE: {mse_gb}")
print(f"Gradient Boosting RMSE: {rmse_gb}")
print(f"Gradient Boosting R²: {r2_gb}")


Gradient Boosting MSE: 0.3757460859154083
Gradient Boosting RMSE: 0.6129813095971266
Gradient Boosting R²: 0.6313176537747214


### XGB

In [21]:
# Initialize the model
xgb_model = xgb.XGBRegressor(random_state=42)

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# Predict on the test data
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost MSE: {mse_xgb}")
print(f"XGBoost RMSE: {rmse_xgb}")
print(f"XGBoost R²: {r2_xgb}")


XGBoost MSE: 0.3288292492704078
XGBoost RMSE: 0.5734363515425298
XGBoost R²: 0.6773524896921901


### Hyperparameter Tuning

### Final Model

In [6]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Initialize the model with the best hyperparameters found from GridSearchCV
best_model = xgb.XGBRegressor(
    learning_rate=0.1,          # Best learning rate
    max_depth=7,                # Best max_depth
    n_estimators=100,           # Best number of estimators
    subsample=0.8,              # Best subsample ratio
    colsample_bytree=0.8,       # 
    random_state=42
)

# Fit the final model on the entire training data
best_model.fit(X_train, y_train)

# Make predictions with the best model
y_pred_xgb_tuned = best_model.predict(X_test)

# Round predictions to the nearest integer
y_pred_xgb_tuned = np.round(y_pred_xgb_tuned)

# Evaluate the tuned model
mse_xgb_tuned = mean_squared_error(y_test, y_pred_xgb_tuned)
rmse_xgb_tuned = np.sqrt(mse_xgb_tuned)
r2_xgb_tuned = r2_score(y_test, y_pred_xgb_tuned)

# Print evaluation metrics
print(f"Tuned XGBoost MSE: {mse_xgb_tuned}")
print(f"Tuned XGBoost RMSE: {rmse_xgb_tuned}")
print(f"Tuned XGBoost R²: {r2_xgb_tuned}")

Tuned XGBoost MSE: 0.30418850858127555
Tuned XGBoost RMSE: 0.5515328717141668
Tuned XGBoost R²: 0.7015300032592726


In [8]:
# Example new data (replace with your actual new data)
import pandas as pd
new_data = pd.DataFrame({
    'bedrooms': [3],
    'bathrooms': [2],
    'sqft_living': [1800],
    'sqft_lot': [5000],
    'floors': [2],
    'waterfront': [0],
    'view': [0],
    'condition': [3],
    'sqft_above': [1600],
    'sqft_basement': [200],
    'yr_built': [1995],
    'yr_renovated': [0],
    'zip_encoded': [98052]  
})

# Ensure new data only includes the relevant columns
input_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                  'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
                  'yr_built', 'yr_renovated', 'zip_encoded']

new_data = new_data[input_features]

# Make predictions on the new data
new_predictions = best_model.predict(new_data)

# Round predictions to the nearest integer
new_predictions = np.round(new_predictions).astype(int)

# Print the prediction
print(f"Predicted price for new data: {new_predictions[0]}")

Predicted price: 0.8098351955413818
Actual predicted price: [809.8352]
