In [None]:
import os

import pandas as pd
from google.colab import files

# Open Colab's upload dialog. The result maps each uploaded filename to its
# contents, and Colab also saves the file into the working directory.
uploaded = files.upload()
if not uploaded:
    # A cancelled dialog yields an empty mapping; fail with a clear message
    # instead of a bare StopIteration from next(iter(...)).
    raise ValueError("No file was uploaded.")

# Name of the first uploaded file
uploaded_filename = next(iter(uploaded))

# Rename the upload to a fixed name so the rest of the notebook does not depend
# on what the file was called locally. os.replace avoids building a shell
# command from a user-controlled filename (spaces, quotes, ...).
new_filename = "houseprice_ml.csv"  # You can change this
os.replace(uploaded_filename, new_filename)

# Read the file
house_price_data = pd.read_csv(new_filename)
print("File loaded successfully!")
print(house_price_data.head())

# Machine Learning Imports (corrected)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression  # ✅ Fixed case
from sklearn.metrics import mean_squared_error, r2_score

Saving houseprice ml.csv to houseprice ml.csv
File loaded successfully!
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [None]:
# Display the loaded DataFrame (the notebook renders a truncated view of its rows).
house_price_data

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
house_price_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [None]:
# Show the dtype of each column.
house_price_data.dtypes

Unnamed: 0,0
price,int64
area,int64
bedrooms,int64
bathrooms,int64
stories,int64
mainroad,object
guestroom,object
basement,object
hotwaterheating,object
airconditioning,object


In [None]:
# Columns that hold 'yes'/'no' strings and should become 1/0 flags
binary_cols = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea'
]

yes_no_to_flag = {'yes': 1, 'no': 0}

for col in binary_cols:
    # Skip columns that are already numeric: mapping the 1/0 values a second
    # time would turn every entry into NaN, so this keeps the cell re-runnable.
    if pd.api.types.is_numeric_dtype(house_price_data[col]):
        continue
    mapped = house_price_data[col].map(yes_no_to_flag)
    # Anything other than 'yes'/'no' maps to NaN; surface it here rather than
    # letting it flow silently into model training.
    if mapped.isna().any():
        raise ValueError(f"Column '{col}' contains values other than 'yes'/'no'")
    house_price_data[col] = mapped

# Check result
print(house_price_data[binary_cols].head())


   mainroad  guestroom  basement  hotwaterheating  airconditioning  prefarea
0         1          0         0                0                1         1
1         1          0         0                0                1         0
2         1          0         1                0                0         1
3         1          0         1                0                1         1
4         1          1         1                0                1         0


In [None]:
# One-hot encode 'furnishingstatus'. The guard makes the cell safe to re-run:
# after the first run the source column no longer exists and get_dummies would
# raise a KeyError.
if 'furnishingstatus' in house_price_data.columns:
    house_price_data = pd.get_dummies(
        house_price_data,
        columns=['furnishingstatus'],
        drop_first=True  # drop one level to avoid the dummy variable trap
    )

# Check new columns
print(house_price_data.head())


      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus_semi-furnished  furnishingstatus_unfurnished  
0                            False                         False  
1                       

In [None]:
# List the column names of the encoded frame.
house_price_data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus_semi-furnished',
       'furnishingstatus_unfurnished'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix (X) from the regression target (y).
X = house_price_data.drop(columns=['price'])  # every column except the target 'price'
y = house_price_data['price']

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each piece of the split.
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


Train shape: (436, 13) (436,)
Test shape: (109, 13) (109,)


In [None]:
# Inspect the training feature matrix.
X_train

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
46,6000,3,2,4,1,0,0,0,1,1,0,False,False
93,7200,3,2,1,1,0,1,0,1,3,0,True,False
335,3816,2,1,1,1,0,1,0,1,2,0,False,False
412,2610,3,1,2,1,0,1,0,0,0,1,False,True
471,3750,3,1,2,1,0,0,0,0,0,0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,6000,4,2,4,1,0,0,0,1,0,0,False,True
106,5450,4,2,1,1,0,1,0,1,0,1,True,False
270,4500,3,2,3,1,0,0,1,0,1,0,False,False
435,4040,2,1,1,1,0,0,0,0,0,0,False,True


In [None]:
# Inspect the test feature matrix.
X_test

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
316,5900,4,2,2,0,0,1,0,0,1,0,False,True
77,6500,3,2,3,1,0,0,0,1,0,1,False,False
360,4040,2,1,1,1,0,0,0,0,0,0,True,False
90,5000,3,1,2,1,0,0,0,1,0,0,True,False
493,3960,3,1,1,1,0,0,0,0,0,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,6000,4,1,2,1,0,1,0,0,2,0,True,False
357,6930,4,1,2,0,0,0,0,0,1,0,False,False
39,6000,4,2,4,1,0,0,0,1,1,0,True,False
54,6000,3,2,2,1,1,0,0,1,1,0,True,False


In [None]:
# Attach the target to the training features. X_train and y_train come from the
# same split, so joining on the row index appends 'price' as the last column.
training_house_price = X_train.join(y_train)

# Preview the combined frame
print(training_house_price.head())


     area  bedrooms  bathrooms  stories  mainroad  guestroom  basement  \
46   6000         3          2        4         1          0         0   
93   7200         3          2        1         1          0         1   
335  3816         2          1        1         1          0         1   
412  2610         3          1        2         1          0         1   
471  3750         3          1        2         1          0         0   

     hotwaterheating  airconditioning  parking  prefarea  \
46                 0                1        1         0   
93                 0                1        3         0   
335                0                1        2         0   
412                0                0        0         1   
471                0                0        0         0   

     furnishingstatus_semi-furnished  furnishingstatus_unfurnished    price  
46                             False                         False  7525000  
93                              True  

In [None]:
# Inspect the combined training frame (features plus 'price').
training_house_price

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price
46,6000,3,2,4,1,0,0,0,1,1,0,False,False,7525000
93,7200,3,2,1,1,0,1,0,1,3,0,True,False,6300000
335,3816,2,1,1,1,0,1,0,1,2,0,False,False,3920000
412,2610,3,1,2,1,0,1,0,0,0,1,False,True,3430000
471,3750,3,1,2,1,0,0,0,0,0,0,False,True,3010000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,6000,4,2,4,1,0,0,0,1,0,0,False,True,6755000
106,5450,4,2,1,1,0,1,0,1,0,1,True,False,6160000
270,4500,3,2,3,1,0,0,1,0,1,0,False,False,4340000
435,4040,2,1,1,1,0,0,0,0,0,0,False,True,3290000


In [None]:
# Lower and upper quartile of the training prices, computed in a single call.
# They serve as the thresholds between the low / medium / high price bands.
price_quartiles = training_house_price['price'].quantile([0.25, 0.75])
Q1 = price_quartiles.loc[0.25]
Q3 = price_quartiles.loc[0.75]

print("Q1:", Q1)
print("Q3:", Q3)


Q1: 3498250.0
Q3: 5600000.0


In [None]:
def categorize_price(p, q1=None, q3=None):
    """Bucket a price into 'low', 'medium' or 'high'.

    Args:
        p: The price to classify.
        q1: Threshold below which a price is 'low'. Defaults to the
            notebook-level ``Q1`` when omitted.
        q3: Threshold below which a price that is not 'low' is 'medium'.
            Defaults to the notebook-level ``Q3`` when omitted.

    Returns:
        'low' if ``p < q1``, 'medium' if ``q1 <= p < q3``, otherwise 'high'.
    """
    # Fall back to the globals only when no explicit threshold is given, so
    # existing calls such as ``series.apply(categorize_price)`` keep working.
    lower = Q1 if q1 is None else q1
    upper = Q3 if q3 is None else q3

    if p < lower:
        return 'low'
    if p < upper:
        return 'medium'
    return 'high'

# Label every training row as 'low' / 'medium' / 'high' based on its own price.
# NOTE(review): 'price_category' is computed from the target 'price'; if it is kept
# as a model input it leaks the target into the features — confirm this is intended.
training_house_price['price_category'] = training_house_price['price'].apply(categorize_price)


In [None]:
 # 3. Apply categorization to y_test
# Bucket each test price with the same thresholds used for the training rows.
# NOTE(review): these labels are computed from y_test, i.e. from the values the
# models are later asked to predict — confirm this is intended.
price_category_test = y_test.map(categorize_price)

# Rebind X_test to a new frame carrying the extra column; the frame produced by
# the train/test split is not modified in place.
X_test = X_test.assign(price_category=price_category_test)

In [None]:
# Inspect the training frame, which now also carries 'price_category'.
training_house_price


Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price,price_category
46,6000,3,2,4,1,0,0,0,1,1,0,False,False,7525000,high
93,7200,3,2,1,1,0,1,0,1,3,0,True,False,6300000,high
335,3816,2,1,1,1,0,1,0,1,2,0,False,False,3920000,medium
412,2610,3,1,2,1,0,1,0,0,0,1,False,True,3430000,low
471,3750,3,1,2,1,0,0,0,0,0,0,False,True,3010000,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,6000,4,2,4,1,0,0,0,1,0,0,False,True,6755000,high
106,5450,4,2,1,1,0,1,0,1,0,1,True,False,6160000,high
270,4500,3,2,3,1,0,0,1,0,1,0,False,False,4340000,medium
435,4040,2,1,1,1,0,0,0,0,0,0,False,True,3290000,low


In [None]:
# Pull the regression target out of the combined training frame, then drop it
# from the frame so that only feature columns remain. The order matters: the
# target must be read before the column is removed.
# NOTE(review): re-running this cell raises KeyError because 'price' has already
# been dropped — restart the kernel and run the cells in order instead.
y_train = training_house_price['price']
training_house_price = training_house_price.drop(columns=['price'])


In [None]:
# Inspect the test features, which now include 'price_category'.
X_test

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price_category
316,5900,4,2,2,0,0,1,0,0,1,0,False,True,medium
77,6500,3,2,3,1,0,0,0,1,0,1,False,False,high
360,4040,2,1,1,1,0,0,0,0,0,0,True,False,medium
90,5000,3,1,2,1,0,0,0,1,0,0,True,False,high
493,3960,3,1,1,1,0,0,0,0,0,0,False,False,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,6000,4,1,2,1,0,1,0,0,2,0,True,False,high
357,6930,4,1,2,0,0,0,0,0,1,0,False,False,medium
39,6000,4,2,4,1,0,0,0,1,1,0,True,False,high
54,6000,3,2,2,1,1,0,0,1,1,0,True,False,high


In [None]:
# Inspect the training frame after 'price' has been removed.
training_house_price

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price_category
46,6000,3,2,4,1,0,0,0,1,1,0,False,False,high
93,7200,3,2,1,1,0,1,0,1,3,0,True,False,high
335,3816,2,1,1,1,0,1,0,1,2,0,False,False,medium
412,2610,3,1,2,1,0,1,0,0,0,1,False,True,low
471,3750,3,1,2,1,0,0,0,0,0,0,False,True,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,6000,4,2,4,1,0,0,0,1,0,0,False,True,high
106,5450,4,2,1,1,0,1,0,1,0,1,True,False,high
270,4500,3,2,3,1,0,0,1,0,1,0,False,False,medium
435,4040,2,1,1,1,0,0,0,0,0,0,False,True,low


In [None]:
import pandas as pd

# Expand 'price_category' into one indicator column per level (training set).
# NOTE(review): 'price_category' is binned from the target price, so these
# indicator columns hand the models a coarse version of the answer.
training_house_price_encoded = pd.get_dummies(training_house_price, columns=['price_category'])

# Encode the test set the same way, then force it onto the training column
# layout: same columns, same order, with 0 for any level absent from the test rows.
X_test_encoded = (
    pd.get_dummies(X_test, columns=['price_category'])
    .reindex(columns=training_house_price_encoded.columns, fill_value=0)
)


In [None]:
# Inspect the encoded training features that are passed to the models.
training_house_price_encoded

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price_category_high,price_category_low,price_category_medium
46,6000,3,2,4,1,0,0,0,1,1,0,False,False,True,False,False
93,7200,3,2,1,1,0,1,0,1,3,0,True,False,True,False,False
335,3816,2,1,1,1,0,1,0,1,2,0,False,False,False,False,True
412,2610,3,1,2,1,0,1,0,0,0,1,False,True,False,True,False
471,3750,3,1,2,1,0,0,0,0,0,0,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,6000,4,2,4,1,0,0,0,1,0,0,False,True,True,False,False
106,5450,4,2,1,1,0,1,0,1,0,1,True,False,True,False,False
270,4500,3,2,3,1,0,0,1,0,1,0,False,False,False,False,True
435,4040,2,1,1,1,0,0,0,0,0,0,False,True,False,True,False


In [None]:
# Inspect the encoded test features, aligned to the training columns.
X_test_encoded

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price_category_high,price_category_low,price_category_medium
316,5900,4,2,2,0,0,1,0,0,1,0,False,True,False,False,True
77,6500,3,2,3,1,0,0,0,1,0,1,False,False,True,False,False
360,4040,2,1,1,1,0,0,0,0,0,0,True,False,False,False,True
90,5000,3,1,2,1,0,0,0,1,0,0,True,False,True,False,False
493,3960,3,1,1,1,0,0,0,0,0,0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,6000,4,1,2,1,0,1,0,0,2,0,True,False,True,False,False
357,6930,4,1,2,0,0,0,0,0,1,0,False,False,False,False,True
39,6000,4,2,4,1,0,0,0,1,1,0,True,False,True,False,False
54,6000,3,2,2,1,1,0,0,1,1,0,True,False,True,False,False


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# Hyper-parameter candidates explored for the single decision tree
param_grid = dict(
    max_depth=[None, 3, 5, 7, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator with a fixed seed
dt = DecisionTreeRegressor(random_state=42)

# Exhaustive search over the grid: 5-fold cross-validation, ranked by R²,
# running on all available cores
grid_search = GridSearchCV(dt, param_grid, scoring='r2', cv=5, n_jobs=-1)
grid_search.fit(training_house_price_encoded, y_train)

# Keep the winning configuration
best_dt = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predict on both splits with the selected tree
y_train_pred = best_dt.predict(training_house_price_encoded)
y_test_pred = best_dt.predict(X_test_encoded)


def evaluate_model(y_true, y_pred, dataset_name):
    """Print R², MSE and RMSE for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        dataset_name: Label printed in front of the metrics (e.g. "Train").
    """
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    rmse = mse ** 0.5  # RMSE is the square root of the MSE
    print(f"{dataset_name} → R²: {r2:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}")


# Report the training metrics first, then the held-out test metrics
for split_name, actual, predicted in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    evaluate_model(actual, predicted, split_name)


Best Parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Train → R²: 0.8864, MSE: 350387077112.4096, RMSE: 591935.0278
Test → R²: 0.7998, MSE: 1011828512805.4253, RMSE: 1005896.8699


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# 1. Parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# 2. Initialize Random Forest (fixed seed)
rf = RandomForestRegressor(random_state=42)

# 3. Grid search: 5-fold cross-validation, ranked by R², on all available cores
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1
)
grid_search_rf.fit(training_house_price_encoded, y_train)

# 4. Best model
best_rf = grid_search_rf.best_estimator_
print("Best Parameters:", grid_search_rf.best_params_)

# 5. Predictions on both splits
y_train_pred = best_rf.predict(training_house_price_encoded)
y_test_pred = best_rf.predict(X_test_encoded)

# 6. Evaluate on both sets. evaluate_model is defined once, in the Decision Tree
#    cell; it is reused here instead of being re-declared with an identical body.
evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Train → R²: 0.9516, MSE: 149321224635.7845, RMSE: 386421.0458
Test → R²: 0.8172, MSE: 923742125012.0570, RMSE: 961115.0425


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# 1. Parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# 2. Initialize XGBoost Regressor (fixed seed, squared-error objective)
# NOTE(review): both the estimator and GridSearchCV request n_jobs=-1; nested
# parallelism can oversubscribe the CPU — consider parallelizing only one level.
xgb = XGBRegressor(random_state=42, objective='reg:squarederror', n_jobs=-1)

# 3. Grid search: 5-fold cross-validation, ranked by R²
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1
)
grid_search_xgb.fit(training_house_price_encoded, y_train)

# 4. Best model
best_xgb = grid_search_xgb.best_estimator_
print("Best Parameters:", grid_search_xgb.best_params_)

# 5. Predictions on both splits
y_train_pred = best_xgb.predict(training_house_price_encoded)
y_test_pred = best_xgb.predict(X_test_encoded)

# 6. Evaluate on both sets. evaluate_model is defined once, in the Decision Tree
#    cell; it is reused here instead of being re-declared with an identical body.
evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Train → R²: 0.9329, MSE: 206935195648.0000, RMSE: 454901.3032
Test → R²: 0.8043, MSE: 989091135488.0000, RMSE: 994530.6106


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# 1. Parameter grid for CatBoost
param_grid = {
    'iterations': [200, 500],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# 2. Initialize CatBoost (silent mode to avoid long output)
cat_model = CatBoostRegressor(
    random_state=42,
    verbose=0
)

# 3. Grid search: 5-fold cross-validation, ranked by R², on all available cores
grid_search_cat = GridSearchCV(
    estimator=cat_model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1
)
grid_search_cat.fit(training_house_price_encoded, y_train)

# 4. Best model
best_cat = grid_search_cat.best_estimator_
print("Best Parameters:", grid_search_cat.best_params_)

# 5. Predictions on both splits
y_train_pred = best_cat.predict(training_house_price_encoded)
y_test_pred = best_cat.predict(X_test_encoded)

# 6. Evaluate on both sets. evaluate_model is defined once, in the Decision Tree
#    cell; it is reused here instead of being re-declared with an identical body.
evaluate_model(y_train, y_train_pred, "Train")
evaluate_model(y_test, y_test_pred, "Test")


Best Parameters: {'depth': 4, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.05}
Train → R²: 0.9219, MSE: 240813749575.6290, RMSE: 490727.7754
Test → R²: 0.8068, MSE: 976735410997.4202, RMSE: 988299.2517


SyntaxError: invalid character '²' (U+00B2) (ipython-input-4149640709.py, line 13)

# House Price Prediction — Model Performance Report

## Overview
We trained and evaluated four regression models on the house price dataset:

1. **DecisionTreeRegressor**
2. **RandomForestRegressor**
3. **XGBRegressor**
4. **CatBoostRegressor**

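All models were tuned with **GridSearchCV** (5-fold cross-validation, scored by R²), fitted on the processed training set, and evaluated on both the train and test sets. Note that the feature set includes `price_category`, which is derived from the target price itself, so the scores below are likely optimistic. Performance metrics include: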

- **R²** (Coefficient of Determination)
- **MSE** (Mean Squared Error)
- **RMSE** (Root Mean Squared Error)

---

## Results Summary

| Model                  | Train R²  | Train MSE           | Train RMSE   | Test R²  | Test MSE           | Test RMSE   |
|------------------------|-----------|---------------------|--------------|----------|--------------------|-------------|
| DecisionTreeRegressor  | 0.8864    | 3.50 × 10¹¹         | 591,935.03   | 0.7998   | 1.01 × 10¹²        | 1,005,896.87|
| RandomForestRegressor  | **0.9516**| 1.49 × 10¹¹         | 386,421.05   | **0.8172**| **9.24 × 10¹¹**    | **961,115.04**|
| XGBRegressor           | 0.9329    | 2.07 × 10¹¹         | 454,901.30   | 0.8043   | 9.89 × 10¹¹        | 994,530.61  |
| CatBoostRegressor      | 0.9219    | 2.41 × 10¹¹         | 490,727.78   | 0.8068   | 9.77 × 10¹¹        | 988,299.25  |

---

## Observations

- **DecisionTreeRegressor** underperformed the ensemble methods on the test set, with the lowest test R² (0.7998) and the highest test RMSE (1,005,897).
- **RandomForestRegressor** achieved the **highest R² on the test set** (0.8172) and the **lowest test RMSE** (961,115), i.e. the best test-set performance, although it also shows the largest train–test R² gap (0.9516 vs. 0.8172).
- **XGBRegressor** and **CatBoostRegressor** both performed well, slightly below Random Forest in test performance, but with strong consistency.
- Ensemble methods (Random Forest, XGBoost, CatBoost) clearly outperformed a single Decision Tree.

---

## Best Model

🏆 **RandomForestRegressor** is the best-performing model based on **highest Test R²** and **lowest Test RMSE**.


In [None]:
# List the test-set feature columns (before 'price_category' is one-hot encoded).
X_test.columns

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom',
       'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished',
       'price_category'],
      dtype='object')

In [None]:
!pip install streamlit


Collecting streamlit
  Downloading streamlit-1.48.1-py3-none-any.whl.metadata (9.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.48.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hIns

In [None]:
import streamlit as st
import pandas as pd
import pickle
import os

# ------------------------------
# Helper: Load Model Safely
# ------------------------------
def load_model(file_path):
    """Load a pickled model from ``file_path``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or ``None`` when the file does not exist (in
        which case an error message is shown in the Streamlit page).
    """
    # Guard clause: report the missing file in the UI and hand back None.
    if not os.path.exists(file_path):
        st.error(f"Model file not found: {file_path}")
        return None

    # NOTE: unpickling executes code embedded in the file; only load model
    # files that this project produced itself.
    with open(file_path, "rb") as model_file:
        return pickle.load(model_file)

# ------------------------------
# Load trained models
# ------------------------------
# An entry is None when its pickle file is missing (load_model reports that in
# the page), so the prediction step checks for None before using a model.
decision_tree_model = load_model("decision_tree_model.pkl")
random_forest_model = load_model("random_forest_model.pkl")
xgb_model = load_model("xgb_model.pkl")
catboost_model = load_model("catboost_model.pkl")

models = {
    "Decision Tree": decision_tree_model,
    "Random Forest": random_forest_model,
    "XGBoost": xgb_model,
    "CatBoost": catboost_model
}

# Feature columns in the exact order of the encoded training frame. The models
# were fitted on a DataFrame, and scikit-learn rejects prediction input whose
# column names or column order differ from what was seen at fit time. Note that
# the price-category dummies are ordered high, low, medium.
FEATURE_COLUMNS = [
    "area",
    "bedrooms",
    "bathrooms",
    "stories",
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "parking",
    "prefarea",
    "furnishingstatus_semi-furnished",
    "furnishingstatus_unfurnished",
    "price_category_high",
    "price_category_low",
    "price_category_medium",
]

# ------------------------------
# Streamlit App
# ------------------------------
st.title("🏠 House Price Prediction App")

st.markdown("Provide house details and select a trained ML model to predict the price.")

# Numerical inputs
st.subheader("Numeric Features")
area = st.number_input("Area (sq ft)", min_value=0)
bedrooms = st.number_input("Bedrooms", min_value=0, step=1)
bathrooms = st.number_input("Bathrooms", min_value=0, step=1)
stories = st.number_input("Stories", min_value=0, step=1)
parking = st.number_input("Parking Spaces", min_value=0, step=1)

# Categorical inputs
st.subheader("Categorical Features (Yes / No)")
def yes_no_input(label):
    """Render a no/yes select box and return 1 for "yes", 0 for "no"."""
    return 1 if st.selectbox(label, ["no", "yes"]) == "yes" else 0

mainroad = yes_no_input("Main Road")
guestroom = yes_no_input("Guest Room")
basement = yes_no_input("Basement")
hotwaterheating = yes_no_input("Hot Water Heating")
airconditioning = yes_no_input("Air Conditioning")
prefarea = yes_no_input("Preferred Area")

# Furnishing status: a single choice, so the two dummy columns can never both
# be 1. "furnished" is the baseline level and sets both columns to 0.
st.subheader("Furnishing Status")
furnishing_status = st.selectbox(
    "Select Furnishing Status",
    ["furnished", "semi-furnished", "unfurnished"]
)
furnishing_semi = 1 if furnishing_status == "semi-furnished" else 0
furnishing_unfurnished = 1 if furnishing_status == "unfurnished" else 0

# Price category
# NOTE(review): the models expect the price band of the house as an input, so
# the user has to state roughly what the app is asked to predict — confirm that
# this feature should be part of the model.
st.subheader("Price Category")
price_category = st.selectbox("Select Price Category", ["low", "medium", "high"])
price_low = 1 if price_category == "low" else 0
price_medium = 1 if price_category == "medium" else 0
price_high = 1 if price_category == "high" else 0

# Model selection
model_choice = st.selectbox(
    "Select Model",
    list(models.keys())
)

# ------------------------------
# Prepare input dataframe
# ------------------------------
feature_values = {
    "area": area,
    "bedrooms": bedrooms,
    "bathrooms": bathrooms,
    "stories": stories,
    "mainroad": mainroad,
    "guestroom": guestroom,
    "basement": basement,
    "hotwaterheating": hotwaterheating,
    "airconditioning": airconditioning,
    "parking": parking,
    "prefarea": prefarea,
    "furnishingstatus_semi-furnished": furnishing_semi,
    "furnishingstatus_unfurnished": furnishing_unfurnished,
    "price_category_high": price_high,
    "price_category_low": price_low,
    "price_category_medium": price_medium,
}
# One-row frame laid out in the training column order.
input_data = pd.DataFrame([feature_values], columns=FEATURE_COLUMNS)

# ------------------------------
# Predict
# ------------------------------
if st.button("Predict Price"):
    model = models[model_choice]
    if model is None:
        # Tell the user why nothing was predicted instead of silently ignoring
        # the click.
        st.error(f"The {model_choice} model is not available; its model file could not be loaded.")
    else:
        try:
            prediction = model.predict(input_data)[0]
            st.success(f"💰 Estimated Price: ₹ {prediction:,.2f}")
        except Exception as e:
            # Show any prediction failure in the page rather than crashing the app.
            st.error(f"Prediction failed: {e}")




In [None]:
import pickle
# Persist each tuned model under the filename that the Streamlit app loads.
trained_models = {
    "decision_tree_model.pkl": best_dt,
    "random_forest_model.pkl": best_rf,
    "xgb_model.pkl": best_xgb,
    "catboost_model.pkl": best_cat,
}

for model_path, fitted_model in trained_models.items():
    # The with-block closes (and flushes) every file; the previous bare open()
    # calls left their handles open.
    with open(model_path, "wb") as model_file:
        pickle.dump(fitted_model, model_file)


In [None]:
!pip install streamlit pyngrok


Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [None]:
%%writefile house_price_app.py
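# TODO: replace these comment lines with the complete Streamlit app code
# (everything from `import streamlit as st` onward, including imports).
# As written, this cell writes a house_price_app.py that contains only comments.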


Writing house_price_app.py
