In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [3]:

# Load the listings and clean them in a single pipeline.
# NOTE(review): the first dropna() discards a row if ANY column is missing,
# including columns that are never used later — confirm this is intended.
df = (
    pd.read_csv("bengaluru_house_prices.csv")
    .dropna()
    .assign(
        # The first space-separated token of 'size' is parsed as the bedroom count.
        bhk=lambda d: d['size'].map(lambda size: int(size.split(' ')[0])),
        # Values that are not plain numbers become NaN and are dropped just below.
        total_sqft=lambda d: pd.to_numeric(d['total_sqft'], errors='coerce'),
    )
    .dropna(subset=['total_sqft', 'price', 'bath'])
    # presumably 'price' is in lakhs (1 lakh = 100,000) — TODO confirm the unit
    .assign(price_per_sqft=lambda d: d['price'] * 100000 / d['total_sqft'])
    # Keep listings with more than 300 sqft per bedroom ...
    .loc[lambda d: d['total_sqft'] / d['bhk'] > 300]
    # ... and with fewer than bhk + 3 bathrooms.
    .loc[lambda d: d['bath'] < d['bhk'] + 3]
)


In [5]:

def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately within each location.

    For every ``location`` group, only rows whose ``price_per_sqft`` lies in
    the half-open interval ``(mean - std, mean + std]`` are kept, where
    ``std`` is the population standard deviation (``ddof=0``) of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location group, with a fresh
        0..n-1 index. When nothing survives (including empty input) an empty
        frame that still carries ``df``'s columns is returned.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        m = pps.mean()
        st = pps.std(ddof=0)  # population std, i.e. what np.std computes
        # NOTE(review): the lower bound is strict, so a group with zero spread
        # (e.g. a single listing) is removed entirely — confirm this is wanted.
        reduced_df = subdf[(pps > (m - st)) & (pps <= (m + st))]
        if not reduced_df.empty:
            kept_groups.append(reduced_df)

    if not kept_groups:
        # Keep the schema so downstream column selection still works.
        return df.iloc[0:0].reset_index(drop=True)

    # One concat at the end instead of growing a frame inside the loop:
    # linear instead of quadratic, and no empty seed frame to influence dtypes.
    return pd.concat(kept_groups, ignore_index=True)

# Apply the per-location price-per-sqft outlier filter to the cleaned listings.
df = df.pipe(remove_pps_outliers)


In [9]:

# Take an explicit copy: assigning into a bare column selection of `df` is what
# raises pandas' SettingWithCopyWarning.
df_model = df[['location','total_sqft','bath','bhk','balcony','price']].copy()
# Normalise location names (lower-case, no surrounding whitespace) before encoding.
df_model['location'] = df_model['location'].str.lower().str.strip()

# One indicator column per location; categories unseen at fit time encode as all zeros.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
loc_encoded = ohe.fit_transform(df_model[['location']])
loc_df = pd.DataFrame(loc_encoded, columns=ohe.get_feature_names_out(['location']))

# loc_df has a fresh 0..n-1 index, so the numeric features and the target are
# reset to the same index; X and y then stay row-aligned whatever index `df` had.
X = pd.concat([df_model.drop(columns=['location','price']).reset_index(drop=True),
               loc_df], axis=1)
y = df_model['price'].reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['location'] = df_model['location'].str.lower().str.strip()


In [11]:

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameters gathered in one place so they are easy to read and tune.
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 1,
    "random_state": 42,
    "verbosity": 0,
}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)


In [13]:

# Score the fitted model on the held-out rows.
y_pred = xgb_model.predict(X_test)
for label, metric in (("R2 Score:", r2_score), ("MAE:", mean_absolute_error)):
    print(label, metric(y_test, y_pred))


R2 Score: 0.9146698617831595
MAE: 10.85301667514361


In [18]:

def predict_price(location, sqft, bath, bhk, balcony=1):
    """Predict the price of a single listing with the fitted ``xgb_model``.

    The feature row is built by column name from the training matrix ``X`` and
    handed to the model as a one-row DataFrame, so it carries the same feature
    names and column order the model was fitted on.

    Parameters
    ----------
    location : str
        Location name. It is lower-cased and stripped, then matched against
        the ``location_<name>`` columns of ``X``.
    sqft : number
        Value for the ``total_sqft`` feature.
    bath : number
        Value for the ``bath`` feature.
    bhk : number
        Value for the ``bhk`` feature.
    balcony : number, default 1
        Value for the ``balcony`` feature.

    Returns
    -------
    The first (and only) element of ``xgb_model.predict`` for this row, in the
    units of the ``price`` column the model was trained on.

    Warns
    -----
    UserWarning
        If ``location`` has no matching column in ``X``. The prediction is
        still made, with every location indicator left at 0.
    """
    loc_column = 'location_' + location.lower().strip()

    # Start from an all-zero row and fill features by name rather than by
    # position, so a change in column order cannot silently misplace them.
    features = pd.Series(0.0, index=X.columns)
    features['total_sqft'] = sqft
    features['bath'] = bath
    features['bhk'] = bhk
    features['balcony'] = balcony

    if loc_column in features.index:
        features[loc_column] = 1.0
    else:
        warnings.warn(
            f"Unknown location {location!r}: predicting with no location indicator set.",
            UserWarning,
            stacklevel=2,
        )

    row = features.to_frame().T
    return xgb_model.predict(row)[0]


In [20]:
# Spot-check a few listings: (location, sqft, bath, bhk, balcony).
sample_listings = [
    ('Electronic City', 1056, 2, 2, 1),
    ('Whitefield', 1500, 3, 3, 2),
    ('Indira Nagar', 2000, 3, 3, 2),
]
for listing in sample_listings:
    print(listing[0] + ":", predict_price(*listing))


Electronic City: 51.75222
Whitefield: 80.529144
Indira Nagar: 167.92839


In [22]:
# Same listing size as the 'Electronic City' example, different locality; balcony uses its default.
predict_price('Electronic City Phase II', sqft=1056, bath=2, bhk=2)

33.759075