## House Price Prediction

In [2]:
import pandas as pd

# Short snake_case aliases for the raw CSV headers.
COLUMN_ALIASES = {
    'Area (sqft)': 'area',
    'BHK': 'bhk',
    'Location': 'location',
    'Type': 'type',
    'Price (₹)': 'price',
    'Price per sqft (₹)': 'price_per_sqft',
}

# Load -> rename -> drop -> one-hot encode, expressed as a single chain so the
# cell can be re-run without relying on in-place mutation of an earlier frame.
df = (
    pd.read_csv("tirupati_large_cleaned_data.csv")
    .rename(columns=COLUMN_ALIASES)
    # NOTE(review): 'price_per_sqft' is presumably derived from the target
    # ('price'), which would make it a leakage feature — confirm.
    .drop(columns=['Title', 'price_per_sqft'])
    # drop_first=True omits the first level of each categorical column.
    .pipe(pd.get_dummies, columns=['location', 'type'], drop_first=True)
)

# Check final structure
print(df.head())


   area  bhk     price  location_Balaji Colony  location_KT Road  \
0  2570    4   8516980                   False             False   
1  1175    2   6303875                    True             False   
2  1318    4   4536556                   False             False   
3  1373    2   6437997                   False             False   
4  2011    2  11251545                   False             False   

   location_Kapila Teertham  location_Leela Mahal  location_MR Palli  \
0                     False                  True              False   
1                     False                 False              False   
2                     False                 False              False   
3                     False                 False              False   
4                      True                 False              False   

   location_RC Road  location_Renigunta Road  location_SV University  \
0             False                    False                   False   
1             

## Split Data

In [3]:
from sklearn.model_selection import train_test_split

# Name of the column the models are trained to predict.
TARGET = 'price'

X = df.drop(columns=[TARGET])  # Features: every column except the target
y = df[TARGET]                 # Target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Train a Model

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters for the baseline forest; the fixed seed makes the fit repeatable.
rf_params = {"n_estimators": 100, "random_state": 42}

model = RandomForestRegressor(**rf_params)
# Left as the cell's last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Evaluate Model

In [5]:
from sklearn.metrics import r2_score, mean_absolute_error

# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

holdout_r2 = r2_score(y_test, y_pred)
holdout_mae = mean_absolute_error(y_test, y_pred)

print("R² Score:", holdout_r2)
print("MAE:", holdout_mae)


R² Score: 0.7352566899613704
MAE: 1528369.7655000002


## Try Multiple Models

Import Required Libraries

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

Define Helper Function

In [7]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, score it on the test split, print a report.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place by this call.
    X_train, y_train
        Features and target used for fitting.
    X_test, y_test
        Features and target used for scoring.

    Returns
    -------
    tuple
        ``(model, r2, mae)``: the same model object that was passed in,
        its R² score, and its mean absolute error on the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    score = r2_score(y_test, predictions)
    abs_error = mean_absolute_error(y_test, predictions)

    # Three report lines; the trailing "\n" on the last one leaves a blank
    # line between consecutive model reports.
    report_lines = (
        f"{model.__class__.__name__}",
        f"  R² Score: {score:.4f}",
        f"  MAE: ₹{abs_error:,.2f}\n",
    )
    print(*report_lines, sep="\n")

    return model, score, abs_error


## Run All Models

In [8]:
# Candidate regressors; each one is fitted and scored on the same train/test split.
models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(n_estimators=100, random_state=42),
    XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
]

# One (model class name, R², MAE) tuple per candidate, in the order evaluated.
results = []

# The loop variable is deliberately not called `model`: that name already holds
# the RandomForest fitted in the training cell, and rebinding it here would leave
# it pointing at the last candidate (XGBRegressor), so re-running the evaluation
# cell afterwards would silently score a different estimator.
for candidate in models:
    trained_model, r2, mae = evaluate_model(candidate, X_train, X_test, y_train, y_test)
    results.append((trained_model.__class__.__name__, r2, mae))


LinearRegression
  R² Score: 0.6994
  MAE: ₹1,828,647.35

DecisionTreeRegressor
  R² Score: 0.7050
  MAE: ₹1,536,965.20

RandomForestRegressor
  R² Score: 0.7353
  MAE: ₹1,528,369.77

XGBRegressor
  R² Score: 0.6888
  MAE: ₹1,571,174.25



Summary Table

In [9]:
import pandas as pd

# Rank the evaluated models from best to worst R²; the original row index is
# kept, so it still shows the order in which the models were evaluated.
results_df = (
    pd.DataFrame(results, columns=["Model", "R2 Score", "MAE (₹)"])
    .sort_values("R2 Score", ascending=False)
)
print(results_df)


                   Model  R2 Score       MAE (₹)
2  RandomForestRegressor  0.735257  1.528370e+06
1  DecisionTreeRegressor  0.705035  1.536965e+06
0       LinearRegression  0.699409  1.828647e+06
3           XGBRegressor  0.688804  1.571174e+06


## Save the Best Model

In [10]:
import pickle

# Refit the chosen model on the full dataset (X, y) before touching the disk.
# Opening 'model.pkl' in 'wb' mode truncates it immediately, so training inside
# the `with` block would leave an empty file (and destroy any previously saved
# model) if the fit raised.
final_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)

# Save model
with open('model.pkl', 'wb') as f:
    pickle.dump(final_model, f)

# Save column names
# NOTE(review): presumably so that prediction code can rebuild the same one-hot
# feature order the model was trained on — confirm against the consumer.
with open('columns.pkl', 'wb') as f:
    pickle.dump(X.columns.tolist(), f)
