**Import Libraries**

In [111]:
# Basic setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline


**Load and Preview Data**

In [112]:
# Read the Chennai listings CSV into a DataFrame, then preview its first five rows
df = pd.read_csv(filepath_or_buffer='chennai_house_price.csv')
df.head(n=5)


Unnamed: 0,price,area,status,bhk,bathroom,age,location,builder
0,37.49,872,Ready to move,2,,1.0,Sembakkam,MP Developers
1,93.54,1346,Under Construction,3,2.0,,Selaiyur,DAC Promoters
2,151.0,2225,Under Construction,3,,0.0,Mogappair,Casagrand Builder Private Limited
3,49.0,1028,Ready to move,2,2.0,3.0,Ambattur,Dugar Housing Builders
4,42.28,588,Under Construction,2,1.0,0.0,Pallavaram,Radiance Realty Developers India Ltd


**Explore Data**

In [113]:
# Check structure: .info() prints column dtypes and non-null counts itself
df.info()

# Summary statistics. A notebook cell only renders its final expression
# automatically, so this intermediate result has to be handed to display()
# explicitly; as a bare expression it would be computed and then discarded.
display(df.describe())

# Check for nulls: per-column count of missing values (last expression, so it
# is rendered as the cell's output)
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2620 entries, 0 to 2619
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     2620 non-null   float64
 1   area      2620 non-null   int64  
 2   status    2620 non-null   object 
 3   bhk       2620 non-null   int64  
 4   bathroom  1403 non-null   float64
 5   age       1729 non-null   float64
 6   location  2620 non-null   object 
 7   builder   2620 non-null   object 
dtypes: float64(3), int64(2), object(3)
memory usage: 163.9+ KB


Unnamed: 0,0
price,0
area,0
status,0
bhk,0
bathroom,1217
age,891
location,0
builder,0


**Clean Data**

In [114]:
# Discard every row containing at least one missing value
# (imputing the gaps instead is a possible later refinement)
df.dropna(axis=0, how='any', inplace=True)

# Normalise all column names to lowercase so later lookups are consistent
df.columns = [column_name.lower() for column_name in df.columns]

# Report how many distinct locations and builders remain (both are encoded later)
print("Unique locations:", df.loc[:, 'location'].nunique())
print("Unique builders:", df.loc[:, 'builder'].nunique())


Unique locations: 104
Unique builders: 63


**Encode Categorical Columns**

In [115]:
# One LabelEncoder per categorical column; each encoder is kept so the same
# label -> integer mapping can be re-applied to new inputs at prediction time.
le_location, le_builder, le_status = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Learn each column's labels, then overwrite the column with its integer codes.
# NOTE(review): because the text columns are replaced in place, re-running this
# cell would presumably refit the encoders on the integer codes rather than the
# original labels -- re-run from the data-loading cell instead.
df['location'] = le_location.fit(df['location']).transform(df['location'])
df['builder'] = le_builder.fit(df['builder']).transform(df['builder'])
df['status'] = le_status.fit(df['status']).transform(df['status'])


**Feature Selection**

In [116]:
# Predictor columns (X) and the value the model learns to predict (y)
X = df.loc[:, ['area', 'bhk', 'bathroom', 'age', 'location', 'builder', 'status']]
y = df.loc[:, 'price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


**Train the Model** (***Random Forest***)

In [117]:
# Train Random Forest (100 trees; fixed seed so the fitted forest is reproducible)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the held-out split using the model fitted above. The name must be
# `rf_model`: no variable called `rf` is defined anywhere in this notebook, so
# using it only works with leftover kernel state and fails on Restart & Run All.
y_pred = rf_model.predict(X_test)

# Evaluate: coefficient of determination on the test split
print("R2 Score:", r2_score(y_test, y_pred))


R2 Score: 0.8006450963532741


**Prediction Function**

In [118]:
# Prediction function
def predict_price(area, bhk, bathroom, age, location, builder, status):
    """Estimate a house price from raw, human-readable feature values.

    Args:
        area: Area value; converted with ``float()``.
        bhk: BHK count; converted with ``int()``.
        bathroom: Bathroom count; converted with ``int()``.
        age: Property age; converted with ``int()``.
        location: Location label; must be one known to ``le_location``.
        builder: Builder label; must be one known to ``le_builder``.
        status: Status label; must be one known to ``le_status``.

    Returns:
        str: The formatted estimate on success. If anything goes wrong (for
        example an unknown label or a non-numeric value), the exception is not
        raised; a string starting with "⚠️ Error:" is returned instead.
    """
    try:
        # Trim stray surrounding whitespace from text inputs so that e.g.
        # " Adyar " is not rejected as an unknown label. This mirrors the
        # .strip() applied to the same fields in the interactive prompt cell.
        location, builder, status = (
            value.strip() if isinstance(value, str) else value
            for value in (location, builder, status)
        )

        # Map each label to the integer code its fitted encoder assigned
        location_encoded = le_location.transform([location])[0]
        builder_encoded = le_builder.transform([builder])[0]
        status_encoded = le_status.transform([status])[0]

        # Single-row frame using the same column names, in the same order,
        # as the feature frame the model was trained on
        input_data = pd.DataFrame([{
            'area': float(area),
            'bhk': int(bhk),
            'bathroom': int(bathroom),
            'age': int(age),
            'location': location_encoded,
            'builder': builder_encoded,
            'status': status_encoded
        }])

        prediction = rf_model.predict(input_data)[0]
        return f"🏠 Estimated House Price: ₹{prediction:.2f} lakhs"

    except Exception as e:
        # Deliberate best-effort: report the problem as text instead of raising
        return f"⚠️ Error: {str(e)}"

**Checking Available Locations, Builders, and Statuses**

In [119]:
# Show the first 20 known locations and builders, and every known status
print("📍 Available Locations:\n", le_location.classes_[:20].tolist())
print("🏗️ Available Builders:\n", le_builder.classes_[:20].tolist())
print("🏡 Available Statuses:\n", le_status.classes_.tolist())


📍 Available Locations:
 ['Adyar', 'Agaramthen', 'Alandur', 'Ambattur', 'Anna Nagar', 'Ayanambakkam', 'CIT Nagar', 'Chromepet', 'Egmore', 'Elandanur', 'Gerugambakkam', 'Gokulapuram', 'Gowrivakkam', 'Guduvancheri', 'Guindy', 'Iyappanthangal', 'Iyyapa Nagar', 'Iyyappanthangal', 'Jamalia', 'Jeth Nagar']
🏗️ Available Builders:
 ['24K Realtors', 'AKS REALTY SERVICES', 'ARB HOMES', 'Advaita Homes', 'Alliance Group', 'Amarprakash Developers Pvt Ltd', 'Bala', 'Balasubramani', 'BricksBurg', 'Casagrand Builder Private Limited', 'Chennai Gated Community', 'DAC Promoters', 'DJ Properties', 'Dee Star Properties', 'Dinesh', 'Doshi Housing', 'Dugar Housing Builders', 'Elite nisha', 'GJ ESTATES', 'HM Homes']
🏡 Available Statuses:
 ['Ready to move', 'Under Construction']


**Testing the Model on a Sample Input**

In [120]:
# Translate the sample's text labels into integer codes using the already-fitted
# encoders (each transform returns a one-element array, unpacked to a scalar)
location_encoded, = le_location.transform(['Adyar'])
builder_encoded, = le_builder.transform(['Casagrand Builder Private Limited'])
status_encoded, = le_status.transform(['Ready to move'])

# One-row frame whose columns follow the same order as the training features
input_data = pd.DataFrame({
    'area': [1200],
    'bhk': [2],
    'bathroom': [2],
    'age': [5],
    'location': [location_encoded],
    'builder': [builder_encoded],
    'status': [status_encoded],
})

# Run the model; the result is an array holding a single prediction
predicted_price = rf_model.predict(input_data)
print(f"🏠 Predicted House Price: ₹{predicted_price[0]:.2f} lakhs")


🏠 Predicted House Price: ₹58.96 lakhs


**Prompting for User Input to Predict House Prices**

In [122]:
# Utility to validate inputs
def get_valid_input(prompt, cast_type=int):
    """Prompt repeatedly until the user enters a value `cast_type` accepts.

    Args:
        prompt: Text shown to the user on every attempt.
        cast_type: Callable used to convert the entered text (default ``int``).
            A ``ValueError`` from it is treated as invalid input.

    Returns:
        The entered text converted by ``cast_type``.
    """
    while True:
        value = input(prompt)

        # Handle blank input separately so the user sees the real reason.
        # Raising ValueError here would be caught by the same handler as a
        # failed conversion, and its message would never be shown.
        if value.strip() == "":
            print("Input cannot be empty.")
            continue

        try:
            return cast_type(value)
        except ValueError:
            print(f"Please enter a valid {cast_type.__name__}.")

# Numeric features are gathered through the validating prompt helper
area = get_valid_input("Enter area in sqft: ", cast_type=float)
bhk = get_valid_input("Enter number of BHK: ", cast_type=int)
bathroom = get_valid_input("Enter number of bathrooms: ", cast_type=int)
age = get_valid_input("Enter age of the property (in years): ", cast_type=int)

# Categorical features are free text; surrounding whitespace is trimmed
location = input("Enter location (e.g., Adyar, Anna Nagar): ").strip()
builder = input("Enter builder (e.g., Casagrand Builder Private Limited): ").strip()
status = input("Enter status (Ready to move / Under Construction): ").strip()

# Convert the text labels to the integer codes of the fitted encoders.
# The ValueError branch reports labels the encoders reject.
try:
    location_encoded, = le_location.transform([location])
    builder_encoded, = le_builder.transform([builder])
    status_encoded, = le_status.transform([status])
except ValueError as e:
    print("\n⚠️ One or more of your inputs are invalid (not seen during training). Please check spelling and try again.")
    print("Error:", e)
else:
    # All labels were recognised: assemble a one-row frame with the columns
    # in the same order as the training features
    input_df = pd.DataFrame({
        'area': [area],
        'bhk': [bhk],
        'bathroom': [bathroom],
        'age': [age],
        'location': [location_encoded],
        'builder': [builder_encoded],
        'status': [status_encoded],
    })

    # Ask the model for its estimate and report it
    predicted_price = rf_model.predict(input_df)[0]
    print(f"\n🏠 Predicted Price: ₹{predicted_price:.2f} lakhs")


Enter area in sqft: 1500
Enter number of BHK: 3
Enter number of bathrooms: 2
Enter age of the property (in years): 5
Enter location (e.g., Adyar, Anna Nagar): Alandur
Enter builder (e.g., Casagrand Builder Private Limited): Alliance Group
Enter status (Ready to move / Under Construction): Ready to move

🏠 Predicted Price: ₹145.59 lakhs
