In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
# Load dataset
# Supported cities and the CSV file that holds each city's listings.
city_files = {
    "Gurgaon": "gurgaon_10k.csv",
    "Hyderabad": "hyderabad.csv",
    "Mumbai": "mumbai.csv",
}

# Trim whitespace and title-case the answer so "hyderabad " still matches.
city = input("Enter the city for which you want to find the real estate value for : ").strip().title()
if city not in city_files:
    # Fail here with a clear message instead of letting read_csv("") raise
    # an unrelated file-not-found error.
    raise ValueError(
        f"Unsupported city {city!r}; choose one of: {', '.join(city_files)}"
    )

filename = city_files[city]
original = pd.read_csv(filename)

# Keep only the columns used downstream and drop rows missing any of them.
df = original[['CITY', 'PRICE_PER_UNIT_AREA', 'REGISTER_DATE']].dropna()

In [3]:
# Function to clean ordinal suffixes and special date cases like "Yesterday" and "Today"
def clean_date(date_str):
    """Normalise a raw register-date string to the form ``"<day> <Mon>, <year>"``.

    Parameters
    ----------
    date_str : Any
        The raw cell value. Strings are cleaned; any other type (e.g. NaN,
        None, an existing Timestamp) is returned unchanged.

    Returns
    -------
    Any
        For strings: the input with ordinal suffixes (st, nd, rd, th) removed
        from day numbers. Strings containing "Yesterday" or "Today" are first
        replaced by fixed placeholder dates (4 and 5 Oct 2023 respectively).
        For non-strings: the input itself.
    """
    if not isinstance(date_str, str):
        return date_str  # Leave non-string values untouched

    # Relative labels are replaced with hard-coded calendar dates.
    if "Yesterday" in date_str:
        date_str = "04 Oct, 2023"  # Custom date for "Yesterday"
    elif "Today" in date_str:
        date_str = "05 Oct, 2023"  # Custom date for "Today"

    # Every string (placeholders included) goes through the same suffix
    # stripping, so the result never carries an ordinal suffix.
    return re.sub(r"(\d+)(st|nd|rd|th)", r"\1", date_str)

In [5]:
# Normalise the raw "REGISTER_DATE" strings (ordinal suffixes, relative labels)
cleaned_dates = df['REGISTER_DATE'].apply(clean_date)

# Parse the normalised strings into datetimes and store them back on the frame
register_dates = pd.to_datetime(cleaned_dates, format='%d %b, %Y')
df['REGISTER_DATE'] = register_dates

# Feature engineering: calendar year and month of registration
df['year'] = register_dates.dt.year
df['month'] = register_dates.dt.month

# Show the result
print(df)

              CITY  PRICE_PER_UNIT_AREA REGISTER_DATE  year  month
0     Secunderabad               5700.0    2023-07-08  2023      7
1        Hyderabad               6000.0    2023-04-12  2023      4
2        Hyderabad                  0.0    2023-06-08  2023      6
3        Hyderabad              20049.0    2023-06-08  2023      6
4        Hyderabad               8999.0    2023-07-14  2023      7
...            ...                  ...           ...   ...    ...
9482     Hyderabad              19000.0    2023-06-27  2023      6
9483     Hyderabad              11000.0    2023-06-27  2023      6
9484  Secunderabad                 15.0    2023-06-28  2023      6
9485     Hyderabad               6400.0    2023-08-18  2023      8
9486     Hyderabad               5328.0    2023-08-11  2023      8

[9487 rows x 5 columns]


In [6]:
# Filter columns
# Drop incomplete rows BEFORE slicing: X and y are taken from df as it is at
# that moment, so a dropna() issued afterwards would not affect them.
df = df.dropna()
X = df[['CITY', 'year', 'month']]
y = df['PRICE_PER_UNIT_AREA']

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline: one-hot encode CITY, pass year/month through as-is.
# handle_unknown='ignore' means a city unseen during fit is encoded as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('city', OneHotEncoder(handle_unknown='ignore'), ['CITY'])
    ],
    remainder='passthrough'
)

# Model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train model
model.fit(X_train, y_train)

# Evaluate on the held-out split so the prediction below comes with an error
# estimate (same units as PRICE_PER_UNIT_AREA).
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print(f"Hold-out mean absolute error: {test_mae:.2f}")

# User Input
year = int(input("Enter the year: "))  # e.g., 2023
month = int(input("Enter the month: "))  # e.g., 11
if not 1 <= month <= 12:
    raise ValueError(f"Month must be between 1 and 12, got {month}")
sq_ft = int(input("Enter the number of square feet you would like to purchase: "))  # e.g., 1000
if sq_ft <= 0:
    raise ValueError(f"Square feet must be a positive integer, got {sq_ft}")

# The encoder silently ignores unknown categories, so tell the user when the
# requested city contributes no signal to the prediction.
if city not in set(X_train['CITY']):
    print(f"Note: {city!r} does not appear in the training data's CITY column; "
          "the prediction is based on year and month only.")

# Create query for prediction
query_df = pd.DataFrame({
    'CITY': [city],
    'year': [year],
    'month': [month]
})

# NOTE(review): the raw model output is divided by 5 before being reported as a
# per-square-foot price. The reason is not visible in this notebook - presumably
# a unit or scale adjustment; confirm the intent before relying on the figures.
PRICE_DIVISOR = 5

# Prediction
predicted_price_per_sq_ft = model.predict(query_df)[0] / PRICE_DIVISOR
amount_pred = predicted_price_per_sq_ft * sq_ft

# Output
print(f"Predicted price per square foot: INR {predicted_price_per_sq_ft:.2f}")
print(f"Total amount for {sq_ft} square feet: INR {amount_pred:.2f}")

Predicted price per square foot: INR 12654.21
Total amount for 1200 square feet: INR 15185052.66
