In [20]:
from warnings import filterwarnings
from time import perf_counter

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer

# Read the apartment listings from the processed CSV.
df = pd.read_csv("data/processed/mumbai/res_apartment_dataset.csv")

# Column roles: the regression target, the numeric feature(s) and the categorical features.
target = "PRICE"
num_cols = ["AREA"]
cat_cols = [
    "FURNISH",
    "AGE",
    "BEDROOM_NUM",
    "BALCONY_NUM",
    "FLOOR_NUM",
    "FACING",
    "LOCALITY_NAME",
]

# Keep only the modelling columns, on a copy so that `df` itself is never modified.
main_df = df[[target, *num_cols, *cat_cols]].copy()

# Collapse every count above its cap into the sentinel 99 (bedrooms > 5, balconies > 4).
# Values that fail the `<=` comparison - NaN included - also become 99.
for count_col, max_count in (("BEDROOM_NUM", 5), ("BALCONY_NUM", 4)):
    main_df[count_col] = main_df[count_col].where(main_df[count_col] <= max_count, 99)

# Ordered categorical features mapped to their categories, listed from lowest to highest rank.
# Dict insertion order is what keeps the column list and the category lists aligned.
ordinal_levels = {
    'FURNISH': ['unfurnished', 'semifurnished', 'furnished'],
    'AGE': [
        'under construction',
        '0-1 year old property',
        '1-5 year old property',
        '5-10 year old property',
        '10+ year old property',
    ],
    'BEDROOM_NUM': [1.0, 2.0, 3.0, 4.0, 5.0, 99.0],  # 99 is the "more than 5" bucket
    'BALCONY_NUM': [0.0, 1.0, 2.0, 3.0, 4.0, 99.0],  # 99 is the "more than 4" bucket
    'FLOOR_NUM': ['low rise', 'mid rise', 'high rise'],
}

# One encoder per feature family.
ordinal_encoder = OrdinalEncoder(categories=list(ordinal_levels.values()))
# Categories unseen during fit are encoded as all-zero rows instead of raising.
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
# log1p on AREA, with expm1 registered as its inverse.
area_log_transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1, validate=True)

# Assemble the column-wise preprocessing; output columns follow the order of this list.
preprocessor = ColumnTransformer(
    transformers=[
        ("ord", ordinal_encoder, list(ordinal_levels)),
        ("ohe", onehot_encoder, ["FACING", "LOCALITY_NAME"]),
        ("log1p_area", area_log_transformer, ["AREA"]),
    ],
    remainder="passthrough",
)

# Features are every column except the target; the target is modelled on a log1p scale.
y = np.log1p(main_df[target])
X = main_df.drop(columns=[target])

# Hold out 20% of the rows for the final evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Define model pipeline: column-wise preprocessing -> standardization -> random forest.
model = Pipeline([
    # Encodes the categorical columns and log-transforms AREA.
    ("preprocessor", preprocessor),
    # Standardizes every column produced by the preprocessor.
    # NOTE(review): tree ensembles do not need feature scaling; kept so the pipeline
    # structure stays unchanged.
    ("scaler", StandardScaler()),
    # Seed the forest (same seed as the train/test split) so the cross-validation
    # scores, the fitted model and the reported MAE are reproducible across
    # "Restart & Run All"; without it every run trains a different forest.
    ("model", RandomForestRegressor(n_estimators=500, random_state=42)),
])

# Cross-validate on the training split, then fit once and score the held-out test split.
# The timer covers everything from the start of cross-validation to the MAE computation.
start_time = perf_counter()
scores = cross_val_score(model, X_train, y_train, scoring="r2", cv=5)
print(f"Fitting model with {scores.mean():.3f} R2 score.")

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The target was log1p-transformed, so invert with expm1 before measuring the error.
actual_price = np.expm1(y_test)
predicted_price = np.expm1(y_pred)
mae = mean_absolute_error(actual_price, predicted_price) / 100000  # Convert to lakh

print(f"Model takes {(perf_counter() - start_time):.3f} seconds.")
print(f"Mean Absolute Error: {mae:.3f} Lakh.")


Fitting model with 0.911 R2 score.
Model takes 159.248 seconds.
Mean Absolute Error: 40.447 Lakh.


In [None]:
import pickle

# # Save the trained model
# with open("model/price_prediction_model.pkl", "wb") as model_file:
#     pickle.dump(model, model_file)

# Serialize only the ColumnTransformer to model/preprocessor.pkl.
# NOTE(review): `preprocessor` is the same object used as the pipeline's first step, so it
# is presumably fitted only if the training cell's `model.fit(...)` ran first - confirm
# before relying on this pickle.
with open("model/preprocessor.pkl", mode="wb") as pkl_out:
    pickle.dump(preprocessor, pkl_out)


In [None]:
## Deploying the model
## Refer to pages/Price_Prediction.py for the deployment code.