In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the cleaned Mumbai house-price dataset and preview the first rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, i.e. a row index that
# was saved into the CSV; it is not a real feature.
file_path = "data/mumbai-house-price-data-cleaned.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,title,price,area,price_per_sqft,locality,city,property_type,bedroom_num,bathroom_num,balcony_num,furnished,age,total_floors,latitude,longitude
0,Octave Parijas Horizon,6600283,757,8719.0,Kalyan,Mumbai,Apartment,2,2,0,Unfurnished,0,1,19.24441,73.123253
1,Shakti Siyara Heights,6169841,652,9462.946319,Kalyan,Mumbai,Apartment,2,2,0,Unfurnished,0,1,19.257294,73.148872
2,Bhagwati Bhagwati Celeste,4599936,396,11616.0,Dombivali,Mumbai,Apartment,1,1,0,Unfurnished,0,1,19.209026,73.081276
3,Relcon Ridhi Sidhi Sadan Of Ridhi Sidhi Co Ope...,51980000,1130,46000.0,Ville Parle,Mumbai,Apartment,3,3,0,Unfurnished,0,1,19.097841,72.851158
4,J P Ruchita Bliss,3915000,435,9000.0,Nala Sopara,Mumbai,Apartment,1,1,0,Unfurnished,0,1,19.420601,72.809319


In [3]:

# Age-group adder transformer

class AgeGroupAdder(BaseEstimator, TransformerMixin):
    """Add a categorical ``age_group`` column derived from the ``age`` column.

    Buckets (upper bounds inclusive, checked in this order):

    * ``'below_15'``      -- age <= 15
    * ``'between_16_30'`` -- 15 < age <= 30
    * ``'between_31_45'`` -- 30 < age <= 45
    * ``'between_46_60'`` -- 45 < age <= 60
    * ``'unknown'``       -- anything else (age > 60, NaN, ...)

    The buckets are contiguous, so a fractional age such as 15.5 lands in a
    real bucket instead of falling through to ``'unknown'``. For whole-number
    ages the result is the same as with the integer ranges 16-30, 31-45, 46-60.
    """

    def __init__(self):
        # Ordered mapping of label -> predicate; the first predicate that
        # returns True decides the label.
        self.age_groups = {
            'below_15': self.below_15,
            'between_16_30': self.between_16_30,
            'between_31_45': self.between_31_45,
            'between_46_60': self.between_46_60,
        }

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with an added ``age_group`` column.

        Raises
        ------
        ValueError
            If ``X`` has no ``age`` column.
        """
        if 'age' not in X.columns:
            raise ValueError("The input dataframe must contain age column")
        # Copy so the caller's frame is left untouched.
        X = X.copy()
        X["age_group"] = X["age"].apply(self.get_age_groups)
        return X

    def get_age_groups(self, age):
        """Return the label of the first bucket matching ``age``, else ``'unknown'``."""
        for group, condition in self.age_groups.items():
            if condition(age):
                return group
        # Reached for ages above 60 and for NaN (every comparison is False).
        return 'unknown'

    # Individual bucket predicates. Lower bounds are exclusive on the previous
    # bucket's upper bound so that no value can fall between two buckets.
    def below_15(self, age):
        """True when ``age`` is at most 15."""
        return age <= 15

    def between_16_30(self, age):
        """True when ``age`` is above 15 and at most 30."""
        return 15 < age <= 30

    def between_31_45(self, age):
        """True when ``age`` is above 30 and at most 45."""
        return 30 < age <= 45

    def between_46_60(self, age):
        """True when ``age`` is above 45 and at most 60."""
        return 45 < age <= 60
    


# column dropper
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop : iterable of column labels
        Columns removed by :meth:`transform`; every one must be present.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Stateless: nothing to learn, so just hand back ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        Raises
        ------
        ValueError
            If any configured column is absent from ``X``.
        """
        available = X.columns
        absent = []
        for name in self.columns_to_drop:
            if name in available:
                continue
            absent.append(name)

        if len(absent) > 0:
            raise ValueError(f"The following columns are missing in the DataFrame: {absent}")

        return X.drop(columns=self.columns_to_drop)


# A custom transformer that adds a median price column to the data
class MedianPriceAdder(BaseEstimator, TransformerMixin):
    """Add a ``median_price`` column holding the per-group median of ``target_col``.

    Parameters
    ----------
    group_cols : str or list of str
        Column(s) that define a group. A single column name is accepted as-is.
    target_col : str
        Column whose median is computed for every group during :meth:`fit`.

    Attributes
    ----------
    median_map : pandas.DataFrame or None
        One row per group: the grouping columns plus ``median_price``
        (``None`` until :meth:`fit` is called).
    median_dict : dict
        Maps a tuple of group values (one entry per grouping column) to the
        integer median learned for that group. Set by :meth:`fit`.
    """

    def __init__(self, group_cols, target_col):
        self.group_cols = group_cols
        self.target_col = target_col
        self.median_map = None

    def _group_col_list(self):
        """Return ``group_cols`` as a list, accepting a bare column name."""
        if isinstance(self.group_cols, str):
            return [self.group_cols]
        return list(self.group_cols)

    def fit(self, X, y=None):
        """Learn the median of ``target_col`` for every group found in ``X``."""
        cols = self._group_col_list()

        medians = X.groupby(cols)[self.target_col].median()
        # A group whose target values are all missing has a NaN median; drop it
        # so the integer cast cannot fail. Such groups get NaN in transform().
        medians = medians.dropna().astype(int)

        self.median_map = medians.rename('median_price').reset_index()

        # Keys are always tuples, also for a single grouping column, so that
        # transform() can look every row up the same way.
        self.median_dict = {
            (key if isinstance(key, tuple) else (key,)): value
            for key, value in medians.to_dict().items()
        }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``median_price`` column added.

        Rows whose group was not seen during :meth:`fit` receive NaN.
        :meth:`fit` must have been called first.
        """
        cols = self._group_col_list()
        # Copy so the caller's frame is never modified.
        X = X.copy()

        missing = float("nan")
        # Zipping the key columns builds one tuple per row without the
        # overhead of a row-wise DataFrame.apply.
        row_keys = zip(*(X[col] for col in cols))
        X['median_price'] = [self.median_dict.get(key, missing) for key in row_keys]
        return X

In [4]:
# Draw one row to spot-check the transformer; the fixed seed makes the same
# row come back on every run.
sample = df.sample(1, random_state=42)
sample


Unnamed: 0,title,price,area,price_per_sqft,locality,city,property_type,bedroom_num,bathroom_num,balcony_num,furnished,age,total_floors,latitude,longitude
23865,Shivalik Bandra North Gulmohar Avenue,15700000,833,18847.539016,Bandra,Mumbai,Apartment,2,2,0,Unfurnished,1,1,19.070414,72.846077


In [5]:
# Learn the median price of every (locality, property_type) group.
# NOTE(review): this is fitted on the full frame, before the train/test split
# in a later cell, so the prices of the held-out rows contribute to the
# `median_price` feature. Consider fitting on the training rows only.
median_adder = MedianPriceAdder(group_cols=["locality", "property_type"], target_col="price")
median_adder.fit(df)


In [6]:
# Spot check: attach the learned group median to the single sampled row.
median_adder.transform(sample)

Unnamed: 0,title,price,area,price_per_sqft,locality,city,property_type,bedroom_num,bathroom_num,balcony_num,furnished,age,total_floors,latitude,longitude,median_price
23865,Shivalik Bandra North Gulmohar Avenue,15700000,833,18847.539016,Bandra,Mumbai,Apartment,2,2,0,Unfurnished,1,1,19.070414,72.846077,50000000


In [7]:
# Pass a copy so the raw `df` is never modified by the transform step and this
# cell can be re-run safely.
transformed_df = median_adder.transform(df.copy())
transformed_df.tail(2)

Unnamed: 0,title,price,area,price_per_sqft,locality,city,property_type,bedroom_num,bathroom_num,balcony_num,furnished,age,total_floors,latitude,longitude,median_price
71936,Alpine Primo,25500000,713,35764.375877,Andheri,Western Mumbai,Apartment,2,2,0,Furnished,0,1,19.12424,72.84276,22500000
71937,Alpine Primo,27384000,978,28000.0,Andheri,Western Mumbai,Apartment,3,3,0,Furnished,0,1,19.12424,72.84276,22500000


In [8]:
# median_price = df.groupby(["locality", "property_type"]).agg(
#     median_price = ("price", "median")
# ).astype(int)

# df = df.merge(median_price, on=["locality", "property_type"], how="left")

In [9]:
# Columns that must not reach the model: identifiers, the target itself, and
# columns derived from or replaced by other features.
non_feature_cols = ["title", "price", "locality", "longitude", "latitude", "price_per_sqft"]

# `Unnamed: 0` is the row index that was saved into the CSV. Left in, it would
# be fed to the model through the ColumnTransformer's remainder="passthrough".
# errors="ignore" keeps this working if the CSV is ever re-saved without it.
X = (
    transformed_df
    .drop(columns=non_feature_cols)
    .drop(columns=["Unnamed: 0"], errors="ignore")
)
y = transformed_df["price"]

In [10]:
# Categorical features are one-hot encoded; numeric features are standardised.
# `age_group` does not exist yet at this point: it is created by the
# AgeGroupAdder step that runs before this transformer in the pipeline.
cat_col = ['city', "property_type", "furnished", "age_group"]
num_col = ["bedroom_num", "area", "bathroom_num", "balcony_num", "total_floors", "median_price"]
transformer = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was not seen during fit
        # (e.g. an 'unknown' age_group) is encoded as all zeros instead of
        # raising an error at predict time.
        ("ohe", OneHotEncoder(drop="first", sparse_output=True, handle_unknown="ignore"), cat_col),
        ("scalar", StandardScaler(), num_col),
    ],
    remainder="passthrough",
)


In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn import set_config
# Ask scikit-learn to display estimators as diagrams in the notebook output.
set_config(display="diagram")



# End-to-end model: derive `age_group` from `age`, drop the raw `age` column,
# encode/scale the features, then fit a decision tree with a fixed seed.
pipeline = Pipeline([
    ('age_group_adder', AgeGroupAdder()),
    ('column_dropper', ColumnDropper(columns_to_drop=['age'])),
    ("preprocessor", transformer),
    ("regressor", DecisionTreeRegressor(random_state=42)),
])

In [13]:
# Fit every pipeline step on the training split.
pipeline.fit(X_train, y_train)

In [14]:
# Predict prices for the held-out rows; .astype(int) truncates them to integers.
y_pred = pipeline.predict(X_test).astype(int)

In [15]:
 # Sanity check: count predictions that are negative.
 np.count_nonzero(y_pred < 0)

0

In [16]:
# Score the fitted pipeline on the held-out split using the final estimator's
# default `score` metric.
pipeline.score(X_test, y_test)

0.7867068381377897

In [17]:
import joblib
# import os
# os.chdir('../models/')

In [18]:
# Persist the fitted pipeline and the fitted median-price transformer.
# NOTE(review): both objects contain classes defined in this notebook
# (AgeGroupAdder, ColumnDropper, MedianPriceAdder); presumably the code that
# loads these files must be able to resolve the same class definitions --
# confirm against the loader.
# NOTE(review): assumes the ../models/ directory already exists.
joblib.dump(pipeline, '../models/pipeline.pkl')
joblib.dump(median_adder, '../models/median_adder.pkl')
print('dumped')

dumped
