In [40]:
# import libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer # for multi-hot encoding

In [41]:
# load daataset after eda
india_housing_prices_df = pd.read_csv("../data/eda_data/india_housing_prices_eda.csv")
india_housing_prices_df.describe()

Unnamed: 0,ID,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,Floor_No,Total_Floors,Age_of_Property,Nearby_Schools,Nearby_Hospitals
count,230277.0,230277.0,230277.0,230277.0,230277.0,230277.0,230277.0,230277.0,230277.0,230277.0,230277.0
mean,125025.333121,2.998602,2913.324427,243.190549,0.1002,2006.522697,14.974218,15.493401,18.477303,5.498721,5.496116
std,72183.166351,1.415131,1221.722929,139.667429,0.072521,9.805546,8.949023,8.672969,9.805546,2.878392,2.872477
min,1.0,1.0,500.0,10.0,0.002022,1990.0,0.0,1.0,2.0,1.0,1.0
25%,62551.0,2.0,1890.0,123.21,0.044604,1998.0,7.0,8.0,10.0,3.0,3.0
50%,125002.0,3.0,2926.0,236.02,0.085469,2007.0,15.0,15.0,18.0,5.0,5.0
75%,187579.0,4.0,3965.0,361.21,0.136523,2015.0,23.0,23.0,27.0,8.0,8.0
max,250000.0,5.0,5000.0,500.0,0.32764,2023.0,30.0,30.0,35.0,10.0,10.0


In [42]:
# TODO: ADD MARKDOWN FILE => in eda part we have already checked for missing, duplicate values

In [43]:
# we are storing the freq encoding map for encoding the user's input through streamlit
state_freq_map = india_housing_prices_df['State'].value_counts(normalize=True).to_dict()
city_freq_map  = india_housing_prices_df['City'].value_counts(normalize=True).to_dict()

# encode State, City using Frequency Encoding
india_housing_prices_df['State_Freq'] = india_housing_prices_df['State'].map(state_freq_map)
india_housing_prices_df['City_Freq']  = india_housing_prices_df['City'].map(city_freq_map)

# encode 'Locality' column using target encoding
locality_target_mean_map = india_housing_prices_df.groupby(['State', 'City', 'Locality'])['Price_per_SqFt'].mean().to_dict()

# storing the locality mapping 
india_housing_prices_df = india_housing_prices_df

india_housing_prices_df = india_housing_prices_df.drop(['State', 'City', 'Locality'], axis=1)
display(india_housing_prices_df.head())

Unnamed: 0,ID,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,Furnished_Status,Floor_No,Total_Floors,Age_of_Property,Nearby_Schools,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Amenities,Facing,Owner_Type,Availability_Status,State_Freq,City_Freq
0,1,Apartment,1,4740,489.76,0.103325,1990,Furnished,22,1,35,10,3,High,No,No,"['Playground', 'Gym', 'Garden', 'Pool', 'Clubh...",West,Owner,Ready_to_Move,0.050387,0.024518
1,2,Independent House,3,2364,195.52,0.082707,2008,Unfurnished,21,20,17,8,1,Low,No,Yes,"['Playground', 'Clubhouse', 'Pool', 'Gym', 'Ga...",North,Builder,Under_Construction,0.050296,0.016867
2,3,Apartment,2,3642,183.79,0.050464,1997,Semi-furnished,19,27,28,9,8,Low,Yes,No,"['Clubhouse', 'Pool', 'Playground', 'Gym']",South,Broker,Ready_to_Move,0.050196,0.02527
3,4,Independent House,2,2741,300.29,0.109555,1991,Furnished,21,26,34,5,7,High,Yes,Yes,"['Playground', 'Clubhouse', 'Gym', 'Pool', 'Ga...",North,Builder,Ready_to_Move,0.049701,0.024766
4,5,Villa,4,4823,182.9,0.037922,2002,Semi-furnished,3,2,23,4,9,Low,No,Yes,"['Playground', 'Garden', 'Gym', 'Pool', 'Clubh...",East,Builder,Ready_to_Move,0.049701,0.024935


1. **`Locality_Target` already includes State and City info** because:
   - `locality_84` in Mumbai, MH â‰  `locality_84` in Bangalore, KA
   - The groupby uses all three levels: `['State', 'City', 'Locality']`

2. **Encoding State/City separately would be REDUNDANT**
   - You'd be giving the model the same geographic information twice
   - Wastes features and could cause multicollinearity

In [44]:
# Label encoding for remaining categorical columns 
label_encoding_cols = [
    'Property_Type', 'Furnished_Status', 'Public_Transport_Accessibility',
    'Parking_Space', 'Security', 'Facing', 'Owner_Type', 'Availability_Status'
]

le_dict = {}
for col in label_encoding_cols:
    le = LabelEncoder()
    india_housing_prices_df[col + '_Enc'] = le.fit_transform(india_housing_prices_df[col])
    le_dict[col] = le  # for later use during prediction/streamlit
    
india_housing_prices_df = india_housing_prices_df.drop(label_encoding_cols, axis=1)
display(india_housing_prices_df.head())

Unnamed: 0,ID,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,Floor_No,Total_Floors,Age_of_Property,Nearby_Schools,Nearby_Hospitals,Amenities,State_Freq,City_Freq,Property_Type_Enc,Furnished_Status_Enc,Public_Transport_Accessibility_Enc,Parking_Space_Enc,Security_Enc,Facing_Enc,Owner_Type_Enc,Availability_Status_Enc
0,1,1,4740,489.76,0.103325,1990,22,1,35,10,3,"['Playground', 'Gym', 'Garden', 'Pool', 'Clubh...",0.050387,0.024518,0,0,0,0,0,3,2,0
1,2,3,2364,195.52,0.082707,2008,21,20,17,8,1,"['Playground', 'Clubhouse', 'Pool', 'Gym', 'Ga...",0.050296,0.016867,1,2,1,0,1,1,1,1
2,3,2,3642,183.79,0.050464,1997,19,27,28,9,8,"['Clubhouse', 'Pool', 'Playground', 'Gym']",0.050196,0.02527,0,1,1,1,0,2,0,0
3,4,2,2741,300.29,0.109555,1991,21,26,34,5,7,"['Playground', 'Clubhouse', 'Gym', 'Pool', 'Ga...",0.049701,0.024766,1,0,0,1,1,1,1,0
4,5,4,4823,182.9,0.037922,2002,3,2,23,4,9,"['Playground', 'Garden', 'Gym', 'Pool', 'Clubh...",0.049701,0.024935,2,1,1,0,1,0,1,0


In [45]:
import re

def clean_column_name(name):
    """Remove special characters that XGBoost doesn't accept"""
    name = re.sub(r'[\[\]<>]', '', name)  # Remove brackets
    name = name.replace(' ', '_')          # Replace spaces
    name = re.sub(r'[^a-zA-Z0-9_]', '', name)  # Remove other special chars
    name = re.sub(r'_+', '_', name)        # Remove consecutive underscores
    return name.strip('_')

mlb = MultiLabelBinarizer()
amenities_encoded = mlb.fit_transform(india_housing_prices_df['Amenities'])

# Clean the column names!
amenities_df = pd.DataFrame(
    amenities_encoded, 
    columns=[f"Amenity_{clean_column_name(a)}" for a in mlb.classes_]
)

india_housing_prices_df = pd.concat([india_housing_prices_df, amenities_df], axis=1)
india_housing_prices_df = india_housing_prices_df.drop('Amenities', axis=1)

In [46]:
india_housing_prices_df.to_csv("../data/feature_engineered_data/india_housing_prices_feature_engineered.csv", index=False)