In [2]:
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [8]:
# Load the raw listings, drop exact duplicate rows, lower-case the column
# names, and discard the two columns that are not used downstream
# (`price_per_sqft` is recomputed in the feature-engineering cell).
df = (
    pd.read_csv("india_housing_prices.csv")
    .drop_duplicates()
    .rename(columns=str.lower)
    .drop(columns=["id", "price_per_sqft"])
)
df.head()

Unnamed: 0,state,city,locality,property_type,bhk,size_in_sqft,price_in_lakhs,year_built,furnished_status,floor_no,...,age_of_property,nearby_schools,nearby_hospitals,public_transport_accessibility,parking_space,security,amenities,facing,owner_type,availability_status
0,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,1990,Furnished,22,...,35,10,3,High,No,No,"Playground, Gym, Garden, Pool, Clubhouse",West,Owner,Ready_to_Move
1,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,2008,Unfurnished,21,...,17,8,1,Low,No,Yes,"Playground, Clubhouse, Pool, Gym, Garden",North,Builder,Under_Construction
2,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,1997,Semi-furnished,19,...,28,9,8,Low,Yes,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_to_Move
3,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,1991,Furnished,21,...,34,5,7,High,Yes,Yes,"Playground, Clubhouse, Gym, Pool, Garden",North,Builder,Ready_to_Move
4,Rajasthan,Jaipur,Locality_466,Villa,4,4823,182.9,2002,Semi-furnished,3,...,23,4,9,Low,No,Yes,"Playground, Garden, Gym, Pool, Clubhouse",East,Builder,Ready_to_Move


In [13]:
# Feature engineering
#
# Adds price-per-sqft, property age, the per-city median price-per-sqft and a
# relative price index, then derives the binary `good_investment` target.

# Label thresholds: a listing counts as a good investment when it is priced
# below its city's median price-per-sqft and is at most this many years old.
MAX_RELATIVE_PRICE_INDEX = 1
MAX_PROPERTY_AGE_YEARS = 10

# A zero area would make price-per-sqft infinite; treat it as missing so the
# row is removed by the dropna below.
df["size_in_sqft"] = df["size_in_sqft"].replace(0, np.nan)
# .copy() gives an independent frame, so the column assignments below cannot
# raise pandas' SettingWithCopyWarning when dropna actually removes rows.
df = df.dropna(subset=["size_in_sqft", "price_in_lakhs", "year_built"]).copy()

# Convert lakhs to base currency units (1 lakh = 100,000) before dividing by area.
df["price_per_sqft"] = df["price_in_lakhs"] * 100000 / df["size_in_sqft"]

# NOTE(review): the age is relative to the wall-clock year, so this column --
# and therefore the target -- changes when the notebook is re-run in a later
# year. Consider pinning a reference year in a config cell.
current_year = datetime.now().year
df["property_age"] = current_year - df["year_built"]

# NOTE(review): the median is taken over the whole frame, i.e. before the
# train/test split, so held-out rows contribute to this statistic.
df["city_median_ppsf"] = df.groupby("city")["price_per_sqft"].transform("median")

# Values below 1 mean the listing is cheaper per sqft than its city's median.
df["relative_price_index"] = df["price_per_sqft"] / df["city_median_ppsf"]

# Target label
df["good_investment"] = (
    (df["relative_price_index"] < MAX_RELATIVE_PRICE_INDEX)
    & (df["property_age"] <= MAX_PROPERTY_AGE_YEARS)
).astype(int)

df.head(3)

Unnamed: 0,state,city,locality,property_type,bhk,size_in_sqft,price_in_lakhs,year_built,furnished_status,floor_no,...,security,amenities,facing,owner_type,availability_status,price_per_sqft,property_age,city_median_ppsf,relative_price_index,good_investment
0,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,1990,Furnished,22,...,No,"Playground, Gym, Garden, Pool, Clubhouse",West,Owner,Ready_to_Move,10332.489451,36,9328.557474,1.107619,0
1,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,2008,Unfurnished,21,...,Yes,"Playground, Clubhouse, Pool, Gym, Garden",North,Builder,Under_Construction,8270.72758,18,9538.744119,0.867067,0
2,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,1997,Semi-furnished,19,...,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_to_Move,5046.403075,29,9154.488518,0.551249,0


In [16]:
# Numeric features, standardised by the preprocessing step.
# NOTE(review): `good_investment` is computed directly from
# `relative_price_index` and `property_age`, so keeping those two columns here
# leaks the target into the features. TODO: confirm whether they should be
# dropped before trusting any evaluation metric.
num_cols = [
    "bhk",
    "size_in_sqft",
    "price_in_lakhs",
    "price_per_sqft",
    "property_age",
    "relative_price_index",
]

# Categorical features, one-hot encoded by the preprocessing step.
cat_cols = [
    "state",
    "city",
    "property_type",
    "furnished_status",
    "facing",
    "owner_type",
    "availability_status",
]

# Cardinality check: shows how many one-hot columns each feature will produce.
df[cat_cols].nunique()

state                  20
city                   42
property_type           3
furnished_status        3
facing                  4
owner_type              3
availability_status     2
dtype: int64

In [None]:
# Train-test split

# Select the feature columns from the frame. Passing the bare list
# `num_cols + cat_cols` would hand train_test_split 13 column-name strings
# instead of one row per sample, which fails its consistent-length check
# against `y`.
X = df[num_cols + cat_cols]
y = df["good_investment"]

# Stratify on the target so the train and test sets keep the same class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Column-wise preprocessing: standardise the numeric features and one-hot
# encode the categorical ones. Categories not seen during fit are ignored at
# transform time rather than raising.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=True),
    cat_cols,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Models listed for this study: Logistic Regression, Decision Trees, Support
# Vector Machines (SVM), K-Nearest Neighbors (KNN), and Naive Bayes.
# Only the logistic-regression pipeline is built in this cell.
#
# The preprocessing lives inside the pipeline so the scaler and encoder are
# fitted only on the data passed to `pipe.fit`.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        (
            "clf",
            LogisticRegression(
                max_iter=1000,
                # Reweight classes inversely to their frequency in the training data.
                class_weight="balanced",
            ),
        ),
    ]
)