## DataFrame

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [4]:
# Load the raw listings into a DataFrame and preview the first rows.
# NOTE(review): absolute path under /content — presumably a Colab session upload;
# point this at your own copy of the file when running elsewhere.
df = pd.read_csv("/content/bengaluru_house_prices.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [6]:
# Discard the columns that are not used in the rest of the notebook.
unused_columns = ['area_type', 'society', 'balcony', 'availability']
df = df.drop(columns=unused_columns)
df.shape

(13320, 5)

In [7]:
df.isnull().sum()

Unnamed: 0,0
location,1
size,16
total_sqft,0
bath,73
price,0


In [8]:
# Drop every row that has at least one missing value, then re-check the null counts.
df = df.dropna()
df.isnull().sum()

Unnamed: 0,0
location,0
size,0
total_sqft,0
bath,0
price,0


In [9]:
df.shape

(13246, 5)

In [10]:
# Scale price by 100,000 — presumably lakhs -> rupees (TODO confirm the source unit).
# NOTE: not idempotent — re-running this cell multiplies the column again.
df['price'] = df['price']*100000

In [11]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,3907000.0
1,Chikka Tirupathi,4 Bedroom,2600,5.0,12000000.0
2,Uttarahalli,3 BHK,1440,2.0,6200000.0
3,Lingadheeranahalli,3 BHK,1521,3.0,9500000.0
4,Kothanur,2 BHK,1200,2.0,5100000.0


In [12]:
# bhk is the leading integer of the size label ("2 BHK" -> 2, "4 Bedroom" -> 4).
size_tokens = df['size'].str.split(' ')
df['bhk'] = size_tokens.str[0].astype('int64')
df.bhk.unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

In [13]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only conversion failures are treated as "not a float". Anything else
    (for example KeyboardInterrupt) propagates instead of being swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None; ValueError: unparsable
        # text such as "2100 - 2850"; OverflowError: int too large for a float.
        return False
    return True

In [14]:
df[~df['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,18600000.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,47700000.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,5400500.0,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,4349000.0,2
188,KR Puram,2 BHK,1015 - 1540,2.0,5680000.0,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,1850000.0,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,6377000.0,2
648,Arekere,9 Bedroom,4125Perch,9.0,26500000.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,4813000.0,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,44500000.0,4


In [15]:
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float, or None if it cannot be parsed.

    * ``"2100 - 2850"`` (a low-high range) -> the midpoint, ``2475.0``
    * ``"1056"`` / ``1056.0``              -> ``1056.0``
    * anything else, e.g. ``"34.46Sq. Meter"`` or ``"4125Perch"`` -> ``None``

    A value that merely contains a hyphen (``"-5"``, ``"1e-5"``, ``"Sq-ft"``)
    is not a range; it falls through to the plain conversion instead of raising.
    """
    bounds = str(x).split('-')
    if len(bounds) == 2:
        try:
            return (float(bounds[0]) + float(bounds[1])) / 2
        except ValueError:
            # Two pieces, but not two numbers: try the plain conversion below.
            pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

# Normalise total_sqft to floats; entries that cannot be parsed become missing
# and are removed. .copy() gives an independent frame, so adding a column below
# does not write onto a filtered slice of the previous frame.
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)
df = df[df['total_sqft'].notnull()].copy()

# Label a listing 'Villa' when it is both larger than 2200 sqft and has at
# least 4 bedrooms; everything else is an 'Apartment'. Vectorised equivalent
# of evaluating the same rule row by row.
is_villa = (df['total_sqft'] > 2200) & (df['bhk'] >= 4)
df['property_type'] = np.where(is_villa, 'Villa', 'Apartment')

In [16]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,property_type
0,Electronic City Phase II,2 BHK,1056.0,2.0,3907000.0,2,Apartment
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,12000000.0,4,Villa
2,Uttarahalli,3 BHK,1440.0,2.0,6200000.0,3,Apartment
3,Lingadheeranahalli,3 BHK,1521.0,3.0,9500000.0,3,Apartment
4,Kothanur,2 BHK,1200.0,2.0,5100000.0,2,Apartment


In [17]:
# Sanity check after the conversion: rows whose total_sqft still fails the
# float test. `is_float` is the helper defined earlier in the notebook; it is
# reused here rather than redefined, so there is a single definition to maintain.
df[~df['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,property_type


In [18]:
df.loc[30]

Unnamed: 0,30
location,Yelahanka
size,4 BHK
total_sqft,2475.0
bath,4.0
price,18600000.0
bhk,4
property_type,Villa


In [19]:
df.shape

(13200, 7)

In [20]:
# 'size' has been replaced by the numeric 'bhk' column. Rebind to the returned
# frame rather than mutating in place with inplace=True.
df = df.drop(columns=['size'])

In [21]:
df.shape

(13200, 6)

In [22]:
# Trim surrounding whitespace so identically named locations are counted together.
df['location'] = df['location'].str.strip()
# Listings per location, most frequent first (the default order of value_counts).
locs = df['location'].value_counts()
locs

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,533
Sarjapur Road,392
Electronic City,304
Kanakpura Road,264
Thanisandra,235
...,...
Zuzuvadi,1
Chellikere,1
Jakkasandra,1
Gulakamale,1


In [23]:
len(locs[locs<=10])

1047

In [24]:
# Locations with 10 or fewer listings; the Series index holds the location names.
locs_10 = locs[locs<=10]


In [25]:
# Collapse every rare location into a single 'other' bucket. Membership is
# tested against the index of locs_10, which holds the location names.
rare_location = df['location'].isin(locs_10.index)
df['location'] = df['location'].mask(rare_location, 'other')
len(df['location'].unique())

241

In [26]:
df.shape

(13200, 6)

Logic - A typical bedroom needs ~300 sq ft. If a house has total_sqft / bhk < 300, we remove it (fitting a 1 BHK in 200 sq ft or a 2 BHK in 300 sq ft is nonsensical).

In [27]:
# Remove rows with under 300 sqft per bedroom. Written as ~(ratio < 300) rather
# than ratio >= 300, so a row whose ratio is NaN would be kept, not dropped.
df = df[~(df.total_sqft/df.bhk<300)]
df.shape

(12456, 6)

Logic - Having a few more bathrooms than bedrooms is acceptable, but only while bathrooms < bedrooms + 2. A house with bedrooms + 2 or more bathrooms is treated as nonsensical and removed.

In [28]:
# Keep only rows with strictly fewer than bhk + 2 bathrooms.
df = df[df.bath<df.bhk+2]
df.shape

(12303, 6)

In [29]:
# amenities_score is a synthetic feature: a random integer in 7..9 for villas
# and in 5..9 for everything else (the upper bound 10 is exclusive).
# A seeded generator makes the saved CSV, and every metric computed from it,
# identical on each Restart & Run All.
# NOTE(review): the score is random noise apart from the villa/apartment
# split, so it carries no information beyond property_type.
rng = np.random.default_rng(42)
villa_mask = (df['property_type'] == 'Villa').to_numpy()
# assign() returns a new frame, so nothing is written onto a filtered slice.
df = df.assign(
    amenities_score=np.where(
        villa_mask,
        rng.integers(7, 10, size=len(df)),
        rng.integers(5, 10, size=len(df)),
    )
)

In [30]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,property_type,amenities_score
0,Electronic City Phase II,1056.0,2.0,3907000.0,2,Apartment,5
1,Chikka Tirupathi,2600.0,5.0,12000000.0,4,Villa,7
2,Uttarahalli,1440.0,2.0,6200000.0,3,Apartment,9
3,Lingadheeranahalli,1521.0,3.0,9500000.0,3,Apartment,5
4,Kothanur,1200.0,2.0,5100000.0,2,Apartment,6


In [31]:
# Save the cleaned frame for the modelling section; index=False keeps the row index out of the file.
df.to_csv('BangaloreDataMod.csv',index=False)

## MODELS

In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [33]:
# Reload the cleaned data written by the DataFrame section. The filename must
# be a string literal; unquoted, it is evaluated as a name and raises NameError.
df1 = pd.read_csv("BangaloreDataMod.csv")

In [34]:
df1.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,property_type,amenities_score
0,Electronic City Phase II,1056.0,2.0,3907000.0,2,Apartment,5
1,Chikka Tirupathi,2600.0,5.0,12000000.0,4,Villa,7
2,Uttarahalli,1440.0,2.0,6200000.0,3,Apartment,9
3,Lingadheeranahalli,1521.0,3.0,9500000.0,3,Apartment,5
4,Kothanur,1200.0,2.0,5100000.0,2,Apartment,6


In [35]:
# Model inputs (two categorical, four numeric columns) and the target.
feature_columns = [
    "location",
    "property_type",
    "total_sqft",
    "bath",
    "bhk",
    "amenities_score",
]
X = df1[feature_columns]

y = df1["price"]


In [36]:
# Columns are grouped by how they are fed to the model.
categorical_features = ["location", "property_type"]
numerical_features = [
    "total_sqft", "bath", "bhk", "amenities_score"
]

# One-hot encode the categorical columns and hand the numeric ones through
# unchanged. handle_unknown="ignore" lets transform accept a category that was
# not present at fit time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)


In [37]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [38]:
# Hyperparameters shared by every candidate; the three boosting models
# additionally share a learning rate.
tree_params = dict(n_estimators=250, max_depth=5, random_state=42)
boosting_params = dict(tree_params, learning_rate=0.05)

# Candidate regressors, keyed by the name used in the results table.
models = {
    "RandomForest": RandomForestRegressor(**tree_params),
    "GradientBoosting": GradientBoostingRegressor(**boosting_params),
    "XGBoost": XGBRegressor(**boosting_params),
    "LightGBM": LGBMRegressor(**boosting_params),
}


In [39]:
# Fit each candidate on the training split and score it on the held-out split.
results = []

for name, model in models.items():
    # Every candidate is wrapped with the same preprocessing step.
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append(dict(Model=name, RMSE=rmse, MAE=mae, R2=r2))

# One row per model.
results_df = pd.DataFrame(results)
print(results_df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 519
[LightGBM] [Info] Number of data points in the train set: 9842, number of used features: 121
[LightGBM] [Info] Start training from score 10744797.348100
              Model          RMSE           MAE        R2
0      RandomForest  9.606759e+06  3.434130e+06  0.538755
1  GradientBoosting  9.159379e+06  3.175038e+06  0.580714
2           XGBoost  8.579167e+06  3.121150e+06  0.632152
3          LightGBM  9.202139e+06  3.275122e+06  0.576790




In [40]:
# Pick the model with the highest held-out R2.
ranked_results = results_df.sort_values("R2", ascending=False)
best_model_name = ranked_results.iloc[0]["Model"]

# Refit the winner on ALL rows (train + test) for use as the final predictor.
# The metrics printed earlier were measured before this refit.
best_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", models[best_model_name])]
)
best_pipeline.fit(X, y)

print(f"Selected Model: {best_model_name}")


Selected Model: XGBoost


In [41]:
# Yield percentages for a handful of locations (hard-coded in this notebook).
# Any location not listed here falls back to DEFAULT_YIELD_PERCENT.
yield_lookup = {
    "Whitefield": 3.2,
    "Electronic City Phase II": 3.3,
    "Old Airport Road": 2.5,
    "Rajaji Nagar": 2.6,
    "Marathahalli": 3.4
}

DEFAULT_YIELD_PERCENT = 3.5


def predict_from_json(input_json, interval_margin=0.10):
    """Predict a price for one listing described by a dict.

    Parameters
    ----------
    input_json : dict
        One listing. It is turned into a single-row DataFrame and passed to
        ``best_pipeline.predict``. ``"total_sqft"`` and ``"location"`` are also
        read directly by this function.
    interval_margin : float, default 0.10
        Half-width of the reported band as a fraction of the prediction.
        The default reproduces the original fixed +/-10% band.

    Returns
    -------
    dict
        ``predicted_price`` and ``predicted_price_per_sqft`` (rounded to 2
        decimals), ``confidence_interval`` with ``lower``/``upper`` bounds
        (also rounded to 2 decimals), and ``estimated_yield_percent``.

    Notes
    -----
    * ``confidence_interval`` is a fixed percentage band around the point
      prediction, not a statistically derived interval.
    * ``total_sqft`` is used as a divisor and is not validated here; a value
      of 0 is not guarded against.
    """
    input_df = pd.DataFrame([input_json])

    predicted_price = float(best_pipeline.predict(input_df)[0])
    price_per_sqft = float(predicted_price / input_json["total_sqft"])

    # Round the bounds like the other monetary fields, so float artefacts such
    # as 13721533.100000001 do not leak into the response.
    confidence = {
        "lower": round(float(predicted_price * (1 - interval_margin)), 2),
        "upper": round(float(predicted_price * (1 + interval_margin)), 2)
    }

    yield_percent = float(
        yield_lookup.get(input_json["location"], DEFAULT_YIELD_PERCENT)
    )

    return {
        "predicted_price": round(predicted_price, 2),
        "predicted_price_per_sqft": round(price_per_sqft, 2),
        "confidence_interval": confidence,
        "estimated_yield_percent": yield_percent
    }


In [53]:
# Example request: one listing expressed as a dict, passed to predict_from_json.
# NOTE(review): property_type is "Villa" although a 1800 sqft / 3 bhk listing
# would be labelled 'Apartment' by the rule used to build the training data
# (> 2200 sqft and >= 4 bhk) — confirm this combination is intended.
input_json = {
    "location": "Hebbal",
    "total_sqft": 1800,
    "bhk": 3,
    "bath": 3,
    "property_type": "Villa",
    "amenities_score": 7,
}

output = predict_from_json(input_json)
print(output)


{'predicted_price': 12474121.0, 'predicted_price_per_sqft': 6930.07, 'confidence_interval': {'lower': 11226708.9, 'upper': 13721533.100000001}, 'estimated_yield_percent': 3.5}
