In [453]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load data from the correct path
df = pd.read_csv(r"C:\Users\PC\Desktop\Bengaluru_House_Data.csv")
df.head()

In [None]:
df.drop(columns = ['availability', 'society', 'balcony'], inplace = True)
df.head()

In [None]:
df['size'].unique()

In [None]:
df['bedroom'] = df['size'].str.split(' ', expand = True)[0]

In [None]:
df.dropna(inplace = True)

In [None]:
df['bedroom'] = df['bedroom'].astype(int)

In [None]:
df.info()

In [None]:
df.drop('size', axis = 'columns', inplace = True)

In [None]:
df[df['bedroom'] > 15]

In [None]:
def normalize(x):
    char = x.split('-')
    if len(char) == 2:
        return (float(char[0]) + float(char[1]))/2
    try:
        return float(x)
    except:
        return None

In [None]:
df['total_sqft'] = df['total_sqft'].apply(normalize)

In [None]:
df.dropna(inplace = True)

In [None]:
df1 = df.copy()
df1.head()
df.info()

In [None]:
loc_list = df1.groupby('location')['location'].count().sort_values(ascending = False)
loc_less_than_11 = loc_list[loc_list <= 10]
loc_list

In [None]:
df1['location'] = df1['location'].apply(lambda x: 'others' if x in loc_less_than_11 else x)
df1['location'].value_counts()

In [474]:
df1['price_per_sqft'] = (df['price']*100000/df['total_sqft']).round()
df2 = df1[~((df1['total_sqft']/df1['bedroom']) < 300)]
df2.shape

(12456, 7)

In [None]:
df2['price_per_sqft'].describe()

In [None]:
from pprint import pprint

In [None]:
def remove_outliers(df):
    final_df = pd.DataFrame()
    for key, subdf in df.groupby('location'):

        mean = subdf['price_per_sqft'].mean()
        std = subdf['price_per_sqft'].std()

        reduced_subdf = subdf[(subdf['price_per_sqft'] >= mean - std) & (subdf['price_per_sqft'] <= mean + std)]
        final_df = pd.concat([final_df, reduced_subdf], ignore_index = True)
        
    return final_df

In [None]:
df2 = remove_outliers(df1)
df2.shape

In [None]:
df2.head()

In [None]:
def remove_bedroom_outliers(df):
    outliers_indices = []
    for location, location_df in df.groupby('location'):
        stats = location_df.groupby('bedroom')['price_per_sqft'].agg(['mean', 'std', 'count'])
    
        for bedroom, bedroom_df in location_df.groupby('bedroom'):
            if bedroom-1 in stats.index and stats.loc[bedroom - 1, 'count']>5:
                outliers = bedroom_df[bedroom_df['price_per_sqft'] > stats.loc[bedroom-1, 'mean']]
                outliers_indices.extend(outliers.index.to_list())
                
    return df.drop(outliers_indices)

In [None]:
df3 = remove_bedroom_outliers(df2)
df3.drop(columns = ['price_per_sqft', 'area_type'], inplace = True)
df3.shape

In [None]:
df3.info()

## df4 -- OneHotEncoding

In [484]:
ohe = OneHotEncoder(sparse_output = False, drop = 'first')
encoded_features = ohe.fit_transform(df3[['location']])
encoded_features_df = pd.DataFrame(encoded_features, columns = ohe.get_feature_names_out(['location']), index = df3.index)
encoded_features_df.head()

Unnamed: 0,location_1st Block Jayanagar,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,location_5th Block Hbr Layout,location_5th Phase JP Nagar,location_6th Phase JP Nagar,location_7th Phase JP Nagar,location_8th Phase JP Nagar,location_9th Phase JP Nagar,...,location_Vishveshwarya Layout,location_Vishwapriya Layout,location_Vittasandra,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur,location_others,Unnamed: 22
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [447]:
df4 = pd.concat([encoded_features_df, df3.drop(columns = ['location'], axis = 'columns')], axis = 'columns')
df4.head()

Unnamed: 0,location_1st Block Jayanagar,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,location_5th Block Hbr Layout,location_5th Phase JP Nagar,location_6th Phase JP Nagar,location_7th Phase JP Nagar,location_8th Phase JP Nagar,location_9th Phase JP Nagar,...,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur,location_others,total_sqft,bath,price,bedroom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1250.0,2.0,44.0,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1250.0,2.0,40.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1200.0,2.0,83.0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1170.0,2.0,40.0,2
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,947.0,2.0,43.0,2


In [449]:
X = df4.drop(columns = ['price'], axis = 'columns')
y = df4['price']

In [450]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
X_test.shape

(1559, 243)

In [445]:
algorithms = {
    'linear_regression': {
        'model': LinearRegression(),
        'params': {
            'alpha': [1, 2],
            'selection': ['random','cyclic']
        }
    },
    'ridge': {
        'model': Ridge(),
        'params': {
            'alpha':[1, 2],
            'solver': ['auto', 'sag', 'cholesky']
        }
    }
}

scores = []
for name, model in algorithms.items():
    clf = GridSearchCV(
        model['model'],
        param_grid = model['params'],
        cv = 5,
        n_jobs = -1
    )
    clf.fit(X_train, y_train)
    scores.append(
        {
            'model': name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_
        }
    )
pd.DataFrame(scores, columns = ['model', 'best_score', 'best_params'])

Unnamed: 0,model,best_score,best_params
0,lasso,0.347106,"{'alpha': 1, 'selection': 'cyclic'}"
1,ridge,0.548276,"{'alpha': 1, 'solver': 'auto'}"


In [455]:
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

0.759768205700183

In [491]:
def predict_price(location, sqft, bath, bedroom):
    input_data = pd.DataFrame(columns = X.columns)
    
    if not location.startswith('location_'):
        location = f'location_{location}'
    
    input_data.loc[0] = 0
    
    input_data['total_sqft'] = sqft
    input_data['bath'] = bath
    input_data['bedroom'] = bedroom
    
    if location in input_data.columns:
        input_data[location] = 1
    else:
        print(f'Warning. location {location} not in training data. Defaulting to 0')
    
    return reg.predict(input_data)[0]

In [497]:
predict_price('1st Phase JP Nagar', 1000, 2, 2)

94.34775797771263

In [495]:
predict_price('Indira Nagar', 1000, 2, 2)

127.86401801240356

In [502]:
import os
import pickle
import json

with open('bengaluru_prediction_model.pkl', 'wb') as f:
    pickle.dump(reg, f)

In [505]:
columns = {
    'data_columns': [col.lower() for col in X.columns]
}

with open('columns.json', 'w') as f:
    f.write(json.dumps(columns))