In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle
import json

In [6]:
# Load the raw dataset (BHP.csv) into the working frame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine where this exact file exists.
df1 = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Ragha\OneDrive\Desktop\My Python projects\ML projects\BHP\model\BHP.csv'
)

In [7]:
# Discard the columns that are not used as model features.
df1 = df1.drop(columns=['area_type', 'society', 'availability'])

In [8]:
# Impute 'balcony' and 'bath' with their respective medians, then discard every
# row that still contains a missing value and renumber the index from zero.
df1 = (
    df1
    .fillna({'balcony': df1['balcony'].median(), 'bath': df1['bath'].median()})
    .dropna()
    .reset_index(drop=True)
)

In [9]:
# Derive the integer 'bhk' feature from the first space-separated token of
# 'size', then retire the free-text 'size' column.
df1 = (
    df1
    .assign(bhk=df1['size'].map(lambda size: int(size.split(' ')[0])))
    .drop(columns='size')
)

In [10]:
def convert_sqft_to_num(x):
    """Convert one ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``'1200'``), or a range written as two numbers
        separated by a single hyphen (``'1000 - 1200'``).

    Returns
    -------
    float or None
        The parsed number, the midpoint of the range, or ``None`` when the
        entry cannot be interpreted as either (e.g. it carries a unit suffix).
        This function never raises for unparseable input, so it is safe to use
        with ``Series.apply`` / ``Series.map``.
    """
    if isinstance(x, str):
        parts = x.split('-')
        if len(parts) == 2:
            try:
                return (float(parts[0]) + float(parts[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. '-5', '1e-3', '10 - 20Sq. Yards');
                # fall through and try to read the whole entry as one number.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None

In [11]:
# Make total_sqft uniform: run every entry through convert_sqft_to_num so that
# range values are replaced by single float values.
df1['total_sqft'] = df1['total_sqft'].map(convert_sqft_to_num)

In [12]:
# Entries of total_sqft that were neither a number nor a range are now null;
# remove those rows and renumber the index from zero.
df1 = df1.dropna().reset_index(drop=True)

In [13]:
# New feature: price per square foot.
# NOTE(review): the 100000 factor presumably converts 'price' from lakhs to
# rupees; confirm the unit of the 'price' column.
df1['price_per_sqft'] = df1['price'].mul(100000).div(df1['total_sqft'])

In [14]:
# Count rows per location (most frequent first), pick out the locations with
# fewer than 10 rows, and relabel those locations as 'Other'.
location_stats = df1.groupby('location')['location'].count().sort_values(ascending=False)
location_states_less_than_10 = location_stats[location_stats.lt(10)]
df1['location'] = df1['location'].mask(
    df1['location'].isin(location_states_less_than_10.index), 'Other'
)

In [15]:
# Drop rows whose area per bhk is below 300 square feet. The test is written as
# "not (< 300)" so rows where the ratio is not comparable (NaN) are kept.
df1 = df1.loc[~df1['total_sqft'].div(df1['bhk']).lt(300)]

In [16]:
# Per location, keep only rows whose price_per_sqft lies in (m - st, m + st]
def remove_pps_outliers(df):
    """Drop per-location price-per-sqft outliers.

    For each ``location`` group, ``m`` is the mean and ``st`` the population
    standard deviation (``ddof=0``) of ``price_per_sqft``. A row is kept when
    ``m - st < price_per_sqft <= m + st``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped in ``groupby('location')`` order, with a
        fresh 0..n-1 index. An input with no groups yields an empty frame that
        keeps the input's columns.

    Notes
    -----
    NOTE(review): the lower bound is strict, so a location whose
    ``price_per_sqft`` values are all identical (``st == 0``) loses all of its
    rows. This matches the original filter; confirm it is intended.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        m = pps.mean()
        st = pps.std(ddof=0)  # population std, same as np.std on the column
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])
    if not kept:
        # pd.concat([]) raises; return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)

In [17]:
# Apply the per-location price_per_sqft outlier filter to the working frame.
df1 = df1.pipe(remove_pps_outliers)

In [18]:
# Within a location, remove n-BHK rows whose price_per_sqft is below the mean
# price_per_sqft of the (n-1)-BHK rows in that same location
def remove_bhk_outliers(df, min_count=5):
    """Drop rows priced below the average of the next-smaller bhk class.

    For every ``location`` and every ``bhk`` value ``n`` present there, rows
    with ``price_per_sqft`` strictly below the mean ``price_per_sqft`` of the
    ``n - 1`` bhk rows in the same location are removed. The rule is applied
    only when the ``n - 1`` class has more than ``min_count`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.
        Rows are removed by index label, so the index should be unique.
    min_count : int, default 5
        The ``n - 1`` class must have strictly more rows than this for its
        mean to be used as a threshold.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the excluded rows; the remaining index labels are
        unchanged (not renumbered).
    """
    exclude_labels = []  # plain list keeps the index labels' own dtype
    for _, location_df in df.groupby('location'):
        pps_by_bhk = {
            bhk: bhk_df['price_per_sqft']
            for bhk, bhk_df in location_df.groupby('bhk')
        }
        for bhk, pps in pps_by_bhk.items():
            smaller = pps_by_bhk.get(bhk - 1)
            if smaller is not None and len(smaller) > min_count:
                exclude_labels.extend(pps[pps < smaller.mean()].index)
    return df.drop(index=exclude_labels)

In [19]:
# Apply the bhk-based outlier filter, then renumber the index from zero.
df1 = remove_bhk_outliers(df1).reset_index(drop=True)

In [20]:
# Keep only rows with fewer than bhk + 2 bathrooms, then drop price_per_sqft
# (it was needed only for outlier detection) together with balcony.
df1 = (
    df1
    .loc[df1['bath'].lt(df1['bhk'].add(2))]
    .drop(columns=['price_per_sqft', 'balcony'])
)

In [21]:
# One-hot encode 'location'. The 'Other' indicator is dropped, so rows whose
# location is 'Other' are represented by all-zero indicator columns.
dummies = pd.get_dummies(df1['location']).drop(columns='Other')
df1 = pd.concat([df1, dummies], axis='columns').drop(columns='location')

In [22]:
# Target is the 'price' column; the feature matrix is every other column.
Y = df1['price']
X = df1.drop(columns=['price'])


In [23]:
# Ordinary least-squares model, fitted on an 80% training split (fixed seed so
# the split is repeatable).
model = LinearRegression()

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=963,
)
model.fit(x_train, y_train)

In [24]:
# Function to predict house price
def predict_price(location, total_sqft, bath, bhk):
    """Predict the price of one house with the fitted ``model``.

    Builds a single feature row with the same columns as the notebook-level
    ``X`` and returns ``model.predict`` for it.

    Parameters
    ----------
    location : str
        Location name. If it matches one of the one-hot location columns of
        ``X``, that indicator is set to 1. Any other value (including
        ``'Other'``, whose indicator column was dropped) leaves all location
        indicators at 0 instead of raising.
    total_sqft, bath, bhk : number
        Values for the ``total_sqft``, ``bath`` and ``bhk`` feature columns.

    Returns
    -------
    The first (only) element of ``model.predict`` for the constructed row, in
    the same unit as the ``price`` column the model was trained on.
    """
    numeric_features = {'total_sqft': total_sqft, 'bath': bath, 'bhk': bhk}

    # One all-zero row carrying X's column names, so the model receives the
    # same feature names it was fitted with.
    features = pd.DataFrame(0.0, index=[0], columns=X.columns)
    # Fill by column name rather than by position, so the result does not
    # depend on the column order of X.
    for name, value in numeric_features.items():
        features.loc[0, name] = value

    # Only a genuine one-hot location column may be switched on.
    if location in features.columns and location not in numeric_features:
        features.loc[0, location] = 1

    return model.predict(features)[0]


In [28]:
# Serialise the fitted model to disk.
# NOTE(review): absolute, machine-specific output path.
with open(
    r'C:\Users\Ragha\OneDrive\Desktop\My Python projects\ML projects\BHP\model\Bangalore_house_prices_model.pkl',
    'wb',
) as model_file:
    pickle.dump(model, model_file)

In [31]:
# Save the lower-cased feature column names, in X's column order, as JSON.
# NOTE(review): absolute, machine-specific output path.
columns = {'data_columns': list(map(str.lower, X.columns))}
with open(
    r'C:\Users\Ragha\OneDrive\Desktop\My Python projects\ML projects\BHP\model\columns.json',
    'w',
) as columns_file:
    columns_file.write(json.dumps(columns))