# This notebook was made by: https://github.com/Strikoder

# Imports and initializations

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
#Libraries - basic
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
matplotlib.rcParams["figure.figsize"] = (20,10)
import seaborn as sns
import missingno as msno #Heatmap
import warnings
warnings.filterwarnings('ignore')

#Scikitlearn
import sklearn
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


#ANN
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU,PReLU,ELU
from keras.layers import Dropout

import pickle
pd.set_option('display.max_rows', None)

In [None]:
# Load the raw housing data from the Kaggle input dataset (path is Kaggle-specific).
prices_df=pd.read_csv('/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv')
# Preview the first rows.
prices_df.head()

# Data preprocessing

In [None]:
# Print the frame's shape, the describe() summary statistics and the column index.
print(f'shape: {prices_df.shape}\n\ndescription: {prices_df.describe()}\n\n columns:\n{prices_df.columns}')

In [None]:
# Select only the columns that contain at least one null, then show how many
# NON-null values each of them has (count() skips nulls).
prices_df.loc[:, prices_df.isnull().any()].count()

In [None]:
# Visualise missing cells: isnull() yields a boolean frame, drawn as a heatmap
# with row labels and the colour bar hidden.
sns.heatmap(prices_df.isnull(),yticklabels=False,cbar=False,cmap='YlGnBu')

* Dropping less important features

In [None]:
# Remove the columns that are not used as features in the rest of the notebook.
unused_columns = ["area_type", "society", "balcony", "availability"]
prices_df = prices_df.drop(columns=unused_columns)

In [None]:
# (rows, columns) after dropping the unused columns.
prices_df.shape

In [None]:
# Non-null value counts for the remaining columns that still contain nulls.
prices_df.loc[:, prices_df.isnull().any()].count()

In [None]:
# Drop every row that still has a missing value in any remaining column.
prices_df = prices_df.dropna()
# isnull().sum().sum() is the total number of missing cells. The previous
# expression printed count() (non-null counts) of the columns containing
# nulls, which is an empty Series after dropna() rather than a null count.
print(f'NULL values: {prices_df.isnull().sum().sum()}, shape: {prices_df.shape}')


* Feature Engineering

In [None]:
# Inspect the distinct raw values of the `size` column.
prices_df["size"].unique()

We notice that each `size` value combines a number and a text label separated by a space, so we use the split function to keep only the leading number.

In [None]:
# BHK stands for "Bedrooms / Hall / Kitchen"; the first space-separated
# token of `size` is the number we keep.
leading_token = prices_df["size"].str.split(" ").str[0]
prices_df['BHK'] = leading_token.map(int)

In [None]:
# Show the frame with the new BHK column and the sorted distinct BHK values.
print(prices_df.head())
print(f'Numbers of unique BHK: {sorted(prices_df.BHK.unique())}')

* Exploring the non-numerical data with a helper function that returns True for values convertible to float; negating its result with the "~" operator lets us list all the non-float values.

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted by ``float()``, otherwise False.

    Only the errors ``float()`` raises for unconvertible input are treated as
    "not a float"; anything else (e.g. KeyboardInterrupt) propagates instead
    of being swallowed by a bare ``except``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Show the first 10 rows whose total_sqft cannot be parsed by float().
prices_df[~prices_df['total_sqft'].apply(is_float)].head(10)

In [None]:
def convert_sqft_to_number(x):
    """Convert one raw ``total_sqft`` entry to a float.

    * A plain number (string or numeric) is returned as ``float(x)``.
    * A range written as ``"low-high"`` is returned as the midpoint of the
      two bounds.
    * Anything else (including ``None``) yields ``None``.

    The function never raises for unparseable input, so it is safe to use
    with ``Series.apply``.
    """
    # Try the plain-number case first so values such as "1e-5" are not
    # mistaken for a range because they contain "-".
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    if not isinstance(x, str):
        return None

    tokens = x.split("-")
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Two "-"-separated parts that are not both numbers.
            return None
    return None

In [None]:
# Replace the raw total_sqft values with their numeric conversion;
# entries the converter cannot parse come back as None.
prices_df["total_sqft"] = prices_df["total_sqft"].apply(convert_sqft_to_number)

In [None]:
# Print the rows labelled 30 and 410 to check the conversion; they held
# non-float total_sqft values (mentioned above).
print(f'{prices_df.loc[30]}, {prices_df.loc[410]}')

In [None]:
# Create a price-per-square-foot feature expressed in rupees: `price` is
# multiplied by 100000 (the scatter plots further down label price in lakh
# rupees) and divided by the area.
# This feature gives a wider view of the outliers; rows whose total_sqft
# could not be converted end up with a missing value here.
prices_df["price_per_sqft"] = prices_df["price"]*100000/prices_df["total_sqft"]
prices_df.head()

* Using dimension reduction

In [None]:
# Trim leading/trailing whitespace from every location name, then count the
# listings per location (most frequent first).
prices_df['location'] = prices_df['location'].str.strip()
location_stats = prices_df['location'].value_counts()
print(location_stats)

In [None]:
# Number of distinct locations.
print(f'The length of the location_stats feature is: {len(location_stats)} ')

In [None]:
# How many locations have 10 or fewer listings.
len(location_stats[location_stats<=10])

In [None]:
# Locations with 10 or fewer listings (despite the name, the bound is
# inclusive: counts equal to 10 are included).
location_stats_less_than_10 = location_stats[location_stats<=10]
print(location_stats_less_than_10)

In [None]:
# Collapse every rarely-seen location into a single 'other' bucket and show
# how many distinct location values remain.
rare_locations = location_stats_less_than_10.index
is_rare = prices_df['location'].isin(rare_locations)
prices_df['location'] = prices_df['location'].mask(is_rare, 'other')
prices_df['location'].nunique()

In [None]:
# Preview the frame after bucketing the rare locations.
prices_df.head()

* Outliers removal utilizing business principles 

In [None]:
# Business rule: listings offering less than 300 sqft per BHK are treated as
# outliers. Show a sample and the number of offending rows...
too_small = (prices_df['total_sqft']/prices_df['BHK'])<300
print(f"{prices_df[too_small].head()}\n\n{len(prices_df[too_small])}")
# ...and actually remove them; previously this cell only printed the rows,
# so the rule was never applied to the data.
prices_df = prices_df[~too_small]


In [None]:
# Summary statistics of price_per_sqft: through this feature we observe
# that the max value is very high.
print(prices_df['price_per_sqft'].describe())
print(f'\n\n{prices_df.shape}')

In [None]:
def remove_outliers(df):
    """Keep, per location, only rows with a typical ``price_per_sqft``.

    For every ``location`` group, the mean and standard deviation of
    ``price_per_sqft`` are computed and only the rows inside the half-open
    interval ``(mean - std, mean + std]`` are kept. Rows whose
    ``price_per_sqft`` is missing fail both comparisons and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of all groups with a fresh 0..n-1 index, or an
        empty frame when ``df`` has no groups.
    """
    kept_groups = []
    for _, sub_df in df.groupby('location'):
        price = sub_df['price_per_sqft']
        mean = np.mean(price)
        std = np.std(price)
        kept_groups.append(sub_df[(price > (mean - std)) & (price <= (mean + std))])

    if not kept_groups:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)
# Apply the per-location filter and show the resulting (rows, columns).
prices_df = remove_outliers(prices_df)
prices_df.shape

Here we sanity-check the prices: for instance, whether a house with 2 BHK is priced higher than a house with 3 BHK in the same location.

In [None]:
def plot_scatter_chart(df,location):
    """Scatter price against total area for the 2- and 3-BHK listings of one location.

    2-BHK listings are drawn in blue and 3-BHK listings as green '+' markers.
    The global matplotlib figure size is set to (15, 10) as a side effect.
    """
    in_location = df.location == location
    two_bhk = df[in_location & (df.BHK == 2)]
    three_bhk = df[in_location & (df.BHK == 3)]

    matplotlib.rcParams['figure.figsize'] = (15,10)
    plt.scatter(two_bhk.total_sqft, two_bhk.price, color='blue', label='2 BHK', s=50)
    plt.scatter(three_bhk.total_sqft, three_bhk.price, marker='+', color='green', label='3 BHK', s=50)

    plt.title(location)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.legend()

# Compare 2- and 3-BHK prices in Rajaji Nagar.
plot_scatter_chart(prices_df,"Rajaji Nagar")

In [None]:
# Same comparison for Hebbal.
plot_scatter_chart(prices_df,"Hebbal")

In [None]:
# This function removes the BHK houses if they are below BHK-1 mean 
def remove_bhk_outliers(df):
    exclude_indices = np.array([])

    for location, location_df in df.groupby('location'):
        bhk_stats = {}

        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = \
            {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }

        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)

    return df.drop(exclude_indices,axis='index')

In [None]:
print(prices_df.shape)
prices_df=remove_outliers(prices_df)
print(prices_df.shape)

In [None]:
# Re-plot Hebbal after the outlier filtering, for comparison with the chart above.
plot_scatter_chart(prices_df,"Hebbal")

From the chart above we observe that some of the outliers have been removed.

In [None]:
# Restore the larger default figure size (plot_scatter_chart changed it),
# then plot the distribution of price per square foot.
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(prices_df['price_per_sqft'],rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

In [None]:
# List the distinct bathroom counts, sorted, to look for implausible values.
print(sorted(prices_df['bath'].unique()))

It's uncommon for a house to have two or more bathrooms than it has bedrooms, so we keep only the listings where `bath < BHK + 2`.

In [None]:
# Keep only listings with fewer than BHK + 2 bathrooms; the shape is printed
# before and after to show how many rows were removed.
print(prices_df.shape)
prices_df=prices_df[prices_df['bath']<prices_df['BHK']+2]
print(prices_df.shape)

In [None]:
# `size` has been replaced by the numeric BHK column and `price_per_sqft`
# (derived from the target `price`) was only needed for outlier removal.
# Rebind instead of dropping in place: prices_df is the result of a boolean
# filter, where in-place mutation triggers pandas' SettingWithCopy warning.
prices_df = prices_df.drop(columns=['size', 'price_per_sqft'])
prices_df.head()

In [None]:
def onehot_encoding(final_df,cat_columns):
    """Return a copy of ``final_df`` with ``cat_columns`` one-hot encoded.

    Each listed column is replaced by its ``pd.get_dummies`` indicator
    columns, appended after the remaining columns in the order given.
    ``drop_first=True`` is used, so the first level of every encoded column
    gets no indicator column and is represented by all indicators being 0.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Input frame; it is NOT modified.
    cat_columns : iterable of str
        Names of the categorical columns to encode. May be empty, in which
        case a copy of ``final_df`` is returned.

    Returns
    -------
    pandas.DataFrame
        Non-encoded columns followed by the indicator columns, same index.
    """
    cat_columns = list(cat_columns)
    dummy_frames = []
    for column in cat_columns:
        print(column)
        dummy_frames.append(pd.get_dummies(final_df[column], drop_first=True))

    # drop() returns a new frame, so the caller's frame is left untouched.
    remaining = final_df.drop(columns=cat_columns)
    return pd.concat([remaining, *dummy_frames], axis=1)

In [None]:
# Replace the categorical `location` column with its one-hot indicator columns.
prices_df=onehot_encoding(prices_df,['location'])

In [None]:
# columns.duplicated() flags repeated column names; keep only the first occurrence of each.
prices_df=prices_df.loc[:,~prices_df.columns.duplicated()] #Remove duplicated columns

In [None]:
# Preview the fully numeric frame.
prices_df.head()

In [None]:
# Features: every column except the target; target: `price`.
X = prices_df.drop(['price'],axis='columns')
y = prices_df['price']

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# (training rows, number of features).
X_train.shape

# ML

In [None]:
# Candidate regressors and, for each, the hyper-parameter values to sample from.
model_params = {
    'xgb_regressor' : {
        'model': XGBRegressor(),
        'params':{
            'objective':['reg:squarederror'],
            'n_estimators': [100, 900, 1500],
            'max_depth':[2, 3, 5, 10, 15],
            'learning_rate':[0.025,0.1,0.15,0.20],
            'min_child_weight':[1,2,3,4],
            'booster':['gbtree','gblinear'],
            'base_score':[0.25,0.5,0.75,1]
        }
    },
    'random_forest': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [10, 50, 100],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'linear_regression': {
        'model': LinearRegression(),
        'params': {
            # `normalize` was removed from LinearRegression in scikit-learn 1.2,
            # which made every candidate of this model fail to fit.
            'fit_intercept': [True, False]
        }
    },
    'decision_tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            # 'mse' was renamed to 'squared_error' (old name removed in scikit-learn 1.2).
            'criterion': ['squared_error', 'friedman_mse'],
            'splitter': ['best', 'random'],
            'max_depth': [None, 10, 20]
        }
    },
    'lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [1,2],
            'selection': ['random', 'cyclic']
        }
    }
}

# Run a randomized search with 5-fold cross-validation for each model and
# record its best score and parameters.
best_scores = []
for model_name, mp in model_params.items():
    clf = RandomizedSearchCV(mp['model'], mp['params'], cv=5, random_state=42)
    clf.fit(X_train, y_train)
    best_scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# Find the best algorithm: the one with the highest cross-validated score.
best_entry = max(best_scores, key=lambda x: x['best_score'])
best_model_name = best_entry['model']
print("Best algorithm:", best_model_name)

# Get the best model object and its parameters
best_model = model_params[best_model_name]['model']
best_params = best_entry['best_params']

# Put the best algorithm, configured with its best parameters, at the end of
# an impute -> scale -> model pipeline and fit it on the training split.
steps = [('imputer', SimpleImputer()),
         ('scaler', StandardScaler()),
         (best_model_name, best_model.set_params(**best_params))]

pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)

# Evaluate on the held-out split. Both metrics are reported for every model
# so results are comparable whichever algorithm wins.
predictions = pipeline.predict(X_test)
print("MSE:", mean_squared_error(y_test, predictions))
print("Score:", pipeline.score(X_test, y_test))

# Summary of the search results for all models.
pd.DataFrame(best_scores,columns=['model','best_score','best_params'])

# AI

In [None]:
# Initialising the Artificial Neural Network. The variable keeps the name
# `classifier` because later cells refer to it, but the network is a
# regressor: one linear output unit trained with mean squared error.
classifier = Sequential()

# Take the input width from the data instead of hard-coding 244, so the cell
# keeps working when the number of one-hot columns changes.
n_features = X_train.shape[1]

# Adding the input layer and the first hidden layer
classifier.add(Dense(50, kernel_initializer = 'he_uniform',activation='relu',input_dim = n_features))

# Adding the second hidden layer
classifier.add(Dense(50, kernel_initializer = 'he_uniform',activation='relu'))

# Adding the third hidden layer
classifier.add(Dense(50, kernel_initializer = 'he_uniform',activation='relu'))
# Adding the output layer
classifier.add(Dense(1, kernel_initializer = 'he_uniform'))

# Compiling the ANN
classifier.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='Adamax')

# Fitting the ANN to the Training set. Cast to float32 explicitly: with
# boolean indicator columns `.values` is an object array, which Keras cannot
# convert to a tensor.
model_history=classifier.fit(X_train.values.astype('float32'), y_train.values.astype('float32'),validation_split=0.20, batch_size = 10, epochs = 500)

In [None]:
def predict_price(classifier, location, sqft, bath, bhk, columns=None):
    """Predict the price of a single house with a fitted model.

    Parameters
    ----------
    classifier : object
        Any fitted model exposing ``predict(2-D array)``.
    location : str
        Location name. If it matches a feature column, that indicator is set
        to 1. Otherwise all location indicators stay 0; this covers the
        location that has no indicator column because of ``drop_first=True``,
        and also any unknown name (which previously raised ``IndexError``).
    sqft, bath, bhk : number
        Written to feature positions 0, 1 and 2 respectively.
    columns : sequence of str, optional
        Feature names in training order. Defaults to the notebook-level
        ``X.columns``.

    Returns
    -------
    The first element of ``classifier.predict`` for the single input row.
    """
    if columns is None:
        columns = X.columns
    columns = list(columns)

    x = np.zeros(len(columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if location in columns:
        x[columns.index(location)] = 1

    # Reshape the array to (1, len(columns)): one sample, all features.
    x = x.reshape(1, -1)

    return classifier.predict(x)[0]

In [None]:
# Compare both models on a 1000 sqft, 2 bath, 2 BHK house in 1st Phase JP Nagar.
print(f'Best_algo prediciton:{predict_price(pipeline,"1st Phase JP Nagar",1000, 2, 2)}')
print(f'ANN prediciton:{predict_price(classifier,"1st Phase JP Nagar",1000, 2, 2)}')

In [None]:
# Compare both models on a 1000 sqft, 3 bath, 3 BHK house in Indira Nagar.
print(f'Best_algo prediciton:{predict_price(pipeline,"Indira Nagar",1000, 3, 3)}')
print(f'ANN prediciton:{predict_price(classifier,"Indira Nagar",1000, 3, 3)}')

The ANN output seems more reasonable, so we are going to export it.

# Exporting

In [None]:
# Serialise the trained network to disk with pickle.
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras version; the model's own save() method is the usual export route.
# Confirm what the consumer of this file expects before changing the format.
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(classifier,f)

In [None]:
import json

# Store the feature names (lower-cased, in training order) next to the model
# so a consumer can rebuild the input vector.
feature_names = [name.lower() for name in X.columns]
columns = {'data_columns': feature_names}
with open("columns.json", "w") as out_file:
    json.dump(columns, out_file)

# Resources: Codebasics end-to-end tutorial