<a href="https://colab.research.google.com/github/Royarind/Machine_Learning_Projects/blob/main/Bangalore_House_Price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [190]:
## Dataset link : https://www.kaggle.com/datasets/amitabhajoy/bengaluru-house-price-data

In [191]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [192]:
df = pd.read_csv('Bengaluru_House_Data.csv')

In [193]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [194]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [195]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [196]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [197]:
df['area_type'].value_counts()

Unnamed: 0_level_0,count
area_type,Unnamed: 1_level_1
Super built-up Area,8790
Built-up Area,2418
Plot Area,2025
Carpet Area,87


In [198]:
df['size'].value_counts()

Unnamed: 0_level_0,count
size,Unnamed: 1_level_1
2 BHK,5199
3 BHK,4310
4 Bedroom,826
4 BHK,591
3 Bedroom,547
1 BHK,538
2 Bedroom,329
5 Bedroom,297
6 Bedroom,191
1 Bedroom,105


#Data Cleaning

In [199]:
# Bedroom has a lot of figures, we will take upto 6 bedroom.
# We will remove "Bedroom","BHK" etc.

In [200]:
df['size'] = df['size'].str.split(" ").str.get(0)

In [201]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2,,1200,2.0,1.0,51.0


In [202]:
df['size'] = pd.to_numeric(df['size'])

In [203]:
df = df[df['size'] < 6] # Upto 5 bed

In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12814 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     12814 non-null  object 
 1   availability  12814 non-null  object 
 2   location      12813 non-null  object 
 3   size          12814 non-null  float64
 4   society       7788 non-null   object 
 5   total_sqft    12814 non-null  object 
 6   bath          12757 non-null  float64
 7   balcony       12321 non-null  float64
 8   price         12814 non-null  float64
dtypes: float64(4), object(5)
memory usage: 1001.1+ KB


In [205]:
for col in df.columns:
    print(df[col].value_counts())
    print("*"*20)

area_type
Super built-up  Area    8740
Built-up  Area          2321
Plot  Area              1670
Carpet  Area              83
Name: count, dtype: int64
********************
availability
Ready To Move    10108
18-Dec             307
18-May             293
18-Apr             266
18-Aug             200
                 ...  
16-Oct               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 80, dtype: int64
********************
location
Whitefield           538
Sarjapur  Road       397
Electronic City      301
Kanakpura Road       273
Thanisandra          231
                    ... 
Park View Layout       1
Halanayakanahalli      1
Chikkanahalli          1
Rahmath Nagar          1
Gulakamale             1
Name: count, Length: 1243, dtype: int64
********************
size
2.0    5528
3.0    4857
4.0    1417
1.0     656
5.0     356
Name: count, dtype: int64
********************
society
GrrvaGr    80
PrarePa    76
Prtates  

In [206]:
df.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [207]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2.0,1056,2.0,39.07
1,Chikka Tirupathi,4.0,2600,5.0,120.0
2,Uttarahalli,3.0,1440,2.0,62.0
3,Lingadheeranahalli,3.0,1521,3.0,95.0
4,Kothanur,2.0,1200,2.0,51.0


In [208]:
df["total_price"] = df["price"] * 100000

In [209]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,total_price
0,Electronic City Phase II,2.0,1056,2.0,39.07,3907000.0
1,Chikka Tirupathi,4.0,2600,5.0,120.0,12000000.0
2,Uttarahalli,3.0,1440,2.0,62.0,6200000.0
3,Lingadheeranahalli,3.0,1521,3.0,95.0,9500000.0
4,Kothanur,2.0,1200,2.0,51.0,5100000.0


In [210]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12814 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   location     12813 non-null  object 
 1   size         12814 non-null  float64
 2   total_sqft   12814 non-null  object 
 3   bath         12757 non-null  float64
 4   price        12814 non-null  float64
 5   total_price  12814 non-null  float64
dtypes: float64(4), object(2)
memory usage: 700.8+ KB


In [211]:
df['location'].fillna('Sarjapur Road',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['location'].fillna('Sarjapur Road',inplace=True)


In [212]:
df['bath'].fillna(df['bath'].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bath'].fillna(df['bath'].median(),inplace=True)


In [213]:
# In total_sqft we have values like 1133 - 1384. we will split this value and take the average of two
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [214]:
  def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [215]:
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

In [216]:
df['total_sqft'].fillna(df['total_sqft'].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_sqft'].fillna(df['total_sqft'].median(),inplace=True)


In [217]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12814 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   location     12814 non-null  object 
 1   size         12814 non-null  float64
 2   total_sqft   12814 non-null  float64
 3   bath         12814 non-null  float64
 4   price        12814 non-null  float64
 5   total_price  12814 non-null  float64
dtypes: float64(5), object(1)
memory usage: 700.8+ KB


In [218]:
df['total_sqft'].unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [219]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,total_price
0,Electronic City Phase II,2.0,1056.0,2.0,39.07,3907000.0
1,Chikka Tirupathi,4.0,2600.0,5.0,120.0,12000000.0
2,Uttarahalli,3.0,1440.0,2.0,62.0,6200000.0
3,Lingadheeranahalli,3.0,1521.0,3.0,95.0,9500000.0
4,Kothanur,2.0,1200.0,2.0,51.0,5100000.0


In [220]:
df['price_per_sqft'] = df['total_price']/df['total_sqft']

In [221]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,total_price,price_per_sqft
0,Electronic City Phase II,2.0,1056.0,2.0,39.07,3907000.0,3699.810606
1,Chikka Tirupathi,4.0,2600.0,5.0,120.0,12000000.0,4615.384615
2,Uttarahalli,3.0,1440.0,2.0,62.0,6200000.0,4305.555556
3,Lingadheeranahalli,3.0,1521.0,3.0,95.0,9500000.0,6245.890861
4,Kothanur,2.0,1200.0,2.0,51.0,5100000.0,4250.0


In [222]:
df['location'] = df['location'].apply(lambda x: x.strip())
location_data = df['location'].value_counts()
location_data

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,539
Sarjapur Road,397
Electronic City,303
Kanakpura Road,273
Thanisandra,234
...,...
Sarvobhogam Nagar,1
Kanakapur main road,1
Sindhi Colony,1
Masjid e Alkareem,1


In [223]:
#Location count less than 10
location_less_than_10 = location_data[location_data<=10]

In [224]:
df['location'] = df['location'].apply(lambda x: "other" if x in location_less_than_10 else x)

In [225]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,total_price,price_per_sqft
0,Electronic City Phase II,2.0,1056.0,2.0,39.07,3907000.0,3699.810606
1,Chikka Tirupathi,4.0,2600.0,5.0,120.0,12000000.0,4615.384615
2,Uttarahalli,3.0,1440.0,2.0,62.0,6200000.0,4305.555556
3,Lingadheeranahalli,3.0,1521.0,3.0,95.0,9500000.0,6245.890861
4,Kothanur,2.0,1200.0,2.0,51.0,5100000.0,4250.0


In [226]:
column_trans = make_column_transformer((OneHotEncoder(sparse_output=False), ['location']),remainder='passthrough')

In [227]:
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [228]:
scaler = StandardScaler()

In [229]:
lr = LinearRegression()

In [230]:
pipe = make_pipeline(column_trans,scaler,lr)
pipe.fit(X_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [234]:
y_pred_lr = pipe.predict(X_test)

In [236]:
r2_score(y_test,y_pred_lr)

1.0