**Importing required libraries, reading the data, and inspecting it with pandas**

In [283]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [284]:
# pip install matplotlib

In [285]:
# Read the Bengaluru housing CSV from the Colab upload location and preview it.
csv_path = "/content/Bengaluru_House_Data.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [286]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(13320, 9)

In [287]:
# Number of missing values in each column.
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [288]:
# Remove the rows that have no `size` or no `location`; only a handful of
# rows are affected, so dropping them loses little data.
df = df.dropna(subset=['size', 'location'])
df.isna().sum()

area_type          0
availability       0
location           0
size               0
society         5499
total_sqft         0
bath              57
balcony          593
price              0
dtype: int64

In [None]:
# Frequency of each society name, most common first (the default ordering).
df['society'].value_counts()

**Filling missing values in society column**

The society column has a large number of missing values, so rather than dropping those rows I imputed them with the most frequent value in the column.

In [290]:
# Impute missing `society` values with the column's most frequent value.
# The value is computed from the data instead of being hard-coded, so the
# cell stays correct if the CSV changes. `mode()` returns its candidates
# sorted, so a tie resolves deterministically to the first one.
most_common_society = df['society'].mode().iloc[0]
df['society'] = df['society'].fillna(most_common_society)
df.isna().sum()

area_type         0
availability      0
location          0
size              0
society           0
total_sqft        0
bath             57
balcony         593
price             0
dtype: int64

In [291]:
# Dtypes and non-null counts per column; `total_sqft` and `size` are still object dtype here.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13303 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13303 non-null  object 
 1   availability  13303 non-null  object 
 2   location      13303 non-null  object 
 3   size          13303 non-null  object 
 4   society       13303 non-null  object 
 5   total_sqft    13303 non-null  object 
 6   bath          13246 non-null  float64
 7   balcony       12710 non-null  float64
 8   price         13303 non-null  float64
dtypes: float64(3), object(6)
memory usage: 1.0+ MB


In [292]:
# Summary statistics for the numeric columns (bath, balcony, price).
df.describe()

Unnamed: 0,bath,balcony,price
count,13246.0,12710.0,13303.0
mean,2.692586,1.584343,112.584033
std,1.341506,0.817287,148.99382
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


**Filling missing values in balcony and bath using mean of the columns**

In [293]:
# Fill missing `balcony` and `bath` values with each column's mean.
# The result is assigned back to the column: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary Series, which
# triggers a chained-assignment warning in pandas 2.x and leaves `df`
# unchanged under Copy-on-Write.
for column in ('balcony', 'bath'):
    df[column] = df[column].fillna(df[column].mean())
df.isna().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [294]:
# Lift the cap on displayed rows; this is a global option, so it also applies
# to every cell that runs after this one.
pd.options.display.max_rows = None

In [295]:
# Frequency of each raw `total_sqft` entry; the column is still object dtype at this point.
df['total_sqft'].value_counts()

1200                 843
1100                 221
1500                 204
2400                 195
600                  180
1000                 172
1350                 132
1050                 123
1300                 117
1250                 114
900                  112
1400                 108
1800                 104
1150                 101
1600                 100
1140                  91
2000                  82
1450                  70
1650                  69
800                   67
3000                  66
1075                  66
1020                  63
2500                  62
1550                  60
1160                  60
1125                  60
950                   59
1180                  58
1700                  58
1260                  57
1255                  56
1220                  55
1080                  55
1070                  53
700                   52
750                   52
1225                  48
4000                  48
1175                  48


**Handling badly formatted values: `total_sqft` contains unit suffixes such as 'Acres', 'Guntha' and 'Cents', and `size` contains text suffixes such as 'BHK' and 'Bedroom'**

The suffix text is removed and only the numeric part is kept (the numbers are not converted to square feet).

In [296]:
# Strip the unit suffixes from `total_sqft`, keeping the numeric text.
# Each pattern matches the suffix as literal text. Writing a suffix inside
# square brackets (e.g. '[Sq. Meter]') makes it a regex character class that
# removes every one of those characters individually -- including '.', which
# would turn '34.46Sq. Meter' into '3446'.
unit_patterns = [
    r'Acres',
    r'Guntha',
    r'Sq\. Meter',
    r'Sq\. Yards',
    r'Cents',
    r'Perch',
    r'Grounds',
]
df['total_sqft'] = df['total_sqft'].replace(unit_patterns, '', regex=True)
# NOTE(review): the numbers are left in their original units (an entry of
# '2Acres' becomes 2, not its square-foot equivalent) -- confirm whether a
# unit conversion is wanted before modelling.

# Reduce `size` to its leading number by removing the ' BHK' / 'Bedroom' text.
df['size'] = df['size'].replace([' BHK', 'Bedroom'], '', regex=True)

In [297]:
# Spot-check the first rows after the suffixes were removed from `size` and `total_sqft`.
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3,GrrvaGr,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2,GrrvaGr,1200,2.0,1.0,51.0


**Replacing the date values in the availability column**

Every value other than 'Ready To Move' (i.e. the dates) is replaced with 'Not Ready To Move', so the column ends up with only two values: 'Ready To Move' and 'Not Ready To Move'.

In [298]:
# Collapse `availability` into a two-valued flag: rows that are exactly
# 'Ready To Move' keep that label, everything else becomes 'Not Ready To Move'.
# `np.where` is used because the choice is binary; `np.select` without an
# explicit `default` mixes its integer default (0) with the string choices,
# which newer NumPy versions reject with a dtype-promotion error.
is_ready = df['availability'] == 'Ready To Move'
df['final_avail'] = np.where(is_ready, 'Ready To Move', 'Not Ready To Move')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,final_avail
0,Super built-up Area,19-Dec,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07,Not Ready To Move
1,Plot Area,Ready To Move,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0,Ready To Move
2,Built-up Area,Ready To Move,Uttarahalli,3,GrrvaGr,1440,2.0,3.0,62.0,Ready To Move
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0,Ready To Move
4,Super built-up Area,Ready To Move,Kothanur,2,GrrvaGr,1200,2.0,1.0,51.0,Ready To Move


In [299]:
# `final_avail` now carries this information, so the original column is removed.
df = df.drop(columns=['availability'])
df.head()

Unnamed: 0,area_type,location,size,society,total_sqft,bath,balcony,price,final_avail
0,Super built-up Area,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07,Not Ready To Move
1,Plot Area,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0,Ready To Move
2,Built-up Area,Uttarahalli,3,GrrvaGr,1440,2.0,3.0,62.0,Ready To Move
3,Super built-up Area,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0,Ready To Move
4,Super built-up Area,Kothanur,2,GrrvaGr,1200,2.0,1.0,51.0,Ready To Move


In [300]:
# Column count is unchanged: `final_avail` was added and `availability` was dropped.
df.shape

(13303, 9)

**Dropped the rows whose total_sqft is given as a range (X - Y)**

In [301]:
# Locate the rows whose `total_sqft` text contains a hyphen (a range rather
# than a single number) and remove them.
range_rows = df.index[df['total_sqft'].str.contains('[-]', regex=True)]
df = df.drop(index=range_rows)
df.shape

(13110, 9)

In [None]:
# Re-inspect the remaining `total_sqft` entries now that the range rows are gone.
df['total_sqft'].value_counts()

In [303]:
# Preview the frame before the dtype conversions.
df.head()

Unnamed: 0,area_type,location,size,society,total_sqft,bath,balcony,price,final_avail
0,Super built-up Area,Electronic City Phase II,2,Coomee,1056,2.0,1.0,39.07,Not Ready To Move
1,Plot Area,Chikka Tirupathi,4,Theanmp,2600,5.0,3.0,120.0,Ready To Move
2,Built-up Area,Uttarahalli,3,GrrvaGr,1440,2.0,3.0,62.0,Ready To Move
3,Super built-up Area,Lingadheeranahalli,3,Soiewre,1521,3.0,1.0,95.0,Ready To Move
4,Super built-up Area,Kothanur,2,GrrvaGr,1200,2.0,1.0,51.0,Ready To Move


**Converted the object columns `total_sqft` and `size` to float and integer**

In [304]:
# Cast the cleaned `total_sqft` strings to floating-point numbers.
df = df.astype({'total_sqft': float})

In [None]:
# Any `size` entry that still contains an 'R' or a 'K' is not a plain number
# and could not be cast to int, so those rows are removed.
rk_rows = df.index[df['size'].str.contains('[RK]', regex=True)]
df = df.drop(index=rk_rows)
df['size'].value_counts()

In [306]:
# Row count after removing the `size` entries matched by the `[RK]` filter.
df.shape

(13098, 9)

In [307]:
# Cast the remaining `size` strings to integers.
df = df.astype({'size': int})

In [308]:
# Check the dtypes and non-null counts after the `size` and `total_sqft` conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13098 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   area_type    13098 non-null  object 
 1   location     13098 non-null  object 
 2   size         13098 non-null  int64  
 3   society      13098 non-null  object 
 4   total_sqft   13098 non-null  float64
 5   bath         13098 non-null  float64
 6   balcony      13098 non-null  float64
 7   price        13098 non-null  float64
 8   final_avail  13098 non-null  object 
dtypes: float64(4), int64(1), object(4)
memory usage: 1023.3+ KB


**Installing and using category encoders to convert the categorical columns into numeric values**

In [309]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [310]:
import category_encoders as ce 

In [311]:
# Ordinal-encode the four categorical columns: each distinct string is mapped
# to an integer code.
encode = ce.OrdinalEncoder(cols=['location', 'area_type', 'final_avail', 'society'])
df = encode.fit_transform(df)
# `df_transformed` is kept for any later cell that uses the name. It is a copy
# of the encoded frame instead of `encode.transform(df)`: the encoder was
# fitted on the original strings, so running it again on the already-encoded
# integers would treat them all as unseen categories.
df_transformed = df.copy()
df.head()

Unnamed: 0,area_type,location,size,society,total_sqft,bath,balcony,price,final_avail
0,1,1,2,1,1056.0,2.0,1.0,39.07,1
1,2,2,4,2,2600.0,5.0,3.0,120.0,2
2,3,3,3,3,1440.0,2.0,3.0,62.0,2
3,1,4,3,4,1521.0,3.0,1.0,95.0,2
4,1,5,2,3,1200.0,2.0,1.0,51.0,2


**Splitting the data into training and testing sets**

In [312]:
from sklearn.model_selection import train_test_split

# `price` is the prediction target; every other column is a feature.
y = df['price']
X = df.drop(columns=['price'])

In [313]:
# Hold out 30% of the rows for testing. A fixed `random_state` makes the
# shuffle, and therefore every downstream metric, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)