In [1]:
# Loading the dataset
import numpy as np
import pandas as pd
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [2]:
# Shape of the dataset
df.shape

(13320, 9)

In [3]:
# Finding duplicates values
df.duplicated().sum()

529

In [4]:
# Droppping Duplicates value from the dataset
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [5]:
# Finding missing values
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5328
total_sqft         0
bath              73
balcony          605
price              0
dtype: int64

In [6]:
# Dropping null value of "size" column
df["size"] = df["size"].dropna()
df["size"].isnull().sum()

16

# Handling Missing  values  

In [7]:
df["area_type"].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [8]:
df["society"].unique()

array(['Coomee ', 'Theanmp', nan, ..., 'SJovest', 'ThhtsV ', 'RSntsAp'],
      dtype=object)

In [9]:
# handling Outlet_Size column's missing values with Outlet_Type column's mode values

mode_of_outlet_type = df.pivot_table(values="society",columns="area_type",aggfunc=(lambda x:x.mode()[0]))

missing_values = df["society"].isnull()

df.loc[missing_values,"society"] = df.loc[missing_values,"area_type"].apply(lambda x:mode_of_outlet_type[x])

In [10]:
df.isnull().sum()

area_type         0
availability      0
location          1
size             16
society           0
total_sqft        0
bath             73
balcony         605
price             0
dtype: int64

In [11]:
# Handling "bancony" column missing value with mode value
df["balcony"].mode()[0]

2.0

In [12]:
df["balcony"].fillna(df["balcony"].mode()[0], inplace=True)
df["balcony"].isnull().sum()

0

In [13]:
# Handling "bath" column missing value with mode value
df["bath"].mode()[0]

2.0

In [14]:
df["bath"].fillna(df["bath"].mode()[0], inplace=True)
df["bath"].isnull().sum()

0

In [15]:
# Dropping other column's missing value
df.dropna(inplace=True)

In [16]:
# Now finding missing value
df.isnull().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

# Correct the column data format


In [17]:
df["total_sqft"].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [18]:
# Creating a function that will seeing which values of 'total_sqrt' columns will not be converted to float.

def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

df[~df['total_sqft'].apply(is_float)]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
30,Super built-up Area,19-Dec,Yelahanka,4 BHK,LedorSa,2100 - 2850,4.0,0.0,186.000
56,Built-up Area,20-Feb,Devanahalli,4 Bedroom,BrereAt,3010 - 3410,2.0,2.0,192.000
81,Built-up Area,18-Oct,Hennur Road,4 Bedroom,Gollela,2957 - 3450,2.0,2.0,224.500
122,Super built-up Area,18-Mar,Hebbal,4 BHK,SNontle,3067 - 8156,4.0,0.0,477.000
137,Super built-up Area,19-Mar,8th Phase JP Nagar,2 BHK,Vaarech,1042 - 1105,2.0,0.0,54.005
...,...,...,...,...,...,...,...,...,...
12975,Super built-up Area,20-Aug,Whitefield,2 BHK,Bhath N,850 - 1060,2.0,0.0,38.190
12990,Super built-up Area,18-May,Talaghattapura,3 BHK,Sodgere,1804 - 2273,3.0,0.0,122.000
13059,Super built-up Area,Ready To Move,Harlur,2 BHK,Shodsir,1200 - 1470,2.0,0.0,72.760
13240,Super built-up Area,Ready To Move,Devanahalli,1 BHK,Pardsri,1020 - 1130,2.0,2.0,52.570


In [19]:
# converting objects('1056','2100 - 2850') to float and float avarage,but this type of objects('1500Sq. Meter') will not be considered.

def convert_objects_to_num(x):
    token = x.split("-")
    if len(token)==2:
        return (float(token[0]) + float(token[1]))/2
    try:
        return float(x)
    except:
        return None
    
df["total_sqft"] = df["total_sqft"].apply(convert_objects_to_num)
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600.0,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,Sryalan,1440.0,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,GrrvaGr,1200.0,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13314,Super built-up Area,Ready To Move,Green Glen Layout,3 BHK,SoosePr,1715.0,3.0,3.0,112.00
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453.0,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,GrrvaGr,3600.0,5.0,2.0,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141.0,2.0,1.0,60.00


In [20]:
df["size"].dtypes

dtype('O')

In [21]:
df["size"].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [22]:
def bhk(x):
    token = x.split(" ")
    if token[1] == "BHK":
        return str(token[0]) + " " + "BHK"
    else:
        return str(token[0]) + " " + "BHK"

In [23]:
df["size"] = df["size"].apply(bhk)
df["size"].unique()

array(['2 BHK', '4 BHK', '3 BHK', '6 BHK', '1 BHK', '8 BHK', '7 BHK',
       '5 BHK', '11 BHK', '9 BHK', '27 BHK', '10 BHK', '19 BHK', '16 BHK',
       '43 BHK', '14 BHK', '12 BHK', '13 BHK', '18 BHK'], dtype=object)

In [24]:
# Rename the target column
df.rename(columns={"price":"price_in_lakh"}, inplace=True)
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price_in_lakh
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 BHK,Theanmp,2600.0,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,Sryalan,1440.0,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,GrrvaGr,1200.0,2.0,1.0,51.0


In [25]:
# Now finding missing value
df.isnull().sum()

area_type         0
availability      0
location          0
size              0
society           0
total_sqft       46
bath              0
balcony           0
price_in_lakh     0
dtype: int64

In [26]:
# Deleting missing values
df.dropna(inplace=True)
df.isnull().sum()

area_type        0
availability     0
location         0
size             0
society          0
total_sqft       0
bath             0
balcony          0
price_in_lakh    0
dtype: int64

In [27]:
# Extracting Dependent and Independent variables
x = df.drop(["price_in_lakh", "availability"], axis=1)
y = df["price_in_lakh"]

In [28]:
x

Unnamed: 0,area_type,location,size,society,total_sqft,bath,balcony
0,Super built-up Area,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0
1,Plot Area,Chikka Tirupathi,4 BHK,Theanmp,2600.0,5.0,3.0
2,Built-up Area,Uttarahalli,3 BHK,Sryalan,1440.0,2.0,3.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0
4,Super built-up Area,Kothanur,2 BHK,GrrvaGr,1200.0,2.0,1.0
...,...,...,...,...,...,...,...
13314,Super built-up Area,Green Glen Layout,3 BHK,SoosePr,1715.0,3.0,3.0
13315,Built-up Area,Whitefield,5 BHK,ArsiaEx,3453.0,4.0,0.0
13316,Super built-up Area,Richards Town,4 BHK,GrrvaGr,3600.0,5.0,2.0
13317,Built-up Area,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141.0,2.0,1.0


In [29]:
y

0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
13314    112.00
13315    231.00
13316    400.00
13317     60.00
13318    488.00
Name: price_in_lakh, Length: 12728, dtype: float64

In [30]:
# splitting the datasets into training and test sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=2023)

In [31]:
print(x.shape, x_train.shape, x_test.shape)

(12728, 7) (10182, 7) (2546, 7)


# Scikit-Learn Column Transformers and  Pipeline

In [32]:
# Importing libraries
from sklearn.compose import ColumnTransformer
# OneHotEncoder for categorical columns
from sklearn.preprocessing import OneHotEncoder
# StandardScaler for numerical columns
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [33]:
# Numeric df
numeric_df = x.select_dtypes(exclude="object")
numeric_df

Unnamed: 0,total_sqft,bath,balcony
0,1056.0,2.0,1.0
1,2600.0,5.0,3.0
2,1440.0,2.0,3.0
3,1521.0,3.0,1.0
4,1200.0,2.0,1.0
...,...,...,...
13314,1715.0,3.0,3.0
13315,3453.0,4.0,0.0
13316,3600.0,5.0,2.0
13317,1141.0,2.0,1.0


In [34]:
# Numeric features
numeric_features = x.select_dtypes(exclude="object").columns
numeric_features

Index(['total_sqft', 'bath', 'balcony'], dtype='object')

In [35]:
# Standard Scaler for numeric features through Pipeline
numeric_pipeline = Pipeline([("standardscaler", StandardScaler())])
numeric_pipeline

Pipeline(steps=[('standardscaler', StandardScaler())])

In [36]:
# Categorical df
categorical_df = x.select_dtypes("object")
categorical_df

Unnamed: 0,area_type,location,size,society
0,Super built-up Area,Electronic City Phase II,2 BHK,Coomee
1,Plot Area,Chikka Tirupathi,4 BHK,Theanmp
2,Built-up Area,Uttarahalli,3 BHK,Sryalan
3,Super built-up Area,Lingadheeranahalli,3 BHK,Soiewre
4,Super built-up Area,Kothanur,2 BHK,GrrvaGr
...,...,...,...,...
13314,Super built-up Area,Green Glen Layout,3 BHK,SoosePr
13315,Built-up Area,Whitefield,5 BHK,ArsiaEx
13316,Super built-up Area,Richards Town,4 BHK,GrrvaGr
13317,Built-up Area,Raja Rajeshwari Nagar,2 BHK,Mahla T


In [37]:
# Categorical features
categorical_features = x.select_dtypes("object").columns
categorical_features

Index(['area_type', 'location', 'size', 'society'], dtype='object')

In [38]:
# One Hot Encoding for numeric features through Pipeline
categorical_pipeline = Pipeline([("onehotencoder", OneHotEncoder(handle_unknown='ignore'))])
categorical_pipeline

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

In [39]:
# Column Transformers for numeric_pipeline and categorical_pipeline both pipelines
transformer = ColumnTransformer([("numeric_preprocessing", numeric_pipeline, numeric_features),
                                ("categorical_preprocessing", categorical_pipeline, categorical_features)])
transformer

ColumnTransformer(transformers=[('numeric_preprocessing',
                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 Index(['total_sqft', 'bath', 'balcony'], dtype='object')),
                                ('categorical_preprocessing',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['area_type', 'location', 'size', 'society'], dtype='object'))])

In [40]:
# Creating Scikit-Learn Pipeline
sk_pipeline = Pipeline([("all_column_preprocessing", transformer),
                       ("randomforestregressor", RandomForestRegressor())])
sk_pipeline

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  Index(['total_sqft', 'bath', 'balcony'], dtype='object')),
                                                 ('categorical_preprocessing',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['area_type', 'location', 'size', 'society'], dtype='object'))])),
                ('randomforestregressor', RandomForestRegressor())])

In [41]:
# Fittng the dataset on Scikit-Leran Pipeline
sk_pipeline.fit(x, y)

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  Index(['total_sqft', 'bath', 'balcony'], dtype='object')),
                                                 ('categorical_preprocessing',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['area_type', 'location', 'size', 'society'], dtype='object'))])),
                ('randomforestregressor', RandomForestRegressor())])

In [42]:
# Accuracy of the dataset
sk_pipeline.score(x, y)

0.9406023506863594

In [43]:
# prediction for dataset
sk_pipeline.predict(x)

array([ 39.81      , 123.95      ,  62.7577    , ..., 382.63791667,
        57.68543333, 447.715     ])

In [44]:
import pandas as pd

# Create a DataFrame with the same column names and structure as during training
input_data = pd.DataFrame([["Super built-up Area", "Electronic City Phase II", "2 BHK", "Coomee", 1056.0, 2.0, 1.0]],
                          columns=['area_type', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony'])

# Use the pipeline to make predictions
predictions = sk_pipeline.predict(input_data)

# Print the predictions
print(predictions)


[40.21064333]


# Save and Load the model through pickle

In [47]:
# save the model
import pickle as pk
with open("columntransformerpipelinemodel.pkl", "wb") as f:
    pk.dump(sk_pipeline, f)

In [48]:
# load the model
with open("columntransformerpipelinemodel.pkl", "rb") as f:
    pipeline_model = pk.load(f)

In [49]:
# Prediction for new custom data
import pandas as pd

# Create a DataFrame with the same column names and structure as during training
input_data = pd.DataFrame([["Super built-up Area", "Padmanabhanagar", "4 BHK", "SollyCl", 4689, 4, 1]],
                          columns=['area_type', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony'])

# Use the pipeline to make predictions
predictions = pipeline_model.predict(input_data)

# Print the predictions
print(predictions)

[463.475]
