In [247]:
import pandas as pd
import numpy as np

In [248]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [249]:
# Load the raw Bengaluru housing CSV and preview the first rows.
# NOTE(review): the path is absolute ("/content/..." looks like a Colab upload
# location), so this cell only runs where the file exists at that path.
data= pd.read_csv("/content/Bengaluru_House_Data.csv")
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [250]:
data.shape

(13320, 9)

In [251]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [252]:
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [253]:
# Print the frequency table of every column, one after another.
for column_name in data.columns:
    print(data[column_name].value_counts())

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547


In [254]:
# Discard the columns that are not carried forward as model features.
data = data.drop(columns=["area_type", "availability", "balcony", "society"])

In [255]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [256]:
# Fill the single missing location with "Whitefield", the most frequent
# location in the value counts printed above.
data["location"]=data["location"].fillna("Whitefield")

In [257]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [258]:
# Fill the missing size labels with "2 BHK", the most common label in the data.
data["size"] = data["size"].fillna("2 BHK")

In [259]:
# Impute missing bathroom counts with the column median.
data["bath"] = data["bath"].fillna(data["bath"].median())

In [260]:
# Keep only the leading number of each size label ("2 BHK" -> 2, "4 Bedroom" -> 4).
data["size"] = data["size"].str.split().str[0].astype(int)

In [261]:
# NOTE(review): data.info() above reports no missing total_sqft values, so this
# fill looks like a no-op on this dataset. The filler is also a float while the
# other entries are still strings at this point — confirm the cell is needed.
data["total_sqft"] = data["total_sqft"].fillna(2.0)

In [262]:
def convert_range(x):
  """Convert one raw ``total_sqft`` entry to a float.

  Accepted inputs:
    * a plain number as text, e.g. ``"1056"`` -> ``1056.0``
    * a range written as ``"low - high"``, e.g. ``"1133 - 1384"`` -> the
      midpoint ``1258.5``
    * an already-numeric value, which is returned as a float

  Anything that cannot be interpreted (e.g. ``"34.46Sq. Meter"``, ``None``)
  yields ``None`` so the caller can treat it as missing.
  """
  # Non-string entries (floats left by a fillna, None, ...) have no .split();
  # convert them directly instead of crashing.
  if not isinstance(x, str):
    try:
      return float(x)
    except (TypeError, ValueError):
      return None

  bounds = x.split("-")
  if len(bounds) == 2:
    try:
      return (float(bounds[0]) + float(bounds[1])) / 2
    except ValueError:
      # Not a numeric range (e.g. a leading minus sign); fall through and
      # try to parse the whole entry as a single number.
      pass

  try:
    return float(x)
  except ValueError:
    return None

In [263]:
# Parse every total_sqft entry to a float; entries convert_range cannot parse
# end up as missing values (see the non-null count in the next data.info()).
data["total_sqft"] = data["total_sqft"].apply(convert_range)

In [264]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056.0,2.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,120.0
2,Uttarahalli,3,1440.0,2.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,95.0
4,Kothanur,2,1200.0,2.0,51.0


In [265]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  int64  
 2   total_sqft  13274 non-null  float64
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 520.4+ KB


In [266]:
# Derive a per-square-foot price; it is used below for outlier filtering.
# presumably `price` is expressed in lakhs (hence the 100000 factor) — TODO confirm.
data["price_per_sqft"] = data["price"]*100000 /data["total_sqft"]

In [267]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,price_per_sqft
0,Electronic City Phase II,2,1056.0,2.0,39.07,3699.810606
1,Chikka Tirupathi,4,2600.0,5.0,120.0,4615.384615
2,Uttarahalli,3,1440.0,2.0,62.0,4305.555556
3,Lingadheeranahalli,3,1521.0,3.0,95.0,6245.890861
4,Kothanur,2,1200.0,2.0,51.0,4250.0


In [268]:
# Reduce each location to its first whitespace-separated token, then count the
# rows per token.
# NOTE(review): this merges distinct areas that share a first word (for
# example every "1st ..." layout becomes "1st") — confirm this is intended
# rather than a plain strip().
data["location"] = data["location"].apply(lambda x: x.split()[0])
location_count = data["location"].value_counts()

In [269]:
location_count

Whitefield       543
Sarjapur         489
Electronic       440
Kanakpura        273
Yelahanka        254
                ... 
Housing            1
1Channasandra      1
Vijayabank         1
Saptagiri          1
Abshot             1
Name: location, Length: 1048, dtype: int64

In [270]:
# Locations represented by fewer than 10 rows.
location_count_less10 = location_count.loc[location_count.lt(10)]
location_count_less10

Gollahalli       9
Bagalur          9
T                9
Chennammana      9
Peenya           9
                ..
Housing          1
1Channasandra    1
Vijayabank       1
Saptagiri        1
Abshot           1
Name: location, Length: 809, dtype: int64

In [271]:
# Collapse every rare location (a label in location_count_less10's index) into "other".
data["location"] = data["location"].mask(
    data["location"].isin(location_count_less10.index), "other"
)

In [272]:
data["location"].value_counts()

other              2243
Whitefield          543
Sarjapur            489
Electronic          440
Kanakpura           273
                   ... 
Sadashiva            10
Naganathapura        10
Dodsworth            10
Nagappa              10
Nagadevanahalli      10
Name: location, Length: 240, dtype: int64

In [273]:
data.describe()

Unnamed: 0,size,total_sqft,bath,price,price_per_sqft
count,13320.0,13274.0,13320.0,13320.0,13274.0
mean,2.802778,1559.626694,2.688814,112.565627,7907.501
std,1.294496,1238.405258,1.338754,148.971674,106429.6
min,1.0,1.0,1.0,8.0,267.8298
25%,2.0,1100.0,2.0,50.0,4266.865
50%,3.0,1276.0,2.0,72.0,5434.306
75%,3.0,1680.0,3.0,120.0,7311.746
max,43.0,52272.0,40.0,3600.0,12000000.0


In [274]:
# Keep only rows offering at least 300 sqft per room. Rows whose total_sqft is
# missing fail the comparison and are therefore dropped as well.
data = data.loc[data["total_sqft"].div(data["size"]).ge(300)]
data.describe()

Unnamed: 0,size,total_sqft,bath,price,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,2.650838,1594.564544,2.559537,111.382401,6303.979357
std,0.976678,1261.271296,1.077938,152.077329,4162.237981
min,1.0,300.0,1.0,8.44,267.829813
25%,2.0,1116.0,2.0,49.0,4210.526316
50%,3.0,1300.0,2.0,70.0,5294.117647
75%,3.0,1700.0,3.0,115.0,6916.666667
max,16.0,52272.0,16.0,3600.0,176470.588235


In [275]:
data.shape

(12530, 6)

In [276]:
def remove_outliers_sqft(df):
  """Drop per-location price_per_sqft outliers.

  For every ``location`` group, only rows whose ``price_per_sqft`` lies in the
  half-open interval ``(mean - std, mean + std]`` of that group are kept.
  A group with a single row has ``std == 0`` and is removed entirely because
  the lower bound is strict.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns ``location`` and ``price_per_sqft``.

  Returns
  -------
  pandas.DataFrame
      The surviving rows of all groups stacked in group order with a fresh
      0..n-1 index. If no row survives, an empty frame with the same columns
      as ``df`` is returned.
  """
  kept_groups = []
  for _, subdf in df.groupby("location"):
    pps = subdf.price_per_sqft
    center = np.mean(pps)
    spread = np.std(pps)  # numpy default ddof=0, i.e. population std

    within_band = (pps > (center - spread)) & (pps <= (center + spread))
    survivors = subdf[within_band]
    if not survivors.empty:
      kept_groups.append(survivors)

  if not kept_groups:
    # Nothing to concatenate; keep the schema so downstream cells still work.
    return df.iloc[0:0].reset_index(drop=True)

  # Concatenate once instead of growing a frame inside the loop.
  return pd.concat(kept_groups, ignore_index=True)
# Apply the per-location outlier filter and summarize what remains.
data = remove_outliers_sqft(data)
data.describe()

Unnamed: 0,size,total_sqft,bath,price,price_per_sqft
count,10341.0,10341.0,10341.0,10341.0,10341.0
mean,2.5777,1515.00507,2.475486,91.864638,5662.927625
std,0.88894,903.793919,0.975177,90.127575,2278.616792
min,1.0,300.0,1.0,10.0,729.860414
25%,2.0,1110.0,2.0,49.0,4245.283019
50%,2.0,1290.0,2.0,67.0,5179.282869
75%,3.0,1650.0,3.0,100.0,6436.314363
max,13.0,30400.0,13.0,2200.0,25641.025641


In [277]:
# The helper column was only needed for outlier filtering; remove it again.
data = data.drop(columns="price_per_sqft")

In [278]:
# Persist the cleaned table next to the notebook.
# NOTE(review): with the default arguments the row index is written as an
# extra unnamed first column; pass index=False if consumers should not see it.
data.to_csv("Clean_data.csv")

In [279]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,1st,4,2850.0,4.0,428.0
1,1st,3,1630.0,3.0,194.0
2,1st,4,2825.0,4.0,250.0
3,1st,2,1415.0,2.0,110.0
4,1st,3,1875.0,3.0,167.0


In [280]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10341 entries, 0 to 10340
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    10341 non-null  object 
 1   size        10341 non-null  int64  
 2   total_sqft  10341 non-null  float64
 3   bath        10341 non-null  float64
 4   price       10341 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 404.1+ KB


In [281]:
# Separate the price target from the feature columns.
y = data["price"]
x = data.drop(columns="price")

In [282]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size =0.2,random_state =0)

In [283]:
# One-hot encode `location`; every other column is passed through unchanged.
# handle_unknown="ignore" encodes a location that was not present during fit
# as an all-zero row instead of raising, so the fitted pipeline can score the
# hold-out split and new data that contain unseen locations.
# NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2; switch the
# keyword when the environment is upgraded.
col_trans = ColumnTransformer(transformers=[
    ("encoder", OneHotEncoder(sparse = False, handle_unknown = "ignore"), ["location"])
], remainder = "passthrough")

In [284]:
scalar = StandardScaler()

In [285]:
lr = LinearRegression()


In [286]:
# Chain encoding, scaling and the linear model into a single estimator.
pipe = make_pipeline(col_trans,scalar,lr)

In [287]:
pipe.fit(x_train,y_train)




In [288]:
y_pred_lr =pipe.predict(x_test)

In [289]:
r2_score(y_test,y_pred_lr)

0.7759369615933177

In [290]:
lasso = Lasso()

In [291]:
# Rebind `pipe` to a Lasso pipeline; it is built from the same col_trans and
# scalar objects as the previous pipeline, which is no longer referenced.
pipe = make_pipeline(col_trans,scalar, lasso)

In [292]:
pipe.fit(x_train,y_train)



In [293]:
y_pred_lasso =pipe.predict(x_test)

In [294]:
r2_score(y_test,y_pred_lasso)

0.7624438425391786

In [295]:
ridge = Ridge()

In [296]:
# Rebind `pipe` to a Ridge pipeline; it is built from the same col_trans and
# scalar objects as the previous pipeline, which is no longer referenced.
pipe = make_pipeline(col_trans,scalar, ridge)

In [297]:
pipe.fit(x_train,y_train)



In [298]:
y_pred_ridge=pipe.predict(x_test)

In [299]:
r2_score(y_test,y_pred_ridge)

0.7762349972533845

In [300]:
# Small random forest. random_state pins the bootstrap/feature sampling so the
# reported score and the pickled model are reproducible across runs.
rc = RandomForestRegressor(n_estimators=10, random_state=0)

In [301]:
# Rebind `pipe` to the random-forest pipeline; it is built from the same
# col_trans and scalar objects as the previous pipelines. This is the pipeline
# that is still bound to `pipe` when the model is pickled at the end.
pipe = make_pipeline(col_trans,scalar, rc)

In [302]:
pipe.fit(x_train,y_train)



In [303]:
y_pred_rc =pipe.predict(x_test)

In [304]:
r2_score(y_test,y_pred_rc)

0.8009204256718283

In [305]:
# from xgboost import XGBRegressor
# xg = XGBRegressor()

In [306]:
# pipe = make_pipeline(col_trans,scalar, xg)

In [307]:
# pipe.fit(x_train,y_train)

In [308]:
# y_pred_xg =pipe.predict(x_test)

In [309]:
# r2_score(y_test,y_pred_xg)

In [310]:
# Side-by-side R² on the held-out split for every fitted model.
print("LinearRegression : ", r2_score(y_test,y_pred_lr))
print("Lasso : ", r2_score(y_test,y_pred_lasso))
print("Ridge : ", r2_score(y_test,y_pred_ridge))
print("RandomForest : ", r2_score(y_test,y_pred_rc))
# print("XGboost : ", r2_score(y_test,y_pred_xg))

LinerRegression :  0.7759369615933177
Lasso :  0.7624438425391786
Ridge :  0.7762349972533845
RandomForest :  0.8009204256718283


In [311]:
import pickle

In [312]:
# Serialize the most recently fitted pipeline (the random-forest one at this
# point). The context manager guarantees the file is flushed and closed.
# NOTE(review): the file is named "XG.pkl" although the XGBoost cells are
# commented out — the name is kept so existing consumers still find the file.
with open("XG.pkl", "wb") as model_file:
    pickle.dump(pipe, model_file)