<a href="https://colab.research.google.com/github/NotRay-67/MachineLearning/blob/main/Real_Estate_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports and notebook-wide plotting configuration.
import pandas as pd
import numpy as np
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib
# Default size (width, height) applied to every figure created in this notebook.
matplotlib.rcParams["figure.figsize"] = (20,10)

# Data Loading

In [None]:
# Load the raw housing data. The file carries an .xls extension but is parsed
# as CSV here. NOTE(review): the path looks like a mounted Google Drive folder
# in Colab - confirm Drive is mounted before this cell runs.
DATA_PATH = "/content/drive/MyDrive/my files/Bengaluru_House_Data.xls"
df1 = pd.read_csv(DATA_PATH)

# Only the last expression of a cell is rendered automatically, so the preview
# is displayed explicitly; a bare `df1.head()` here would be computed and
# thrown away. `display` is provided by the IPython/Jupyter kernel.
display(df1.head())
df1.shape

(13320, 9)

In [None]:
# Remove the columns that the rest of the notebook does not use.
df2 = df1.drop(columns=['area_type', 'availability', 'society'])
df2.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


# Data Cleaning

In [None]:
# Handling null values.
# Show the per-column null counts BEFORE cleaning. A bare expression in the
# middle of a cell is never rendered, so it is displayed explicitly
# (`display` is provided by the IPython/Jupyter kernel).
display(df2.isnull().sum())

# Drop every row that has a null in any column, then confirm none remain.
df2 = df2.dropna()
df2.isnull().sum()


location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [None]:
# Number of distinct values per column. Displayed explicitly because only the
# last expression of a cell is rendered on its own (`display` is provided by
# the IPython/Jupyter kernel).
display(df2.nunique())

# Distinct raw labels found in the `size` column.
df2['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [None]:
def bhk(x):
    """Return the leading integer of a size label such as '2 BHK' or '4 Bedroom'.

    The text before the first space is converted with int(); a label whose
    first token is not an integer raises ValueError.
    """
    leading_token, _, _ = x.partition(' ')
    return int(leading_token)
# Derive the integer `bhk` column from the `size` labels, then drop `size`.
df3 = (
    df2
    .assign(bhk=df2['size'].apply(bhk))
    .drop(columns=['size'])
)
print(df3['bhk'].unique())
df3

[ 2  4  3  1  6  8  7  5 11  9 27 43 14 12 10 13]


Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600,5.0,3.0,120.00,4
2,Uttarahalli,1440,2.0,3.0,62.00,3
3,Lingadheeranahalli,1521,3.0,1.0,95.00,3
4,Kothanur,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...
13314,Green Glen Layout,1715,3.0,3.0,112.00,3
13315,Whitefield,3453,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,1141,2.0,1.0,60.00,2
13318,Padmanabhanagar,4689,4.0,1.0,488.00,4


In [None]:
# Count the distinct raw `total_sqft` entries.
df3['total_sqft'].nunique()

1976

In [None]:
def isfloat(x):
  """Return True if float(x) succeeds, False otherwise.

  Only conversion failures (TypeError, ValueError, OverflowError) count as
  "not a float". Anything else, such as KeyboardInterrupt, propagates to the
  caller instead of being silently swallowed by a bare except.
  """
  try:
    float(x)
  except (TypeError, ValueError, OverflowError):
    return False
  return True

In [None]:
# Per-column distinct counts for the rows whose `total_sqft` cannot be
# converted to float directly.
(
    df3
    .loc[~df3['total_sqft'].apply(isfloat)]
    .nunique()
)

location       88
total_sqft    169
bath            8
balcony         4
price         175
bhk             8
dtype: int64

In [None]:
# Rows whose `total_sqft` is not directly convertible by float().
df3.loc[~df3['total_sqft'].apply(isfloat), :]

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
30,Yelahanka,2100 - 2850,4.0,0.0,186.000,4
122,Hebbal,3067 - 8156,4.0,0.0,477.000,4
137,8th Phase JP Nagar,1042 - 1105,2.0,0.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,0.0,43.490,2
188,KR Puram,1015 - 1540,2.0,0.0,56.800,2
...,...,...,...,...,...,...
12975,Whitefield,850 - 1060,2.0,0.0,38.190,2
12990,Talaghattapura,1804 - 2273,3.0,0.0,122.000,3
13059,Harlur,1200 - 1470,2.0,0.0,72.760,2
13265,Hoodi,1133 - 1384,2.0,0.0,59.135,2


In [None]:
def convert_sqft_to_float(x):
  """Convert a raw total_sqft entry to a float.

  * A plain number ("1056", 1056, 1056.0) is returned as a float.
  * A range written as "low - high" is returned as the midpoint of its bounds.
  * Anything else (for example "34.66 sqft" or "abc - def") yields None.

  Parameters:
    x: raw cell value; normally a string, but numbers and None are tolerated.

  Returns:
    float, or None when the value cannot be interpreted.
  """
  # Fast path: the value is already a plain number (or numeric string).
  try:
    return float(x)
  except (TypeError, ValueError, OverflowError):
    pass

  # Only strings can carry the "low - high" range notation.
  if not isinstance(x, str):
    return None

  token = x.split(" - ")
  if len(token) == 2:
    try:
      return (float(token[0]) + float(token[1])) / 2
    except ValueError:
      # Two tokens, but at least one bound is not numeric.
      return None
  return None


In [None]:
# Sanity checks for convert_sqft_to_float.
# A "low - high" range becomes the midpoint of its bounds.
print(convert_sqft_to_float("2100 - 2850"))
assert convert_sqft_to_float("2100 - 2850") == 2475.0
# A unit-suffixed value cannot be parsed and yields None. A None result renders
# nothing as a cell's last expression, so the expectation is asserted instead.
assert convert_sqft_to_float("34.66 sqft") is None

2475.0


In [None]:
# Replace the raw `total_sqft` values with their numeric conversion; entries
# that cannot be converted (the " 34.66sqft " type values) become null.
df4 = df3.assign(total_sqft=df3["total_sqft"].apply(convert_sqft_to_float))
df4.isnull().sum()

location       0
total_sqft    42
bath           0
balcony        0
price          0
bhk            0
dtype: int64

In [None]:
# Drop the rows whose `total_sqft` could not be converted. The column is passed
# as a list because a bare string for `subset` is only accepted by newer pandas
# releases, while a list of labels works on all of them.
df4 = df4.dropna(subset=["total_sqft"])
# Guard: the column must be free of nulls from here on.
assert df4["total_sqft"].notna().all()
df4.isnull().sum()

location      0
total_sqft    0
bath          0
balcony       0
price         0
bhk           0
dtype: int64

In [None]:
# Print the dimensions of the cleaned frame, then render the frame itself.
print(df4.shape)
df4

(12668, 6)


Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.00,4
2,Uttarahalli,1440.0,2.0,3.0,62.00,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.00,3
4,Kothanur,1200.0,2.0,1.0,51.00,2
...,...,...,...,...,...,...
13314,Green Glen Layout,1715.0,3.0,3.0,112.00,3
13315,Whitefield,3453.0,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,1141.0,2.0,1.0,60.00,2
13318,Padmanabhanagar,4689.0,4.0,1.0,488.00,4


# Feature Engineering

In [None]:
# Render the cleaned frame again at the start of the feature-engineering section.
df4

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.00,4
2,Uttarahalli,1440.0,2.0,3.0,62.00,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.00,3
4,Kothanur,1200.0,2.0,1.0,51.00,2
...,...,...,...,...,...,...
13314,Green Glen Layout,1715.0,3.0,3.0,112.00,3
13315,Whitefield,3453.0,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,1141.0,2.0,1.0,60.00,2
13318,Padmanabhanagar,4689.0,4.0,1.0,488.00,4
