In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
house_price =  pd.read_csv("Bengaluru_House_Data.csv")
house_price.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
house_price.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


# bath seemed confusing so lets replace the name to bathrooms

In [6]:
house_price.rename(columns = {'bath':'bathroom'},inplace = True)
house_price.head(3)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bathroom,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0


In [7]:
house_price.drop(['availability'],axis = 1,inplace = True)

In [8]:
house_price

Unnamed: 0,area_type,location,size,society,total_sqft,bathroom,balcony,price
0,Super built-up Area,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...
13315,Built-up Area,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [9]:
house_price["location"].describe()

count          13319
unique          1305
top       Whitefield
freq             540
Name: location, dtype: object

In [10]:
house_price["location"].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [11]:
house_price.isnull().sum()

area_type        0
location         1
size            16
society       5502
total_sqft       0
bathroom        73
balcony        609
price            0
dtype: int64

In [12]:
df2 = house_price.drop(["area_type","location","society","balcony"],axis = 'columns')

In [13]:
df2.head(3)

Unnamed: 0,size,total_sqft,bathroom,price
0,2 BHK,1056,2.0,39.07
1,4 Bedroom,2600,5.0,120.0
2,3 BHK,1440,2.0,62.0


In [14]:
df2.isnull().sum()

size          16
total_sqft     0
bathroom      73
price          0
dtype: int64

# As the dataset has 13320 rows and the number of rows containing null values is very less, so it is a good practise to drop them

In [15]:
df2.dropna(axis = 'rows')

Unnamed: 0,size,total_sqft,bathroom,price
0,2 BHK,1056,2.0,39.07
1,4 Bedroom,2600,5.0,120.00
2,3 BHK,1440,2.0,62.00
3,3 BHK,1521,3.0,95.00
4,2 BHK,1200,2.0,51.00
...,...,...,...,...
13315,5 Bedroom,3453,4.0,231.00
13316,4 BHK,3600,5.0,400.00
13317,2 BHK,1141,2.0,60.00
13318,4 BHK,4689,4.0,488.00


In [64]:
#df['DataFrame Column'] = df['DataFrame Column'].astype(float)

In [17]:
df2['total_sqft']=df2['total_sqft'].astype(float)

ValueError: could not convert string to float: '2100 - 2850'

# Feature Engineering

# A new column named Price per sqft is added .

In [18]:
df3 = df2.copy()
df3['Price per sqft'] = df3['price'] * 100000 / df3['total_sqft']

TypeError: unsupported operand type(s) for /: 'float' and 'str'

In [20]:
df3

Unnamed: 0,size,total_sqft,bathroom,price
0,2 BHK,1056,2.0,39.07
1,4 Bedroom,2600,5.0,120.00
2,3 BHK,1440,2.0,62.00
3,3 BHK,1521,3.0,95.00
4,2 BHK,1200,2.0,51.00
...,...,...,...,...
13315,5 Bedroom,3453,4.0,231.00
13316,4 BHK,3600,5.0,400.00
13317,2 BHK,1141,2.0,60.00
13318,4 BHK,4689,4.0,488.00


In [19]:
def bhk(x):
    token = x.split()
    return token[0]

In [3]:
'''pd.apply(func, convert_dtype=True, args=())'''

'pd.apply(func, convert_dtype=True, args=())'

In [None]:
df4['total_sqft'] = df4["total_sqft"].apply(convert_sqft_to_num)

In [4]:
'''df3['bhk'] = df3["size"].apply(bhk)'''

'df3[\'bhk\'] = df3["size"].apply(bhk)'

In [22]:
'''Applying Linear Regression model just for fun'''
X= df3.drop(["price"],axis = 'columns')


In [23]:
X

Unnamed: 0,size,total_sqft,bathroom
0,2 BHK,1056,2.0
1,4 Bedroom,2600,5.0
2,3 BHK,1440,2.0
3,3 BHK,1521,3.0
4,2 BHK,1200,2.0
...,...,...,...
13315,5 Bedroom,3453,4.0
13316,4 BHK,3600,5.0
13317,2 BHK,1141,2.0
13318,4 BHK,4689,4.0


In [24]:
X.shape

(13320, 3)

In [25]:
y=df3.price

In [26]:
y

0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
13315    231.00
13316    400.00
13317     60.00
13318    488.00
13319     17.00
Name: price, Length: 13320, dtype: float64

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)


In [28]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

ValueError: could not convert string to float: '3 BHK'

In [33]:
df3["size"].dtype

dtype('O')

In [None]:
df3[]