In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [50]:
df = pd.read_csv('Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [51]:
df.shape

(545, 13)

In [52]:
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [53]:
df.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

In [54]:
df['parking'].value_counts()

0    299
1    126
2    108
3     12
Name: parking, dtype: int64

### Label Encoding

This method is used to encode categorical (object data types) values to int/float values

In [55]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [56]:
print(df['mainroad'].unique())
print(df['guestroom'].unique())
print(df['basement'].unique())
print(df['hotwaterheating'].unique())
print(df['airconditioning'].unique())
print(df['prefarea'].unique())
print(df['furnishingstatus'].unique())

['yes' 'no']
['no' 'yes']
['no' 'yes']
['no' 'yes']
['yes' 'no']
['yes' 'no']
['furnished' 'semi-furnished' 'unfurnished']


In [57]:
df['mainroad'].value_counts()

yes    468
no      77
Name: mainroad, dtype: int64

In [58]:
df['guestroom'].value_counts()

no     448
yes     97
Name: guestroom, dtype: int64

In [59]:
df['basement'].value_counts()

no     354
yes    191
Name: basement, dtype: int64

In [60]:
df['hotwaterheating'].value_counts()

no     520
yes     25
Name: hotwaterheating, dtype: int64

In [61]:
df['airconditioning'].value_counts()

no     373
yes    172
Name: airconditioning, dtype: int64

In [62]:
df['prefarea'].value_counts()

no     417
yes    128
Name: prefarea, dtype: int64

In [63]:
df['furnishingstatus'].value_counts()

semi-furnished    227
unfurnished       178
furnished         140
Name: furnishingstatus, dtype: int64

In [64]:
from sklearn.preprocessing import LabelEncoder

In [65]:
lb = LabelEncoder()

In [66]:
df['mainroad'] = lb.fit_transform(df['mainroad'])
df['guestroom'] = lb.fit_transform(df['guestroom'])
df['basement'] = lb.fit_transform(df['basement'])
df['hotwaterheating'] = lb.fit_transform(df['hotwaterheating'])
df['airconditioning'] = lb.fit_transform(df['airconditioning'])
df['prefarea'] = lb.fit_transform(df['prefarea'])
df['furnishingstatus'] = lb.fit_transform(df['furnishingstatus'])

In [67]:
df.dtypes

price               int64
area                int64
bedrooms            int64
bathrooms           int64
stories             int64
mainroad            int64
guestroom           int64
basement            int64
hotwaterheating     int64
airconditioning     int64
parking             int64
prefarea            int64
furnishingstatus    int64
dtype: object

In [68]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [69]:
df['mainroad'].value_counts()
# yes - 468
# no - 77

1    468
0     77
Name: mainroad, dtype: int64

In [70]:
df['guestroom'].value_counts()

0    448
1     97
Name: guestroom, dtype: int64

In [71]:
df['basement'].value_counts()

0    354
1    191
Name: basement, dtype: int64

## Model 1

In [72]:
x = df.iloc[:,1:]
y = df.iloc[:,0]
print(x.shape)
print(y.shape)
print(type(x))
print(type(y))

(545, 12)
(545,)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [73]:
x.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,1,0,0,0,1,2,1,0
1,8960,4,4,4,1,0,0,0,1,3,0,0
2,9960,3,2,2,1,0,1,0,0,2,1,1
3,7500,4,2,2,1,0,1,0,1,3,1,0
4,7420,4,1,2,1,1,1,0,1,2,0,0


In [74]:
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [75]:
df.shape

(545, 13)

In [76]:
from sklearn.model_selection import train_test_split

In [77]:
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size = 0.75)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(408, 12)
(137, 12)
(408,)
(137,)


In [78]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [79]:
m1 = LinearRegression()
m1.fit(x_train,y_train)

LinearRegression()

In [80]:
## R2 Score
print('Training Score',m1.score(x_train,y_train))
print('Testing Score',m1.score(x_test,y_test))

Training Score 0.6849650097464943
Testing Score 0.6512308492155645


In [81]:
ypred_m1 = m1.predict(x_test)
print(len(ypred_m1))

137


In [82]:
print('MSE',mean_squared_error(y_test,ypred_m1))
print('RMSE',np.sqrt(mean_squared_error(y_test,ypred_m1)))
print('MAE',mean_absolute_error(y_test,ypred_m1))
print('R2_score',r2_score(y_test,ypred_m1))
print('Testing Score',m1.score(x_test,y_test))

MSE 997095028305.4259
RMSE 998546.4577601915
MAE 733286.0825496258
R2_score 0.6512308492155645
Testing Score 0.6512308492155645


In [83]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [84]:
df.sample(4)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
101,6230000,5500,3,1,3,1,0,0,0,0,1,1,2
245,4550000,5360,3,1,2,1,0,0,0,0,2,1,2
201,4900000,4095,3,1,2,0,1,1,0,1,0,0,1
264,4410000,4900,2,1,2,1,0,1,0,0,0,0,1


In [85]:
m1.predict([[5430,5,3,2,1,0,1,0,1,2,1,2]])

  "X does not have valid feature names, but"


array([8395390.29871338])

### Model 2

Correlation

It varies between -1 and +1.</br>
Value closer to +1 are considered appropriate for the model but not too close to 1.</br> Value closer to -1 should be right away discarded.</br>
It is a parameter of relation and strength between 2 or more variables.</br>
Correlation of x with x is always 1

In [86]:
# Correlation
df.corr()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
price,1.0,0.535997,0.366494,0.517545,0.420712,0.296898,0.255517,0.187057,0.093073,0.452954,0.384394,0.329777,-0.304721
area,0.535997,1.0,0.151858,0.19382,0.083996,0.288874,0.140297,0.047417,-0.009229,0.222393,0.35298,0.234779,-0.171445
bedrooms,0.366494,0.151858,1.0,0.37393,0.408564,-0.012033,0.080549,0.097312,0.046049,0.160603,0.13927,0.079023,-0.123244
bathrooms,0.517545,0.19382,0.37393,1.0,0.326165,0.042398,0.126469,0.102106,0.067159,0.186915,0.177496,0.063472,-0.143559
stories,0.420712,0.083996,0.408564,0.326165,1.0,0.121706,0.043538,-0.172394,0.018847,0.293602,0.045547,0.044425,-0.104672
mainroad,0.296898,0.288874,-0.012033,0.042398,0.121706,1.0,0.092337,0.044002,-0.011781,0.105423,0.204433,0.199876,-0.156726
guestroom,0.255517,0.140297,0.080549,0.126469,0.043538,0.092337,1.0,0.372066,-0.010308,0.138179,0.037466,0.160897,-0.118328
basement,0.187057,0.047417,0.097312,0.102106,-0.172394,0.044002,0.372066,1.0,0.004385,0.047341,0.051497,0.228083,-0.112831
hotwaterheating,0.093073,-0.009229,0.046049,0.067159,0.018847,-0.011781,-0.010308,0.004385,1.0,-0.130023,0.067864,-0.059411,-0.031628
airconditioning,0.452954,0.222393,0.160603,0.186915,0.293602,0.105423,0.138179,0.047341,-0.130023,1.0,0.159173,0.117382,-0.150477


In [87]:
df.corr()['price']

price               1.000000
area                0.535997
bedrooms            0.366494
bathrooms           0.517545
stories             0.420712
mainroad            0.296898
guestroom           0.255517
basement            0.187057
hotwaterheating     0.093073
airconditioning     0.452954
parking             0.384394
prefarea            0.329777
furnishingstatus   -0.304721
Name: price, dtype: float64

### Threshold=> Corr()>0.2

In [88]:
# Columns with Corr()>0.2
# area, bedrooms, bathrooms, stories, airconditioning, parking, prefarea

In [89]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [90]:
x = df.iloc[:,[1,2,3,4,9,10,11]]
y = df.iloc[:,0]
print(x.shape)
print(y.shape)
print(type(x))
print(type(y))

(545, 7)
(545,)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [91]:
x.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,airconditioning,parking,prefarea
0,7420,4,2,3,1,2,1
1,8960,4,4,4,1,3,0
2,9960,3,2,2,0,2,1
3,7500,4,2,2,1,3,1
4,7420,4,1,2,1,2,0


In [92]:
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [94]:
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.25, random_state=0)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(136, 7)
(409, 7)
(136,)
(409,)


In [95]:
m2 = LinearRegression()
m2.fit(x_train,y_train)

LinearRegression()

In [96]:
# R2 Score
print('Training Score',m2.score(x_train,y_train))
print('Testing Score',m2.score(x_test,y_test))

Training Score 0.6495185437976951
Testing Score 0.6233704703390932


In [97]:
ypred_m2 = m2.predict(x_test)
print(len(ypred_m2))

409


In [98]:
print('MSE',mean_squared_error(y_test,ypred_m2))
print('RMSE',np.sqrt(mean_squared_error(y_test,ypred_m2)))
print('MAE',mean_absolute_error(y_test,ypred_m2))
print('R2_score',r2_score(y_test,ypred_m2))
print('Testing score',m2.score(x_test,y_test))

MSE 1325109079687.8552
RMSE 1151133.823535672
MAE 840784.1571235132
R2_score 0.6233704703390932
Testing score 0.6233704703390932


### Exercise

1) For Housing.csv dataset apply</br>
a) Decision Tree Regression</br>
b) RandomForest Regression</br>
c) KNN Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
# (criterion= 'mse' or 'mae')
from sklearn.ensemble import RandomForestRegressor
# (criterion= 'mse' or 'mae')
from sklearn.neighbors import KNeighborsRegressor