In [22]:
import numpy as np
import pandas as pd
from sklearn import linear_model

In [4]:
prices={'town':['pune','pune','pune','pune','pune','Mumbai','Mumbai','Mumbai','Mumbai','Mumbai','Delhi','Delhi','Delhi','Delhi','Delhi'],
        'area':[2600,3000,3200,3600,4000,2600,3000,3200,3600,4000,2600,3000,3200,3600,4000],
        'prices':[550000,565000,610000,680000,725000,650050,765000,790000,800000,825000,557000,585000,710000,780000,825000]}

In [5]:
df=pd.DataFrame(prices)

In [6]:
df

Unnamed: 0,town,area,prices
0,pune,2600,550000
1,pune,3000,565000
2,pune,3200,610000
3,pune,3600,680000
4,pune,4000,725000
5,Mumbai,2600,650050
6,Mumbai,3000,765000
7,Mumbai,3200,790000
8,Mumbai,3600,800000
9,Mumbai,4000,825000


In [16]:
dummies=pd.get_dummies(df.town)  #creating dummy variables

In [12]:
merged=pd.concat([df,dummies],axis='columns')

In [13]:
merged

Unnamed: 0,town,area,prices,Delhi,Mumbai,pune
0,pune,2600,550000,False,False,True
1,pune,3000,565000,False,False,True
2,pune,3200,610000,False,False,True
3,pune,3600,680000,False,False,True
4,pune,4000,725000,False,False,True
5,Mumbai,2600,650050,False,True,False
6,Mumbai,3000,765000,False,True,False
7,Mumbai,3200,790000,False,True,False
8,Mumbai,3600,800000,False,True,False
9,Mumbai,4000,825000,False,True,False


In [14]:
final=merged.drop(['town'],axis='columns')

In [15]:
final

Unnamed: 0,area,prices,Delhi,Mumbai,pune
0,2600,550000,False,False,True
1,3000,565000,False,False,True
2,3200,610000,False,False,True
3,3600,680000,False,False,True
4,4000,725000,False,False,True
5,2600,650050,False,True,False
6,3000,765000,False,True,False
7,3200,790000,False,True,False
8,3600,800000,False,True,False
9,4000,825000,False,True,False


In [18]:
final=final.replace({True:1,False:0})

In [19]:
final

Unnamed: 0,area,prices,Delhi,Mumbai,pune
0,2600,550000,0,0,1
1,3000,565000,0,0,1
2,3200,610000,0,0,1
3,3600,680000,0,0,1
4,4000,725000,0,0,1
5,2600,650050,0,1,0
6,3000,765000,0,1,0
7,3200,790000,0,1,0
8,3600,800000,0,1,0
9,4000,825000,0,1,0


As we are using one-hot encoding, we use dummy variable to create dummy attributes, these attributes are multicollinear or highly correlated attributes, hence, by this we can say that occurence of one says the absence of other, showing the highly correlated nature of the attributes. When using a regression model this approach might take it to the dummy variable trap hence to avoid this we remove any 1 of the dummy variable from the n dummy variables created.

In [20]:
final=final.drop(['Delhi'],axis='columns')

In [21]:
final

Unnamed: 0,area,prices,Mumbai,pune
0,2600,550000,0,1
1,3000,565000,0,1
2,3200,610000,0,1
3,3600,680000,0,1
4,4000,725000,0,1
5,2600,650050,1,0
6,3000,765000,1,0
7,3200,790000,1,0
8,3600,800000,1,0
9,4000,825000,1,0


In [23]:
reg=linear_model.LinearRegression()

In [25]:
x=final.drop(['prices'],axis='columns')
y=final['prices']
x

Unnamed: 0,area,Mumbai,pune
0,2600,0,1
1,3000,0,1
2,3200,0,1
3,3600,0,1
4,4000,0,1
5,2600,1,0
6,3000,1,0
7,3200,1,0
8,3600,1,0
9,4000,1,0


In [26]:
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     650050
6     765000
7     790000
8     800000
9     825000
10    557000
11    585000
12    710000
13    780000
14    825000
Name: prices, dtype: int64

In [27]:
reg.fit(x,y)

In [29]:
reg.predict([[5000,0,1]])



array([887831.71232877])

In [30]:
reg.predict([[4000,1,0]])



array([875613.97260274])

In [31]:
reg.predict([[5000,0,0]])



array([953231.71232877])

In [32]:
reg.score(x,y)

0.8945419591010071

## Using one hot encoding from sklearn
#### Label Encoding

In [34]:
from sklearn.preprocessing import LabelEncoder

In [35]:
le=LabelEncoder()

In [36]:
dfle=df
dfle['town']=le.fit_transform(dfle['town'])

In [37]:
dfle

Unnamed: 0,town,area,prices
0,2,2600,550000
1,2,3000,565000
2,2,3200,610000
3,2,3600,680000
4,2,4000,725000
5,1,2600,650050
6,1,3000,765000
7,1,3200,790000
8,1,3600,800000
9,1,4000,825000


In [53]:
x=dfle[['town','area']].values

x

array([[   2, 2600],
       [   2, 3000],
       [   2, 3200],
       [   2, 3600],
       [   2, 4000],
       [   1, 2600],
       [   1, 3000],
       [   1, 3200],
       [   1, 3600],
       [   1, 4000],
       [   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000]], dtype=int64)

In [54]:
y=dfle['prices']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     650050
6     765000
7     790000
8     800000
9     825000
10    557000
11    585000
12    710000
13    780000
14    825000
Name: prices, dtype: int64

In [67]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])],     remainder='passthrough')
# x=np.array(columnTransformer.fit_transform(x),dtype=np.str)
x = np.array(columnTransformer.fit_transform(x), dtype=np.float64)

In [68]:
x

array([[0.0e+00, 1.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 1.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 1.0e+00, 3.2e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 1.0e+00, 4.0e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03]])

In [69]:
x=x[:,1:]

In [70]:
x

array([[1.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 1.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03]])

In [71]:

reg1=linear_model.LinearRegression()

reg1.fit(x,y)

In [72]:
reg1.predict([[1,0,0,2600.]])

array([587885.13698683])