In [38]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [39]:
# Load the real-estate data set from CSV; the columns used below are town, area and price.
df = pd.read_csv("realEstateRates_multipleCities.csv")
# Display the loaded frame as the cell output.
df

Unnamed: 0,town,area,price
0,Nagpur,2500,550000
1,Nagpur,3000,570000
2,Nagpur,3200,620000
3,Nagpur,3800,690000
4,Nagpur,4200,730000
5,Indore,2400,570000
6,Indore,2800,600000
7,Indore,3000,650000
8,Indore,3600,740000
9,Pune,2600,690000


## Categorical variables
The towns are given as text (i.e. names). Text cannot be used directly in the model, because the model requires numeric inputs, so the town names must first be converted to numbers. Once the model is built, it can be used for prediction.

Such variables are called categorical variables. 
Categorical variables are divided into two types:
    1. Nominal variables --> here the categories do not have any inherent ordering. No ordering relation exists between them
    2. Ordinal variables --> here the categories have some sort of inherent ordering

Nominal variables examples
     1. the name of a city (or town) is a nominal variable
     2. gender as male or female is a nominal variable
     3. color as red, green, blue

Ordinal variables examples
    1. degree in college --> graduate, masters, phd
    2. customer feedback --> satisfied, neutral, dissatisfied
    3. scores --> high, medium, low

## Using OneHotEncoder of scikit-learn
1. import LabelEncoder from sklearn
2. create a LabelEncoder object to do label encoding on the town column 
3. copy dataframe to another dataframe 
4. fit & transform the town column 

In [40]:
# 1. Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# 2. Create the LabelEncoder object
labelEnc = LabelEncoder()

# 3. Take an independent copy of the dataframe.
# A plain assignment (df_labelEnc = df) would only create a second name for the
# same object, so encoding the town column later would also overwrite the town
# names in the original df.
df_labelEnc = df.copy()

# 4. Fit and transform in one step: assigns an integer code to each town name.
# The result is only displayed here; it is not stored in the dataframe in this cell.
labelEnc.fit_transform(df_labelEnc.town)

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2])

In [41]:
# Overwrite the town column with its integer label codes
df_labelEnc["town"] = labelEnc.fit_transform(df_labelEnc["town"])
df_labelEnc

Unnamed: 0,town,area,price
0,1,2500,550000
1,1,3000,570000
2,1,3200,620000
3,1,3800,690000
4,1,4200,730000
5,0,2400,570000
6,0,2800,600000
7,0,3000,650000
8,0,3600,740000
9,2,2600,690000


## SKK:
    - There is some issue with the categorical_features parameter of OneHotEncoder(). Hence X is taken as a single-column array holding only the label-encoded town column
    - The area column is formed separately from the dataframe (x_area)
    - x_area is appended (concatenated) to the one-hot encoded town columns (x_city) using numpy to create X_final


In [42]:
# Town codes as a single-column 2D NumPy array of shape (n_rows, 1), not a DataFrame
X = df_labelEnc.loc[:, ['town']].values
X

array([[1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]])

In [43]:
# Target vector: the price column as a pandas Series
y_price = df_labelEnc["price"]
y_price

0      550000
1      570000
2      620000
3      690000
4      730000
5      570000
6      600000
7      650000
8      740000
9      690000
10     800000
11     900000
12    1100000
Name: price, dtype: int64

In [44]:
# Import OneHotEncoder from sklearn
from sklearn.preprocessing import  OneHotEncoder

# Create the encoder. The categorical_features parameter is not passed here;
# the encoder is instead applied to the town-only array X in the next cell.
oneHot_Enc = OneHotEncoder(categories='auto')   # passed explicitly to remove warnings

In [45]:
# fit_transform() creates one dummy (0/1) column per town code, i.e. three columns here;
# .toarray() turns the encoder's result into a dense NumPy array.
# Columns follow the label codes 0, 1, 2 (Indore, Nagpur, Pune according to the outputs shown earlier).
x_city = oneHot_Enc.fit_transform(X).toarray()
x_city

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [46]:
# Area as a single-column 2D array so it can be joined column-wise with the dummy columns
x_area = df_labelEnc.loc[:, ['area']].values
x_area

array([[2500],
       [3000],
       [3200],
       [3800],
       [4200],
       [2400],
       [2800],
       [3000],
       [3600],
       [2600],
       [3200],
       [3600],
       [4000]], dtype=int64)

In [47]:
# Join the dummy columns and the area column side by side
X_final = np.hstack((x_city, x_area))
X_final

array([[0.0e+00, 1.0e+00, 0.0e+00, 2.5e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.8e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.4e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.8e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.0e+03]])


## Dummy variable trap and its handling
    Drop one of the dummy variable columns. 
    This is to avoid the dummy variable trap (which would otherwise distort the machine learning model). 
    Any one dummy variable can be derived from the rest (i.e. the 3rd can be derived from the 1st and 2nd), so keeping all of them adds no information.

In [48]:
# To avoid the dummy variable trap, drop the first dummy column (town code 0).
# X_final is rebuilt from x_city and x_area instead of slicing X_final in place,
# so re-running this cell cannot strip a further column each time.
# Resulting column order: [dummy for code 1, dummy for code 2, area].
X_final = np.concatenate((x_city[:, 1:], x_area), axis=1)
X_final

array([[1.0e+00, 0.0e+00, 2.5e+03],
       [1.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 3.8e+03],
       [1.0e+00, 0.0e+00, 4.2e+03],
       [0.0e+00, 0.0e+00, 2.4e+03],
       [0.0e+00, 0.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 3.2e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 4.0e+03]])

In [49]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Fit on the target built earlier in the notebook (y_price);
# the name `y` is not defined in any of the preceding cells.
model.fit(X_final, y_price)

# R^2 score on the same data the model was trained on (not a held-out test score)
model.score(X_final, y_price)

0.9072810389654646

In [50]:
#predict price for area for a particular city 
#Lets predict for 5200 and for Pune city ---- so for  area =5200, Nagpur = 0, pune = 1
x_input = [0, 1,5200]
model.predict([x_input])

array([1187542.91251605])

In [51]:
# Predict for Indore with area = 4800
# --> Nagpur = 0, Pune = 0
x_input = [ 0, 0, 4800]
model.predict([x_input])

array([955042.91251561])

In [52]:
# Predict for Nagpur with area = 5300
# --> Nagpur = 1, Pune = 0
x_input = [1, 0, 5300]
model.predict([x_input])

array([965775.19380077])