In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the housing data set (city, area, price) and display it.
# NOTE: this is an absolute local path; adjust it to where the CSV lives on your machine.
df = pd.read_csv(filepath_or_buffer=r"E:\Git_repo\One_Hot_Data.csv")
df

Unnamed: 0,City,Area_in_sqft,Price_in_dollars
0,Mumbai,1500,550000
1,Mumbai,2500,9050000
2,Mumbai,2000,9000000
3,Pune,1800,780000
4,Pune,1650,650000
5,Pune,2100,9001005
6,Delhi,2200,9005500
7,Delhi,1200,450000
8,Delhi,1980,980000


In [3]:
# Step 1: build one indicator (dummy) column per distinct city using pandas' get_dummies.
dummies = pd.get_dummies(df["City"])
dummies

Unnamed: 0,Delhi,Mumbai,Pune
0,0,1,0
1,0,1,0
2,0,1,0
3,0,0,1
4,0,0,1
5,0,0,1
6,1,0,0
7,1,0,0
8,1,0,0


In [4]:
# Step 2: concatenate the dummy-variable columns onto the original data set, side by side.
mr_data = pd.concat(objs=[df, dummies], axis=1)
mr_data

Unnamed: 0,City,Area_in_sqft,Price_in_dollars,Delhi,Mumbai,Pune
0,Mumbai,1500,550000,0,1,0
1,Mumbai,2500,9050000,0,1,0
2,Mumbai,2000,9000000,0,1,0
3,Pune,1800,780000,0,0,1
4,Pune,1650,650000,0,0,1
5,Pune,2100,9001005,0,0,1
6,Delhi,2200,9005500,1,0,0
7,Delhi,1200,450000,1,0,0
8,Delhi,1980,980000,1,0,0


In [5]:
# Step 3: remove the text City column from the merged data set (mr_data);
# the dummy columns now carry that information.
final = mr_data.drop(columns=["City"])
# Step 4: remove one dummy-variable column (Pune) to avoid the dummy variable trap
# (that concept is explained in the next blog post).
final_data = final.drop(columns=["Pune"])
final_data

Unnamed: 0,Area_in_sqft,Price_in_dollars,Delhi,Mumbai
0,1500,550000,0,1
1,2500,9050000,0,1
2,2000,9000000,0,1
3,1800,780000,0,0
4,1650,650000,0,0
5,2100,9001005,0,0
6,2200,9005500,1,0
7,1200,450000,1,0
8,1980,980000,1,0


In [6]:
# Note: scikit-learn's LinearRegression will still fit even if no dummy column is
# dropped, but dropping one is good practice: perfectly collinear dummy columns
# make the individual coefficients hard to interpret.

from sklearn.linear_model import LinearRegression

# Features (area plus the two remaining dummy columns) and target for training.
X = final_data.loc[:, ["Area_in_sqft", "Delhi", "Mumbai"]]
Y = final_data.loc[:, "Price_in_dollars"]

# Create the model and train it with fit.
model = LinearRegression()
model.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
# Predict the price of a 2600 sq ft home in Mumbai.
# Feature order: [Area_in_sqft, Delhi, Mumbai]
model.predict(np.array([[2600, 0, 1]]))

array([11337668.71651687])

In [8]:
# Pune is the dropped (baseline) city, so both Delhi and Mumbai are set to 0.
# Feature order: [Area_in_sqft, Delhi, Mumbai]
model.predict(np.array([[3100, 0, 0]]))

array([14180478.15940978])

In [9]:
# Check how well the model fits: score() returns the R^2 coefficient of determination.
# It is computed here on the same data the model was trained on, so it is not a
# measure of performance on unseen data.
model.score(X, y=Y)

0.6453501530089485

In [10]:
# Next, do the same thing with scikit-learn's OneHotEncoder.
# Display the original data frame again as the starting point.
df

Unnamed: 0,City,Area_in_sqft,Price_in_dollars
0,Mumbai,1500,550000
1,Mumbai,2500,9050000
2,Mumbai,2000,9000000
3,Pune,1800,780000
4,Pune,1650,650000
5,Pune,2100,9001005
6,Delhi,2200,9005500
7,Delhi,1200,450000
8,Delhi,1980,980000


In [11]:
# Using the One-Hot Encoder
# Step 1: label-encode the City column (turn each city name into an integer label)
from sklearn.preprocessing import LabelEncoder
label_Enc=LabelEncoder()

In [12]:
# Apply the label encoder to the City column.
# Work on a copy: a plain `data_lbl = df` only creates a second name for the same
# frame, so the assignment below would also overwrite df.City with integers and
# break re-running the earlier cells that expect city names in `df`.
data_lbl = df.copy()
# fit_transform takes the City column as input and returns one integer label per row.
data_lbl["City"] = label_Enc.fit_transform(data_lbl["City"])
data_lbl

Unnamed: 0,City,Area_in_sqft,Price_in_dollars
0,1,1500,550000
1,1,2500,9050000
2,1,2000,9000000
3,2,1800,780000
4,2,1650,650000
5,2,2100,9001005
6,0,2200,9005500
7,0,1200,450000
8,0,1980,980000


In [13]:
# Features as a plain 2D NumPy array (not a DataFrame): column 0 is the
# label-encoded City, column 1 is Area_in_sqft. The target stays a Series.
X = data_lbl.loc[:, ["City", "Area_in_sqft"]].values
Y = data_lbl.loc[:, "Price_in_dollars"]

In [14]:
# Create the dummy-variable columns with scikit-learn.
# OneHotEncoder's `categorical_features` argument is deprecated and has been removed
# in later scikit-learn releases, so the column to encode is selected with a
# ColumnTransformer instead.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

one_hot = ColumnTransformer(
    # Column 0 of X is the categorical (label-encoded City) feature.
    transformers=[("city", OneHotEncoder(categories="auto"), [0])],
    # Keep the remaining column (Area_in_sqft) unchanged, placed after the one-hot columns.
    remainder="passthrough",
    # Always return a dense array (replaces the former .toarray() call).
    sparse_threshold=0,
)
X = one_hot.fit_transform(X)
# Avoid the dummy variable trap: keep all rows, drop the 0th (first one-hot) column.
X = X[:, 1:]
X

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1.00e+00, 0.00e+00, 1.50e+03],
       [1.00e+00, 0.00e+00, 2.50e+03],
       [1.00e+00, 0.00e+00, 2.00e+03],
       [0.00e+00, 1.00e+00, 1.80e+03],
       [0.00e+00, 1.00e+00, 1.65e+03],
       [0.00e+00, 1.00e+00, 2.10e+03],
       [0.00e+00, 0.00e+00, 2.20e+03],
       [0.00e+00, 0.00e+00, 1.20e+03],
       [0.00e+00, 0.00e+00, 1.98e+03]])

In [15]:
# Train the same LinearRegression object again, this time on the one-hot encoded array X
model.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

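# model.predict([1 , 0 , 2800])  <- raises an error: predict() expects a 2D array (one row per sample), so wrap the sample in an extra list as in the next cell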

In [16]:
# Feature order is now [Mumbai, Pune, Area_in_sqft] (the Delhi column was dropped),
# so this row is a 2800 sq ft home in Mumbai.
model.predict([[1 , 0 , 2800]])

array([13050224.95535457])

In [17]:
# Feature order: [Mumbai, Pune, Area_in_sqft]. Both dummies are 0, which here means
# Delhi (the dropped column) -- note the baseline city differs from the pandas
# version above, where Pune was the dropped column.
model.predict([[0 , 0 , 3100]])

array([14667200.76041244])

In [18]:
#Now compare the both 