# OneHotEncoding

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
df = pd.read_csv('homeprices_onehotencode.csv')

In [4]:
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   town    13 non-null     object
 1   area    13 non-null     int64 
 2   price   13 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 440.0+ bytes


### here we have town names, we have to convert them in numbers so we can use machine learning model

In [6]:
df.town.value_counts()

monroe township    5
west windsor       4
robinsville        4
Name: town, dtype: int64

In [7]:
df.town.unique() ,df.town.nunique()

(array(['monroe township', 'west windsor', 'robinsville'], dtype=object), 3)

### here we have 3 towns as 'monroe township', 'west windsor', 'robinsville', so we have to give dummy variables

### these are nominal catogorical data as we cant compare town which is greater or not, so we cant number them like 1,2,3
### we have to number them in term of 0, 1 that we can do in onehotencoding

### first we will try with pandas method to get dummy vaiables

In [9]:
pd.get_dummies(df.town)

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


#### it converts our town column into three columns with 0,1 assignment to each town

In [10]:
dummies = pd.get_dummies(df.town)

### we have to add these column to our data set

In [11]:
merge = pd.concat([df,dummies],axis=1)

In [12]:
merge

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


#### here now we dont need town column and we will drop one duumy variable column to avoid dummy vaiable trap

In [13]:
final = merge.drop(['town','west windsor'],axis=1)

In [14]:
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


#### we have all numeric data so we can use machine learning madel
#### here we can use LinearRegression
#### we are treating all tows at equal level so we can predict prices from area
#### we take town dummy columns also in our input so we can check for which town what will price 

In [15]:
X = final.drop(['price'],axis=1)

In [16]:
y = final.price

In [17]:
model = LinearRegression()

In [19]:
model.fit(X,y)

LinearRegression()

In [20]:
model.score(X,y)*100

95.73929037221873

### Our model is 95% accurate

### lets prdict some price

In [21]:
model.predict([[3000,0,1]])



array([616155.12792948])

#### here we predicted price for area 3000 from robinsville town

#### let we predict price from west windsor town of area 3200

In [76]:
model.predict([[3200,0,1]])



array([641534.61621157])

#### compared with our given data our predicted values are good

## OneHotEncoding

### let we try same dat set with sklearns OneHotEncoding

In [23]:
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


#### before using onehotencoder we have to lable encoding

In [41]:
from sklearn.preprocessing import LabelEncoder

In [42]:
lab = LabelEncoder()

In [43]:
df1 = df

In [44]:
lab.fit_transform(df1.town)

array([0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1], dtype=int64)

#### here we assigned value for 'monroe township', 'west windsor', 'robinsville' as 0,2,1 resp.

#### let we assign these values to towns from our main data set

In [45]:
df1.town = lab.fit_transform(df1.town)

In [29]:
df1

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


#### here we have X in array form, not data frame

In [46]:
from sklearn.preprocessing import OneHotEncoder

In [47]:
ohe = OneHotEncoder()

In [48]:
df2 = pd.DataFrame(ohe.fit_transform(df[['town']]).toarray(),columns=['monroe township', 'west windsor', 'robinsville'])
df2

Unnamed: 0,monroe township,west windsor,robinsville
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
5,0.0,0.0,1.0
6,0.0,0.0,1.0
7,0.0,0.0,1.0
8,0.0,0.0,1.0
9,0.0,1.0,0.0


In [49]:
final = pd.concat([df1,df2],axis=1)

In [50]:
final

Unnamed: 0,town,area,price,monroe township,west windsor,robinsville
0,0,2600,550000,1.0,0.0,0.0
1,0,3000,565000,1.0,0.0,0.0
2,0,3200,610000,1.0,0.0,0.0
3,0,3600,680000,1.0,0.0,0.0
4,0,4000,725000,1.0,0.0,0.0
5,2,2600,585000,0.0,0.0,1.0
6,2,2800,615000,0.0,0.0,1.0
7,2,3300,650000,0.0,0.0,1.0
8,2,3600,710000,0.0,0.0,1.0
9,1,2600,575000,0.0,1.0,0.0


#### for avoiding dummy vaiable trap we drop robinsville trap and here we dont need town column

In [54]:
final = final.drop(['town','robinsville'],axis=1)

In [55]:
final

Unnamed: 0,area,price,monroe township,west windsor
0,2600,550000,1.0,0.0
1,3000,565000,1.0,0.0
2,3200,610000,1.0,0.0
3,3600,680000,1.0,0.0
4,4000,725000,1.0,0.0
5,2600,585000,0.0,0.0
6,2800,615000,0.0,0.0
7,3300,650000,0.0,0.0
8,3600,710000,0.0,0.0
9,2600,575000,0.0,1.0


#### so here we have our final data set has all numerical values

#### so here we can use linear regression model

In [56]:
model = LinearRegression()

In [62]:
final.columns

Index(['area', 'price', 'monroe township', 'west windsor'], dtype='object')

In [65]:
X = final.drop(['price'],axis=1)

In [67]:
y = final.price

In [68]:
model.fit(X,y)

LinearRegression()

In [70]:
model.score(X,y)*100

95.73929037221873

#### let we predict price from west windsor town of area 4300

In [75]:
model.predict([[3200,0,1]])



array([641534.61621157])

#### so her we get same result from pandas get_dummies and one hot encoding