In [3]:
import pandas as pd
# pandas is aliased as pd
import numpy as np
# numpy is aliased as np
import matplotlib.pyplot as plt
# pyplot is aliased as plt

In [4]:
# Load the startups dataset from a CSV file located in the working directory.
df = pd.read_csv('50_Startups.csv')
df.head() # preview the first 5 rows

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
# Dimensions of the dataset as (rows, columns).
df.shape
# number of rows = 50
# number of columns = 5

(50, 5)

In [6]:
# Count the missing values in each column.
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [7]:
# Inspect the dtype of every column.
df.dtypes
# State is the only non-numeric (object) column, i.e. the categorical feature to encode

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [8]:
# List the distinct values present in the State column
df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [9]:
# Number of rows for each state.
df['State'].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

### Methods of Encoding Categorical Data

In [10]:
# Work on a copy so that columns added for this method do not touch the original df.
df1 = df.copy()
df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


### 1) replace() or map()
Both functions take a dict whose keys are the categories to be replaced and whose values
are the integers used to encode those categories.

If a column has a large number of categories (say, more than 50 or 100), writing this dict by hand becomes impractical, so this method is not a good choice for encoding.

In [11]:
# Re-check the category labels; they become the keys of the mapping dict used next.
df['State'].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [12]:
# Encode State with Series.replace(): the dict keys are the category labels and
# the dict values are the integer codes that stand in for them.
# The explicit astype('int64') keeps the encoded column an integer column on
# every pandas version; pandas 2.2 deprecates the silent downcast of an object
# column inside replace(), so the dtype must not be left to that behaviour.
# A label missing from the dict would stay a string and make the cast fail,
# which surfaces an incomplete mapping instead of hiding it.
df1['State_using_rep'] = (
    df1['State']
    .replace({'New York': 0, 'California': 1, 'Florida': 2})
    .astype('int64')
)
df1['State_using_rep'].value_counts()

0    17
1    17
2    16
Name: State_using_rep, dtype: int64

In [13]:
# Same encoding through Series.map(): each label is looked up in the dict and
# replaced by its integer code.
df1['State_using_map'] = df1['State'].map({
    'New York': 0,
    'California': 1,
    'Florida': 2,
})
df1['State_using_map'].value_counts()

0    17
1    17
2    16
Name: State_using_map, dtype: int64

In [14]:
# dtypes after adding the two encoded columns; the original State column is still object.
df1.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
State_using_rep      int64
State_using_map      int64
dtype: object

In [15]:
# Preview: State_using_rep and State_using_map hold the same codes for every row shown.
df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_using_rep,State_using_map
0,165349.2,136897.8,471784.1,New York,192261.83,0,0
1,162597.7,151377.59,443898.53,California,191792.06,1,1
2,153441.51,101145.55,407934.54,Florida,191050.39,2,2
3,144372.41,118671.85,383199.62,New York,182901.99,0,0
4,142107.34,91391.77,366168.42,Florida,166187.94,2,2


In [16]:
# The original df is unchanged: the encoded columns were added to the copy df1 only.
df.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

### 2) LabelEncoder()

In [17]:
# Fresh copy of the original data for the LabelEncoder example.
df2 = df.copy()
df2.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [18]:
# Category frequencies before encoding, for comparison with the counts after encoding.
df2['State'].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
# Fit the encoder on the State labels and overwrite the column with integer codes.
# As df2.head() shows afterwards, the codes follow the alphabetical order of the
# labels: California -> 0, Florida -> 1, New York -> 2.
# NOTE(review): scikit-learn documents LabelEncoder as a tool for target labels (y);
# for an input feature OrdinalEncoder / OneHotEncoder is the documented choice.
# Confirm that using it on a feature column is intentional for this demo.
lb = LabelEncoder()
df2['State'] = lb.fit_transform(df2['State'])

In [21]:
# Same frequencies as before (17 / 17 / 16), now keyed by the integer codes.
df2['State'].value_counts()

2    17
0    17
1    16
Name: State, dtype: int64

In [22]:
# State is now an integer column (int32 in this run) instead of object.
df2.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State                int32
Profit             float64
dtype: object

In [23]:
# Preview the frame with State replaced by its integer code.
df2.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


### 3) get_dummies()
1) One new column is created for each category (so the number of new columns equals the number of categories in that column). <br>
2) In the newly created columns, each row contains exactly one 1; all other values in that row are 0.<br>
3) It returns a DataFrame<br>
<pre>
Prod  Prod_A   Prod_B  Prod_C  Prod_D
A       1        0       0       0      
B       0        1       0       0
D       0        0       0       1
A       1        0       0       0
C       0        0       1       0
B       0        1       0       0 
A       1        0       0       0 
</pre>

In [24]:
# Fresh copy of the original data for the get_dummies example.
df3 = df.copy()
df3.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [25]:
# One-hot encode State with pandas: the State column is replaced by one
# indicator column per category, named State_<category>.
df_gd = pd.get_dummies(df3, columns=['State'])
df_gd.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


In [26]:
# dtypes after get_dummies: State is replaced by three indicator columns (uint8 in this run).
df_gd.dtypes

R&D Spend           float64
Administration      float64
Marketing Spend     float64
Profit              float64
State_California      uint8
State_Florida         uint8
State_New York        uint8
dtype: object

### 4) OneHot Encoder
1) It produces the same encoding as pd.get_dummies()<br>
2) The main difference: the result is not a DataFrame; after calling `.toarray()` on the encoder's output we get a NumPy array<br>
3) The encoded values have a float data type

In [27]:
# Fresh copy of the original data for the OneHotEncoder example.
df4 = df.copy()

In [28]:
# Preview the copy before encoding.
df4.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [29]:
# State is still an object column at this point.
df4.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [30]:
from sklearn.preprocessing import OneHotEncoder
# Encoder created with its default settings; it is fitted in the next cell.
ohc = OneHotEncoder()

In [31]:
# Fit the encoder on the State column (passed as a one-column DataFrame via the
# double brackets) and convert the encoder's output to an array with .toarray().
# Each row of the result contains a single 1 marking that row's state.
res = ohc.fit_transform(df4[['State']]).toarray()
res

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0

In [32]:
# Compare the State values with the rows of `res` to read off the column order.
df.head()
# Column order of the one-hot array `res`: California, Florida, New York
# (row 0 is New York and res[0] is [0., 0., 1.])

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [33]:
# Wrap the one-hot array in a DataFrame.
# The column labels come from the fitted encoder (ohc.categories_[0] lists the
# categories of the single encoded feature in the same order as the columns of
# `res`) instead of a hand-typed list, so the names cannot drift out of step
# with the data if the categories change.
# Reusing df4's index keeps the rows aligned with df4 when the frames are
# combined side by side.
states = pd.DataFrame(res, columns=ohc.categories_[0], index=df4.index)
states.head()

Unnamed: 0,California,Florida,New York
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [34]:
# Place the one-hot columns to the right of df4's columns; rows are matched on
# the index. The original State column is kept in the result.
df_ohc = pd.concat([df4, states], axis='columns')
df_ohc.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,California,Florida,New York
0,165349.2,136897.8,471784.1,New York,192261.83,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,California,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Florida,191050.39,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,New York,182901.99,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,Florida,166187.94,0.0,1.0,0.0


In [36]:
# dtypes of the combined frame: the three one-hot columns are float64 and the
# original State column is still present as object.
df_ohc.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
California         float64
Florida            float64
New York           float64
dtype: object