## Before we go to the modelling part, we need to encode the Categorical Data,
## There are different ways of encoding categorical data, i will list down some of them here

### Import the required libraries

In [1]:
import pandas as pd
import numpy as np

In [98]:
# For this purpose, i am taking the data of Flights

data = pd.read_csv("Flights.csv")
print("The shape of the data is : ", data.shape)
print("The head of the data ")
data.head()


The shape of the data is :  (1052, 16)
The head of the data 


Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2014,1,1,1.0,96.0,235.0,70.0,AS,N508AS,145,PDX,ANC,194.0,1542,0.0,1.0
1,2014,1,1,4.0,-6.0,738.0,-23.0,US,N195UW,1830,SEA,CLT,252.0,2279,0.0,4.0
2,2014,1,1,8.0,13.0,548.0,-4.0,UA,N37422,1609,PDX,IAH,201.0,1825,0.0,8.0
3,2014,1,1,28.0,-2.0,800.0,-23.0,US,N547UW,466,PDX,CLT,251.0,2282,0.0,28.0
4,2014,1,1,34.0,44.0,325.0,43.0,AS,N762AS,121,SEA,ANC,201.0,1448,0.0,34.0


In [6]:
print("The data types of the each feature in the data is : ", data.dtypes)

The data types of the each feature in the data is :  year           int64
month          int64
day            int64
dep_time     float64
dep_delay    float64
arr_time     float64
arr_delay    float64
carrier       object
tailnum       object
flight         int64
origin        object
dest          object
air_time     float64
distance       int64
hour         float64
minute       float64
dtype: object


#### Considering only the object data

In [46]:
obj_data = data.select_dtypes(include = ["object"])
obj_data.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [47]:
# Checking for the nulls in the object data
obj_data.isnull().sum()

carrier    0
tailnum    9
origin     0
dest       0
dtype: int64

In [50]:
# We see that there are 9 nulls in the Tail Number.
# Filling these nulls with the mode of tailnum

tail_mode = obj_data["tailnum"].mode()[0]
obj_data["tailnum"].fillna(tail_mode, inplace = True) 

In [52]:
# Checking for the nulls in the object data after replacing the nulls with the mode of tailnum
obj_data.isnull().sum()

carrier    0
tailnum    0
origin     0
dest       0
dtype: int64

### 1. Encoding Categorical Data using Replace Method


In [99]:
# Replace is one of the most basic methods to encode the categorical data with the desired values.
# Let's encode the origin categorical data 

In [58]:
# Making a local copy of the data frame
obj_data_rep = obj_data.copy()
obj_data_rep.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [59]:
print("The unique values of the origin are : " , obj_data["origin"].unique() )

The unique values of the origin are :  ['PDX' 'SEA']


In [63]:
# Map the categories with the desired values
replace_map = {"origin" : {"PDX" : 0 ,"SEA": 1} }

In [64]:
obj_data_rep.replace(replace_map, inplace = True)
obj_data_rep.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,0,ANC
1,US,N195UW,1,CLT
2,UA,N37422,0,IAH
3,US,N547UW,0,CLT
4,AS,N762AS,1,ANC


In [65]:
# Since the number of unique categories in "origin" feature are only 2, it was easy to set the values.
# But what if, there are lot of unique categories, it will be difficult to set all of them manually.
# Let's take the example of "carrier" feature

In [73]:
obj_data_rep["carrier"].unique()
carrier_cat = obj_data_rep["carrier"].unique().tolist()
print("The unique values of the carrier feature is : ", carrier_cat)

The unique values of the carrier feature is :  ['AS', 'US', 'UA', 'DL', 'AA', 'F9', 'VX', 'OO', 'WN', 'B6', 'HA']


In [74]:
carrier_map = {"carrier" : {value : ind + 1 for ind, value in enumerate(carrier_cat)}}
print("The carrier_map is : ", carrier_map)

The carrier_map is :  {'carrier': {'AS': 1, 'US': 2, 'UA': 3, 'DL': 4, 'AA': 5, 'F9': 6, 'VX': 7, 'OO': 8, 'WN': 9, 'B6': 10, 'HA': 11}}


In [75]:
# Replace the carrier categories
obj_data_rep.replace(carrier_map, inplace = True)
obj_data_rep.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,1,N508AS,0,ANC
1,2,N195UW,1,CLT
2,3,N37422,0,IAH
3,2,N547UW,0,CLT
4,1,N762AS,1,ANC


In [76]:
# Check the data types of the features "carrier" and "dest"
obj_data_rep.dtypes

carrier     int64
tailnum    object
origin      int64
dest       object
dtype: object

### 2. Encoding Categorical Data using  Label Encoding

In [77]:
# The Label Encoding converts each categorical value into a numerical value.
# The range of numerical values lie between 0 and no.of categories-1
# The first unique category is labelled 0 and the next unique category as 2 and there on for each each unique category.

In [81]:
obj_data_lab_enc = obj_data.copy()
obj_data_lab_enc.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [85]:
print("The number of unique categories in tailnum feature is : ", len(obj_data_lab_enc["tailnum"].unique().tolist()))

The number of unique categories in tailnum feature is :  609


In [86]:
# Encoding the tailnum column
obj_data_lab_enc["tailnum"] = obj_data_lab_enc["tailnum"].astype("category")
obj_data_lab_enc["tailnum"] = obj_data_lab_enc["tailnum"].cat.codes
obj_data_lab_enc.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,246,PDX,ANC
1,US,16,SEA,CLT
2,UA,116,PDX,IAH
3,US,288,PDX,CLT
4,AS,469,SEA,ANC


#### 2.1 Label Encoding with sklearn LabelEncoder

In [87]:
from sklearn.preprocessing import LabelEncoder

In [88]:
obj_data_lab_enc_sklearn = obj_data.copy()
obj_data_lab_enc_sklearn.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [89]:
# Initialize the encoder
le = LabelEncoder()

# Fit the Encoder object
obj_data_lab_enc_sklearn["tailnum"] = le.fit_transform(obj_data_lab_enc_sklearn["tailnum"])

# Print the head of the data frame
obj_data_lab_enc_sklearn.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,246,PDX,ANC
1,US,16,SEA,CLT
2,UA,116,PDX,IAH
3,US,288,PDX,CLT
4,AS,469,SEA,ANC


### 3 Encoding a sepcific category to one value and the rest of categories to another value

In [91]:
# When we want to encode a particular category to one value and the rest of categories to another value,
# We can acheieve this using numpy's where ()
# Let's take the example of "carrier" feature.
# We shall encode the carrier from US to 0 and the rest of the carriers to 1

In [93]:
obj_data_part = obj_data.copy()
obj_data_part.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [94]:
obj_data_part["carrier"] = np.where(obj_data_part["carrier"].str.contains('US'),0,1)
obj_data_part.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,1,N508AS,PDX,ANC
1,0,N195UW,SEA,CLT
2,1,N37422,PDX,IAH
3,0,N547UW,PDX,CLT
4,1,N762AS,SEA,ANC


### 4. One Hot Encoding

In [None]:
# The basic strategy of One Hot Encoding is to convert each category value into a new column 
# and assign a 1 or 0(True/False) value to the column.
# This has the benefit of not weighting a value improperly.

In [96]:
obj_data_ohe = obj_data.copy()
obj_data_ohe.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [97]:
obj_data_ohe = pd.get_dummies(obj_data_ohe, columns = ["dest","origin"])
obj_data_ohe.head()

Unnamed: 0,carrier,tailnum,dest_ABQ,dest_ANC,dest_ATL,dest_AUS,dest_BOS,dest_BUR,dest_CLE,dest_CLT,...,dest_SEA,dest_SFO,dest_SJC,dest_SLC,dest_SMF,dest_SNA,dest_STL,dest_TUS,origin_PDX,origin_SEA
0,AS,N508AS,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,US,N195UW,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,UA,N37422,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,US,N547UW,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,AS,N762AS,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### 4.1 One Hot Encoding with sklearn OneHotEncoder

In [100]:
from sklearn.preprocessing import OneHotEncoder

In [101]:
obj_data_sklearn_ohe = obj_data.copy()
obj_data_sklearn_ohe.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [124]:
# Initialize the One Hot Encoder
ohe = OneHotEncoder()

obj_data_sklearn_ohe["carrier"] = obj_data_sklearn_ohe["carrier"].astype("category")

# Apply it on the desired feature
ohe_res = ohe.fit_transform(obj_data_sklearn_ohe[["carrier"]]).toarray()


In [125]:
# Convert the result back to a data frame
ohe_res_df = pd.DataFrame(ohe_res, columns = ohe.categories_)
ohe_res_df.head()

Unnamed: 0,AA,AS,B6,DL,F9,HA,OO,UA,US,VX,WN
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
# Concat the resultant data frame

obj_data_sklearn_ohe_df = pd.concat( [obj_data_sklearn_ohe, ohe_res_df], axis=1)
obj_data_sklearn_ohe_df.head()

Unnamed: 0,carrier,tailnum,origin,dest,"(AA,)","(AS,)","(B6,)","(DL,)","(F9,)","(HA,)","(OO,)","(UA,)","(US,)","(VX,)","(WN,)"
0,AS,N508AS,PDX,ANC,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,US,N195UW,SEA,CLT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,UA,N37422,PDX,IAH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,US,N547UW,PDX,CLT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,AS,N762AS,SEA,ANC,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 5. Binary Encoder

In [127]:
# The Binary Encoder encodes the categories to ordinals/numbers first.
# Then these numbers are converted to binary codes.
# Then the digits from the binary codes are split into separate columns.
# In case of Binary Encoder, the number of dimensions are fewer compared to One Hot Encoding

In [128]:
obj_data_be = obj_data.copy()
obj_data_be.head()

Unnamed: 0,carrier,tailnum,origin,dest
0,AS,N508AS,PDX,ANC
1,US,N195UW,SEA,CLT
2,UA,N37422,PDX,IAH
3,US,N547UW,PDX,CLT
4,AS,N762AS,SEA,ANC


In [None]:
import category_encoders as ce

be = ce.BinaryEncoder(cols = ["carrier"])

obj_data_binary = be.fit_transform(obj_data_be)
obj_data_binary.head()