In [4]:
# What is Data Encoding?
# Most ML algorithms can only operate on numeric inputs.
# Encoding = converting categorical/text data → numerical format.

In [3]:
# Types of Data
# Numerical Data – already numbers (e.g., age, salary)
# Categorical Data – non-numeric labels; two kinds:
#   - Nominal: no inherent order (e.g., color: red, blue)
#   - Ordinal: has a meaningful order (e.g., education: high school < bachelor < master < PhD)

In [2]:
# Nominal Data & One-Hot Encoding (OHE)
# Nominal Data
# Categorical data without order.
# Examples: Color (Red, Blue, Green), Fruit (Apple, Mango).

# One-Hot Encoding (OHE)
# Converts each category → binary vector (0 or 1).
# Each category gets its own column.
# Ensures ML algorithms don’t assume order.

In [1]:
# Tip: Avoid Label Encoding for nominal data → ML may assume a false order.

In [None]:
#Disadvantages of OHE:

# 1) High Dimensionality: Creates one column per category → many features for high-cardinality data.
# 2) Sparsity: Most entries are 0, leading to sparse matrices → inefficient storage & computation.
# 3) Not suitable for ordinal data: OHE does not capture order between categories.
# 4) Curse of Dimensionality: Too many features can slow down models and increase the risk of overfitting.

In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Build a tiny one-column DataFrame of colour labels to use as encoder input
df = pd.DataFrame(
    data=['red', 'blue', 'green', 'green', 'red', 'blue'],
    columns=['color'],
)

In [8]:
# Preview the first rows of the colour frame (head() shows five of the six rows).
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [10]:
# Create a OneHotEncoder with its default settings.
# The instance is kept in `encoder` so the same object can be fitted and then
# used to transform data in the cells below.
encoder = OneHotEncoder()

In [18]:
# Learn the distinct values of 'color', then encode that same column.
# df[['color']] (double brackets) passes a single-column DataFrame rather than
# a Series, and .toarray() converts the encoder output to a dense NumPy array.
encoded = (
    encoder
    .fit(df[['color']])
    .transform(df[['color']])
    .toarray()
)

In [19]:
# Categories are ordered alphabetically, so the columns are [blue, green, red]:
#   red   -> 0, 0, 1
#   blue  -> 1, 0, 0
#   green -> 0, 1, 0
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [23]:
# Wrap the dense one-hot array in a DataFrame so every column carries a
# readable name: get_feature_names_out() supplies one "<feature>_<category>"
# label per encoded column (color_blue, color_green, color_red here).
# index=df.index labels the encoded rows exactly like the source rows, so the
# later column-wise pd.concat with df lines up row-for-row even if df does not
# have a default 0..n-1 index.
# pandas is already imported in the imports cell, so it is not imported again.
encoder_df = pd.DataFrame(
    encoded,
    index=df.index,
    columns=encoder.get_feature_names_out(),
)

In [24]:
# Show the labelled one-hot table (columns color_blue, color_green, color_red).
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [28]:
# Put the original column and its one-hot columns side by side (joined
# column-wise on the row index) so the encoding can be checked by eye.
# This completes the colour example.
pd.concat(objs=[df, encoder_df], axis='columns')

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


In [34]:
# Repeat the exercise on seaborn's bundled 'tips' example dataset.
# NOTE: this rebinds `df`, so the colour DataFrame created earlier is no longer
# reachable under that name from here on.
import seaborn as sns
df = sns.load_dataset('tips')

In [35]:
# One-hot encode the 'sex' column of the tips data.
# A fresh encoder replaces the one fitted on 'color'; it is fitted first and
# then used to transform the same single-column frame into a dense array.
encoder = OneHotEncoder()
encoder.fit(df[['sex']])
encoded = encoder.transform(df[['sex']]).toarray()

In [36]:
# Inspect the dense one-hot array for 'sex': one row per tips record and two
# columns (sex_Female, sex_Male according to the feature names shown below).
# NOTE(review): this displays the entire array; a slice such as encoded[:5]
# would keep the output short — confirm whether the full dump is wanted.
encoded

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.

In [37]:
# Label the one-hot 'sex' array with the encoder's feature names
# (sex_Female, sex_Male) by wrapping it in a DataFrame.
# index=df.index labels the encoded rows exactly like the tips rows, so the
# column-wise pd.concat with df that follows lines up row-for-row even if df
# does not have a default 0..n-1 index.
# pandas is already imported in the imports cell, so it is not imported again.
encoder_df = pd.DataFrame(
    encoded,
    index=df.index,
    columns=encoder.get_feature_names_out(),
)

In [38]:
# Show the labelled one-hot table for 'sex' (columns sex_Female, sex_Male).
encoder_df

Unnamed: 0,sex_Female,sex_Male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0
...,...,...
239,0.0,1.0
240,1.0,0.0
241,0.0,1.0
242,0.0,1.0


In [39]:
# Attach the one-hot 'sex' columns to the tips data, joined column-wise on the
# row index, to compare the original labels with their encoding.
pd.concat(objs=[df, encoder_df], axis='columns')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,1.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0
