## Data Encoding – Quick Note

Data encoding converts categorical variables into a format that machine learning models can understand (usually numeric).

1. Label Encoding

Assigns a unique integer to each category.

Suitable for ordinal data (where order matters).

2. One-Hot Encoding

Creates a new binary column for each category.

Suitable for nominal data (no order).

3. Ordinal Encoding

Similar to Label Encoding but follows a defined order.

In [2]:
import numpy as np
import seaborn as sns
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

In [10]:
encoder=OneHotEncoder()
df = pd.DataFrame({'Color': ['Green', 'Blue', 'Red','Red', 'Blue', 'Green', 'Blue', 'Red','Green', 'Blue', 'Red']})
encoder.fit_transform(df[['Color']]).toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [11]:
encoder.get_feature_names_out()

array(['Color_Blue', 'Color_Green', 'Color_Red'], dtype=object)

In [18]:
encoder.transform([['Blue']]).toarray(), encoder.transform([['Red']]).toarray(), encoder.transform([['Green']]).toarray()



(array([[1., 0., 0.]]), array([[0., 0., 1.]]), array([[0., 1., 0.]]))

In [13]:
encoded_data=encoder.fit_transform(df[['Color']]).toarray()

encoded_df=pd.DataFrame(encoded_data,columns=encoder.get_feature_names_out())

encoded_df

Unnamed: 0,Color_Blue,Color_Green,Color_Red
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0
6,1.0,0.0,0.0
7,0.0,0.0,1.0
8,0.0,1.0,0.0
9,1.0,0.0,0.0


# Encoding Techniques

1. Label Encoding
   - Use for ordinal data (natural order).
   - Example: Small=0, Medium=1, Large=2

2. One-Hot Encoding
   - Use for nominal data (no order).
   - Creates binary columns: Red=[1,0,0], Green=[0,1,0], Blue=[0,0,1]

3. Ordinal Encoding
   - Like label encoding but with defined ranking.
   - Example: High School=0, Bachelor's=1, Master's=2, PhD=3

4. Frequency Encoding
   - Replaces category with its frequency/count.
   - Good for high-cardinality features.

5. Target / Mean Encoding
   - Replace category with mean of target value for that category.
   - Use in supervised learning, avoid data leakage.


In [6]:
from sklearn.preprocessing import LabelEncoder

lbe_encoder=LabelEncoder()
df = pd.DataFrame({'Color': ['Green', 'Blue', 'Red','Red', 'Blue', 'Green', 'Blue', 'Red','Green', 'Blue', 'Red']})
lbe_encoder.fit_transform(df)

  y = column_or_1d(y, warn=True)


array([1, 0, 2, 2, 0, 1, 0, 2, 1, 0, 2])

In [8]:
lbe_encoder.transform([['Red']]),lbe_encoder.transform([['Blue']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


(array([2]), array([0]))

In [14]:
import pandas as pd

# Sample dataset
data = pd.DataFrame({
    'Name': [f'Person_{i}' for i in range(1, 21)],
    'Education_Level': [
        'High School', 'Bachelor', 'Master', 'PhD', 'Bachelor',
        'Master', 'High School', 'Bachelor', 'PhD', 'Master',
        'Bachelor', 'PhD', 'High School', 'Master', 'PhD',
        'Bachelor', 'High School', 'Master', 'PhD', 'Bachelor'
    ]
})

df = pd.DataFrame(data)
print(df)

         Name Education_Level
0    Person_1     High School
1    Person_2        Bachelor
2    Person_3          Master
3    Person_4             PhD
4    Person_5        Bachelor
5    Person_6          Master
6    Person_7     High School
7    Person_8        Bachelor
8    Person_9             PhD
9   Person_10          Master
10  Person_11        Bachelor
11  Person_12             PhD
12  Person_13     High School
13  Person_14          Master
14  Person_15             PhD
15  Person_16        Bachelor
16  Person_17     High School
17  Person_18          Master
18  Person_19             PhD
19  Person_20        Bachelor


In [15]:
### ordinal encoding 
from sklearn.preprocessing import OrdinalEncoder

ord_encoder=OrdinalEncoder(categories=[['High School', 'Bachelor', 'Master', 'PhD']])

ord_encoder.fit_transform(data[['Education_Level']])

array([[0.],
       [1.],
       [2.],
       [3.],
       [1.],
       [2.],
       [0.],
       [1.],
       [3.],
       [2.],
       [1.],
       [3.],
       [0.],
       [2.],
       [3.],
       [1.],
       [0.],
       [2.],
       [3.],
       [1.]])

In [17]:
ord_encoder.transform([['PhD']])



array([[3.]])

In [31]:
### target encoding 

np.random.seed(42)
categories = ['A', 'B', 'C', 'D', 'E']
data = pd.DataFrame({
    'Category': np.random.choice(categories, 50),
    'Feature1': np.random.randint(10, 100, 50),
    'Feature2': np.random.randn(50) * 10 + 50,
    'Target': np.random.randint(100, 500, 50) 
})
data.head()

Unnamed: 0,Category,Feature1,Feature2,Target
0,D,69,60.309995,414
1,E,80,59.312801,473
2,C,53,41.607825,259
3,E,17,46.907876,195
4,E,56,53.312634,332


In [32]:
data.groupby('Category')['Target'].mean().to_dict()

{'A': 310.2857142857143,
 'B': 291.6,
 'C': 325.1,
 'D': 262.46153846153845,
 'E': 350.3}

In [34]:
encoded_data=data.groupby('Category')['Target'].mean().to_dict()

data['Category_encoded']=data['Category'].map(encoded_data)

data[['Category_encoded','Feature1','Feature2','Target']]

Unnamed: 0,Category_encoded,Feature1,Feature2,Target
0,262.461538,69,60.309995,414
1,350.3,80,59.312801,473
2,325.1,53,41.607825,259
3,350.3,17,46.907876,195
4,350.3,56,53.312634,332
5,291.6,44,59.755451,279
6,325.1,87,45.208258,212
7,325.1,90,48.14341,417
8,325.1,45,38.93665,151
9,350.3,59,38.037934,367


## Case Study 

In [35]:
df=sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [42]:
from sklearn.preprocessing import OneHotEncoder
oh_encoder=OneHotEncoder()

sex_encoded=oh_encoder.fit_transform(df[['sex']]).toarray()

sex_encoded=pd.DataFrame(sex_encoded,columns=oh_encoder.get_feature_names_out())
df=pd.concat([df,sex_encoded],axis=1)

In [43]:
oh_encoder=OneHotEncoder()

smoker_encoded=oh_encoder.fit_transform(df[['smoker']]).toarray()

smoker_encoded=pd.DataFrame(smoker_encoded,columns=oh_encoder.get_feature_names_out())
df=pd.concat([df,smoker_encoded],axis=1)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0,1.0,0.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0,0.0,1.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0,0.0,1.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0,1.0,0.0


In [47]:
df['time_encoded']=df['time'].map(df.groupby('time')['total_bill'].mean().to_dict())
df

  df['time_encoded']=df['time'].map(df.groupby('time')['total_bill'].mean().to_dict())


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes,time_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,20.797159
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,20.797159
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,20.797159
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0,1.0,0.0,20.797159
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,20.797159
...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0,1.0,0.0,20.797159
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0,0.0,1.0,20.797159
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0,0.0,1.0,20.797159
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0,1.0,0.0,20.797159


In [52]:
df['day_encoded']=df['day'].map(df.groupby('day')['total_bill'].mean().to_dict())
df=df[['total_bill','tip','size','sex_Female','sex_Male','smoker_No','smoker_Yes','time_encoded','day_encoded']]

df

  df['day_encoded']=df['day'].map(df.groupby('day')['total_bill'].mean().to_dict())


Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,time_encoded,day_encoded
0,16.99,1.01,2,1.0,0.0,1.0,0.0,20.797159,21.410000
1,10.34,1.66,3,0.0,1.0,1.0,0.0,20.797159,21.410000
2,21.01,3.50,3,0.0,1.0,1.0,0.0,20.797159,21.410000
3,23.68,3.31,2,0.0,1.0,1.0,0.0,20.797159,21.410000
4,24.59,3.61,4,1.0,0.0,1.0,0.0,20.797159,21.410000
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0.0,1.0,1.0,0.0,20.797159,20.441379
240,27.18,2.00,2,1.0,0.0,0.0,1.0,20.797159,20.441379
241,22.67,2.00,2,0.0,1.0,0.0,1.0,20.797159,20.441379
242,17.82,1.75,2,0.0,1.0,1.0,0.0,20.797159,20.441379
