# Data Encoding

- Nominal / One hot encoding
- Label and ordinal encoding
- Target guided ordinal encoding

# 🔹 Nominal / One-Hot Encoding
Each category is converted into a new binary column (1 or 0), suitable for unordered categorical data.

In [1]:
import pandas as pd

# Sample data
df = pd.DataFrame({
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Green'],
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small'],
    'Purchased': [1, 1, 0, 0, 0]
})


In [2]:
one_hot = pd.get_dummies(df['Color'], prefix='Color')
print(one_hot)


   Color_Blue  Color_Green  Color_Red
0       False        False       True
1        True        False      False
2       False         True      False
3       False        False       True
4       False         True      False


In [3]:
## Using ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder

In [4]:
# instance
one_hot_encode = OneHotEncoder()

In [5]:
one_hot_encode.fit_transform(df[['Color']]).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [7]:
encoded = one_hot_encode.fit_transform(df[['Color']]).toarray()

In [6]:
import pandas as pd

In [8]:
encoded_df = pd.DataFrame(encoded, columns = one_hot_encode.get_feature_names_out())

In [10]:
encoded_df

Unnamed: 0,Color_Blue,Color_Green,Color_Red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [11]:
# for new data

one_hot_encode.transform([['Blue']]).toarray()



array([[1., 0., 0.]])

In [12]:
pd.concat([df, encoded_df],axis = 1)

Unnamed: 0,Color,Size,Purchased,Color_Blue,Color_Green,Color_Red
0,Red,Small,1,0.0,0.0,1.0
1,Blue,Medium,1,1.0,0.0,0.0
2,Green,Large,0,0.0,1.0,0.0
3,Red,Medium,0,0.0,0.0,1.0
4,Green,Small,0,0.0,1.0,0.0


In [None]:
Task 1 : From Seaborn Data Set do one hot encoding on sex or Gender Column

In [13]:
import seaborn as sns

In [15]:
df = sns.load_dataset('tips')

In [28]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [17]:
from sklearn.preprocessing import OneHotEncoder

In [18]:
ohe = OneHotEncoder()

In [29]:
gender_ohe = ohe.fit_transform(df[['sex']]).toarray()

In [32]:
data_gen = pd.DataFrame(gender_ohe,columns=ohe.get_feature_names_out())

In [33]:
data_gen

Unnamed: 0,sex_Female,sex_Male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0
...,...,...
239,0.0,1.0
240,1.0,0.0
241,0.0,1.0
242,0.0,1.0


In [None]:
Task 2 for you Do for Smoker and day  in same dataset above


# 🔹 Label Encoding
Each unique category is assigned an integer value (e.g., Red = 0, Green = 1, Blue = 2), without implying order.


In [36]:
import pandas as pd

# Sample data
df = pd.DataFrame({
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Green'],
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small'],
    'Purchased': [1, 1, 0, 0, 0]
})


In [37]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Color_Label'] = le.fit_transform(df['Color'])
print(df[['Color', 'Color_Label']])

   Color  Color_Label
0    Red            2
1   Blue            0
2  Green            1
3    Red            2
4  Green            1


In [40]:
## New Value 
le.transform([['Red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

In [41]:
le.transform([['Green']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([1])

# 🔹 Ordinal Encoding
Categories are mapped to integers based on a meaningful order (e.g., Low = 1, Medium = 2, High = 3).


In [42]:
# It is like RANKING

In [61]:
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns

import pandas as pd

# Sample data
df = pd.DataFrame({
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Green'],
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small'],
    'Purchased': [1, 1, 0, 0, 0]
})

In [62]:
# creating Instance 
og = OrdinalEncoder(categories=[['Small','Medium','Large']])

In [63]:
og.fit_transform(df[['Size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.]])

In [64]:
 # Teating new data
og.transform([['Small']])



array([[0.]])

In [65]:
 # Teating new data
og.transform([['Medium']])



array([[1.]])


# 🔹 Target Guided Ordinal Encoding
Each category is replaced with a number based on the target variable’s mean (or another metric), preserving information related to prediction.

In [66]:
import pandas as pd

# Data
data = {
    'city': ['New York', 'London', 'Paris', 'Tokyo', 'New York', 'Paris'],
    'price': [200, 150, 300, 250, 180, 320]
}

# Create DataFrame
df = pd.DataFrame(data)

In [67]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [68]:
mean_price = df.groupby('city')['price'].mean().to_dict()

In [69]:
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [70]:
df['city_encoded']= df['city'].map(mean_price)

In [71]:
df

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0
