## Data Encoding

### 1.Nominal/OHE Encoding


In [5]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [38]:
df=pd.DataFrame({
    'color':['red','blue','red','green','green']
})

In [22]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,red
3,green
4,green


In [23]:
encoder=OneHotEncoder()
encoded=encoder.fit_transform(df[['color']]).toarray()

In [24]:
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [26]:
encoder_df=pd.DataFrame(encoded,columns=encoder.get_feature_names_out())

In [27]:
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0


In [28]:
## For new data
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [31]:
pd.concat([df,encoder_df],axis=1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,red,0.0,0.0,1.0
3,green,0.0,1.0,0.0
4,green,0.0,1.0,0.0


In [2]:
import seaborn as sns

In [11]:
tips=sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
categorical_cols=['sex','smoker','day','time']

In [24]:
encoder=OneHotEncoder(drop='first',sparse_output=False)
## drop='first' → Drops the first category in each column to avoid dummy variable trap (removes redundancy).
## sparse=False → Ensures output is a dense NumPy array instead of a sparse matrix.
encoded_data = encoder.fit_transform(tips[categorical_cols])

In [25]:
encoded_data

array([[0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       ...,
       [1., 1., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]], shape=(244, 6))

In [None]:
encoded_data_df=pd.DataFrame(encoded_data,columns=encoder.get_feature_names_out())
## friday is dropped due to drop=first

In [27]:
encoded_data_df

Unnamed: 0,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
239,1.0,0.0,1.0,0.0,0.0,0.0
240,0.0,1.0,1.0,0.0,0.0,0.0
241,1.0,1.0,1.0,0.0,0.0,0.0
242,1.0,0.0,1.0,0.0,0.0,0.0


In [32]:
tips_final=pd.concat([tips.drop(columns=categorical_cols),encoded_data_df],axis=1)

In [33]:
tips_final

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,16.99,1.01,2,0.0,0.0,0.0,1.0,0.0,0.0
1,10.34,1.66,3,1.0,0.0,0.0,1.0,0.0,0.0
2,21.01,3.50,3,1.0,0.0,0.0,1.0,0.0,0.0
3,23.68,3.31,2,1.0,0.0,0.0,1.0,0.0,0.0
4,24.59,3.61,4,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1.0,0.0,1.0,0.0,0.0,0.0
240,27.18,2.00,2,0.0,1.0,1.0,0.0,0.0,0.0
241,22.67,2.00,2,1.0,1.0,1.0,0.0,0.0,0.0
242,17.82,1.75,2,1.0,0.0,1.0,0.0,0.0,0.0


In [None]:
## Alternate Way
# We can also use "pd.get_dummies"

### Label Encoding
assigning a unique numerical value to each category in the variable.



In [None]:
# example:
# 1.Red: 1
# 2.Green: 2
# 3.Blue: 3

In [39]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,red
3,green
4,green


In [40]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder=LabelEncoder()

In [41]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 0, 2, 1, 1])

In [42]:
lbl_encoder.transform([['green']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([1])

### Ordinal Encoding
It is used to encode categorical data that have an intrinsic order or ranking

In [None]:
## situation where we need to assign ranks

In [43]:
from sklearn.preprocessing import OrdinalEncoder


In [44]:
df=pd.DataFrame({
    'size':['small','medium','large','medium','small','large']
})

In [45]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [48]:
ord_encoder=OrdinalEncoder(categories=[['small','medium','large']])

In [52]:
encoded_size=ord_encoder.fit_transform(df[['size']])
encoded_size

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [61]:
ord_encoder.transform([['large']])



array([[2.]])

In [58]:
size_df=pd.DataFrame(encoded_size,columns=['size_En'])
size_df

Unnamed: 0,size_En
0,0.0
1,1.0
2,2.0
3,1.0
4,0.0
5,2.0


In [59]:
pd.concat([df,size_df],axis=1)

Unnamed: 0,size,size_En
0,small,0.0
1,medium,1.0
2,large,2.0
3,medium,1.0
4,small,0.0
5,large,2.0


### Target Guided Ordinal Encoding
It is the technique used to encode categorical variables base on their relationship with the target variable

In [1]:
import pandas as pd

In [2]:
df=pd.DataFrame({
    'city':['New York','Paris','London','New York','Paris'],
    'price':[200,190,340,250,320]
})

In [3]:
df

Unnamed: 0,city,price
0,New York,200
1,Paris,190
2,London,340
3,New York,250
4,Paris,320


In [6]:
mean_price=df.groupby('city')['price'].mean().to_dict()

In [7]:
mean_price

{'London': 340.0, 'New York': 225.0, 'Paris': 255.0}

In [8]:
df['city_encoded']=df['city'].map(mean_price)

In [9]:
df

Unnamed: 0,city,price,city_encoded
0,New York,200,225.0
1,Paris,190,255.0
2,London,340,340.0
3,New York,250,225.0
4,Paris,320,255.0


In [10]:
df[['price','city_encoded']]

Unnamed: 0,price,city_encoded
0,200,225.0
1,190,255.0
2,340,340.0
3,250,225.0
4,320,255.0


In [None]:
import pandas as pd

# Sample dataset
data = pd.DataFrame({
    'City': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'C', 'B'],
    'Price': [300, 500, 200, 400, 600, 250, 450, 275, 550]
})

print("Original Data:")
print(data)

# Step 1: Compute mean target (Price) for each category (City)
mean_price = data.groupby('City')['Price'].mean().sort_values()

# Step 2: Create ordinal mapping
ordinal_mapping = {city: rank+1 for rank, city in enumerate(mean_price.index)}

print("\nOrdinal Mapping:", ordinal_mapping)

# Step 3: Replace categorical values with ordinal encoding
data['City_Encoded'] = data['City'].map(ordinal_mapping)
d
print("\nTransformed Data:")
print(data)


Original Data:
  City  Price
0    A    300
1    B    500
2    C    200
3    A    400
4    B    600
5    C    250
6    A    450
7    C    275
8    B    550

Ordinal Mapping: {'C': 1, 'A': 2, 'B': 3}

Transformed Data:
  City  Price  City_Encoded
0    A    300             2
1    B    500             3
2    C    200             1
3    A    400             2
4    B    600             3
5    C    250             1
6    A    450             2
7    C    275             1
8    B    550             3


In [11]:
import seaborn as sns

In [13]:
tips=sns.load_dataset('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [None]:
df=pd.concat([tips['total_bill'],tips['time']],axis=1)
df

In [25]:
mean_bill=df.groupby('time')['total_bill'].mean().sort_values()
mean_bill

  mean_bill=df.groupby('time')['total_bill'].mean().sort_values()


time
Lunch     17.168676
Dinner    20.797159
Name: total_bill, dtype: float64

In [28]:
mapping={time:rank+1 for rank,time in enumerate(mean_bill.index)}
mapping

{'Lunch': 1, 'Dinner': 2}

In [29]:
df['time_encoded']=df['time'].map(mapping)

In [30]:
df

Unnamed: 0,total_bill,time,time_encoded
0,16.99,Dinner,2
1,10.34,Dinner,2
2,21.01,Dinner,2
3,23.68,Dinner,2
4,24.59,Dinner,2
...,...,...,...
239,29.03,Dinner,2
240,27.18,Dinner,2
241,22.67,Dinner,2
242,17.82,Dinner,2


In [35]:
tips.drop(['total_bill','time'],axis=1,inplace=True)

In [36]:
tips

Unnamed: 0,tip,sex,smoker,day,size
0,1.01,Female,No,Sun,2
1,1.66,Male,No,Sun,3
2,3.50,Male,No,Sun,3
3,3.31,Male,No,Sun,2
4,3.61,Female,No,Sun,4
...,...,...,...,...,...
239,5.92,Male,No,Sat,3
240,2.00,Female,Yes,Sat,2
241,2.00,Male,Yes,Sat,2
242,1.75,Male,No,Sat,2


In [39]:
tips_modified=pd.concat([tips,df],axis=1)
tips_modified

Unnamed: 0,tip,sex,smoker,day,size,total_bill,time,time_encoded
0,1.01,Female,No,Sun,2,16.99,Dinner,2
1,1.66,Male,No,Sun,3,10.34,Dinner,2
2,3.50,Male,No,Sun,3,21.01,Dinner,2
3,3.31,Male,No,Sun,2,23.68,Dinner,2
4,3.61,Female,No,Sun,4,24.59,Dinner,2
...,...,...,...,...,...,...,...,...
239,5.92,Male,No,Sat,3,29.03,Dinner,2
240,2.00,Female,Yes,Sat,2,27.18,Dinner,2
241,2.00,Male,Yes,Sat,2,22.67,Dinner,2
242,1.75,Male,No,Sat,2,17.82,Dinner,2
