In [None]:
# Target-Guided Ordinal Encoding: categories are ordered based on their relationship with the target variable.
# If the categories influence the target but have no natural order → use Target-Guided Ordinal Encoding.

In [2]:
import pandas as pd

# Build a small toy dataset for the encoding walkthrough.
#   - 'city'  : nominal categorical feature (no inherent ordering)
#   - 'price' : numeric target variable
# The target is used further down to derive an encoding for 'city'
# (Target-Guided Ordinal Encoding).
df = pd.DataFrame(
    data=[
        ('New York', 200),
        ('London', 150),
        ('Paris', 300),
        ('Tokyo', 250),
        ('New York', 180),
        ('Paris', 320),
    ],
    columns=['city', 'price'],
)


In [3]:
# Display the sample DataFrame ('city' and 'price' columns)
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [5]:
# Average the target ('price') within each 'city' group.
# The per-city means show how every category relates to the target;
# the same means are what the encoding of 'city' is built from.
df.groupby('city')['price'].agg('mean')

Unnamed: 0_level_0,price
city,Unnamed: 1_level_1
London,150.0
New York,190.0
Paris,310.0
Tokyo,250.0


In [8]:
# Build the lookup table for the encoding:
#   1. group the rows by 'city'
#   2. take the mean of the target 'price' in each group
#   3. turn the resulting Series into a plain {city: mean price} dictionary
mean_price = (
    df.groupby('city')['price']
    .mean()
    .to_dict()
)

# Show the resulting mapping
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [12]:
# Look up every row's city in the 'mean_price' dictionary, giving one
# mean-price value per row, then store that as the new 'city_encoded' column.
# Each city name is thereby replaced by the average target value of its group.
city_mean_per_row = df['city'].map(mean_price)
df['city_encoded'] = city_mean_per_row

In [13]:
# Display the DataFrame again to check the new 'city_encoded' column
df

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0


In [18]:
# Keep only the target ('price') next to the encoded feature ('city_encoded')
# to check how the encoding lines up with the target.
df.loc[:, ['price', 'city_encoded']]
# 'city_encoded' now stands in for the raw 'city' column, so 'city' can be ignored
# and these two columns (encoded feature + target) used to train a model.
# NOTE(review): the encoding was derived from 'price' on these same rows; on real data,
# learn the mapping from the training split only to avoid target leakage.
# Done!

Unnamed: 0,price,city_encoded
0,200,190.0
1,150,150.0
2,300,310.0
3,250,250.0
4,180,190.0
5,320,310.0


In [19]:
# Now trying it myself!

In [20]:
# Load seaborn's 'tips' example dataset into a DataFrame.
# NOTE: this rebinds `df`, so the city/price frame from the first example is no longer reachable under that name.
import seaborn as sns

df = sns.load_dataset(name='tips')

In [21]:
# Display the loaded tips DataFrame
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [22]:
# Let's encode the 'time' column based on the mean of the 'total_bill' column

In [24]:
# Mean of 'total_bill' for each 'time' category (Lunch / Dinner).
# 'time' is loaded as a pandas categorical here, so `observed` is passed explicitly:
# relying on the default triggers pandas' FutureWarning about that default changing,
# and observed=True limits the result to categories that actually occur in the data.
df.groupby('time', observed=True)['total_bill'].mean()

  df.groupby('time')['total_bill'].mean()


Unnamed: 0_level_0,total_bill
time,Unnamed: 1_level_1
Lunch,17.168676
Dinner,20.797159


In [25]:
# Build the {time: mean total_bill} lookup used to encode the 'time' column.
# `observed=True` is passed explicitly because 'time' is categorical: it silences
# pandas' FutureWarning about the changing default and keeps only categories that
# occur in the data, so the dictionary never holds NaN means for unused categories.
# The name `mean_price` is kept because the following cells read it, even though
# the values here are mean bills rather than prices.
mean_price = df.groupby('time', observed=True)['total_bill'].mean().to_dict()

  mean_price = df.groupby('time')['total_bill'].mean().to_dict()


In [26]:
# Display the {time: mean total_bill} mapping
mean_price

{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}

In [29]:
# Replace each 'time' label with the mean 'total_bill' of its group.
# `.map` on a categorical column returns another categorical (with the means as
# its categories) when the mapping is one-to-one, so cast to float to obtain a
# plain numeric feature that models and arithmetic can use directly.
df['time_encoded'] = df['time'].map(mean_price).astype(float)

In [30]:
# Display the tips DataFrame again to check the new 'time_encoded' column
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,20.797159
1,10.34,1.66,Male,No,Sun,Dinner,3,20.797159
2,21.01,3.50,Male,No,Sun,Dinner,3,20.797159
3,23.68,3.31,Male,No,Sun,Dinner,2,20.797159
4,24.59,3.61,Female,No,Sun,Dinner,4,20.797159
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.797159
240,27.18,2.00,Female,Yes,Sat,Dinner,2,20.797159
241,22.67,2.00,Male,Yes,Sat,Dinner,2,20.797159
242,17.82,1.75,Male,No,Sat,Dinner,2,20.797159
