# Imports

In [16]:
import pandas as pd
import numpy as np

# Topics 

## prepare the DataFrame 

In [17]:
# Column names of the toy dataset, in display order.
column_names = ['Date', 'Weekday', 'Month', 'Temperature', 'Humidity', 'Wind', 'Outlook', 'Crowdedness']

# One record per day (12 consecutive days), fields ordered as in column_names.
daily_records = [
    ('03-25', 'Mon', 'Mar', 'High',    'Dry',   'No',  'sunny',    85),
    ('03-26', 'Tue', 'Mar', 'Low',     'Humid', 'Yes', 'rainy',    30),
    ('03-27', 'Wed', 'Mar', 'High',    'Dry',   'Yes', 'overcast', 65),
    ('03-28', 'Thu', 'Mar', 'Extreme', 'Dry',   'Yes', 'sunny',    45),
    ('03-29', 'Fri', 'Mar', 'Low',     'Humid', 'No',  'rainy',    25),
    ('03-30', 'Sat', 'Mar', 'High',    'Humid', 'No',  'overcast', 90),
    ('03-31', 'Sun', 'Mar', 'High',    'Dry',   'Yes', 'sunny',    95),
    ('04-01', 'Mon', 'Apr', 'Low',     'Humid', 'No',  'rainy',    35),
    ('04-02', 'Tue', 'Apr', 'High',    'Dry',   'Yes', 'sunny',    70),
    ('04-03', 'Wed', 'Apr', 'Extreme', 'Dry',   'Yes', 'overcast', 50),
    ('04-04', 'Thu', 'Apr', 'High',    'Humid', 'No',  'sunny',    80),
    ('04-05', 'Fri', 'Apr', 'Low',     'Dry',   'Yes', 'rainy',    45),
]

# Transpose the records into a column-oriented dict: {column name: list of values}.
data = {name: list(values) for name, values in zip(column_names, zip(*daily_records))}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)

## 1. Label Encoding
It's often used for **ordinal variables** where there's a clear order to the categories, such as education levels (e.g., primary, secondary, tertiary) or product ratings (e.g., 1 star, 2 stars, 3 stars).

Here we apply it to the weekdays.
Keep in mind that the integer codes imply an order, and it doesn't help a model much to think that Sunday is "greater than" Monday.


Why not One-Hot Encoding?
It would need 7 columns, one per day of the week (extra overhead).

In [18]:
# Label-encode the weekday with a fixed integer code per day (Mon=0 ... Sun=6).
# pd.factorize(df['Weekday'])[0] would number the days in order of first
# appearance, so its codes depend on which day the data happens to start with.
# An explicit mapping yields the same codes for this data and keeps them stable
# if the rows are reordered or the sample starts on a different day.
weekday_codes = {'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6}
df['Weekday_Label'] = df['Weekday'].map(weekday_codes)
df.head()

Unnamed: 0,Date,Weekday,Month,Temperature,Humidity,Wind,Outlook,Crowdedness,Weekday_Label
0,03-25,Mon,Mar,High,Dry,No,sunny,85,0
1,03-26,Tue,Mar,Low,Humid,Yes,rainy,30,1
2,03-27,Wed,Mar,High,Dry,Yes,overcast,65,2
3,03-28,Thu,Mar,Extreme,Dry,Yes,sunny,45,3
4,03-29,Fri,Mar,Low,Humid,No,rainy,25,4


## 2. One Hot Encoding
One-Hot Encoding creates a new binary column for each category in a categorical variable.

It's typically used for **nominal variables** where there's no inherent order to the categories. It's particularly useful when dealing with variables that have a **relatively small number of categories** (up to 4)

In [19]:
# One-hot encode 'Outlook': one 0/1 indicator column per category,
# named 'Outlook_<category>'.
outlook_dummies = pd.get_dummies(df['Outlook'], prefix='Outlook', dtype=int)

# Replace the original column with its indicator columns (appended at the end).
df = pd.concat([df.drop(columns=['Outlook']), outlook_dummies], axis=1)

# usually we drop one of the columns to avoid multicollinearity
df.head()

Unnamed: 0,Date,Weekday,Month,Temperature,Humidity,Wind,Crowdedness,Weekday_Label,Outlook_overcast,Outlook_rainy,Outlook_sunny
0,03-25,Mon,Mar,High,Dry,No,85,0,0,0,1
1,03-26,Tue,Mar,Low,Humid,Yes,30,1,0,1,0
2,03-27,Wed,Mar,High,Dry,Yes,65,2,1,0,0
3,03-28,Thu,Mar,Extreme,Dry,Yes,45,3,0,0,1
4,03-29,Fri,Mar,Low,Humid,No,25,4,0,1,0


## 3. Binary Encoding

It's often used when there are only two categories, mostly in a yes-no situation.

In [20]:
# Binary encoding of 'Wind': 1 where the value is 'Yes', 0 otherwise.
# Alternatives (both overwrite 'Wind' instead of adding a new column):
#   option 1: df['Wind'] = df['Wind'].map({'No': 0, 'Yes': 1})
#   option 2: df['Wind'] = pd.factorize(df['Wind'])[0]
# option 3 (used here): compare against 'Yes' and cast the boolean mask to int
is_windy = df['Wind'].eq('Yes')
df['Wind_binary'] = is_windy.astype(int)
df.head()

Unnamed: 0,Date,Weekday,Month,Temperature,Humidity,Wind,Crowdedness,Weekday_Label,Outlook_overcast,Outlook_rainy,Outlook_sunny,Wind_binary
0,03-25,Mon,Mar,High,Dry,No,85,0,0,0,1,0
1,03-26,Tue,Mar,Low,Humid,Yes,30,1,0,1,0,1
2,03-27,Wed,Mar,High,Dry,Yes,65,2,1,0,0,1
3,03-28,Thu,Mar,Extreme,Dry,Yes,45,3,0,0,1,1
4,03-29,Fri,Mar,Low,Humid,No,25,4,0,1,0,0


## 4. Target Encoding

Target Encoding replaces each category with the mean of the target variable for that category.

It's used when there's likely a relationship between the categorical variable and the target variable. It's particularly useful for high-cardinality features in datasets with a reasonable number of rows.

Here we apply it to the Humidity column, using Crowdedness as the target.

In [21]:
# Mean of the target ('Crowdedness') for each 'Humidity' category.
df['Crowdedness'].groupby(df['Humidity']).mean()

Humidity
Dry      65.0
Humid    52.0
Name: Crowdedness, dtype: float64

In [22]:
# Broadcast each category's mean back onto the rows that belong to it:
# the result has one value per original row, aligned with df's index.
crowdedness_by_humidity = df.groupby('Humidity')['Crowdedness']
crowdedness_by_humidity.transform('mean')

0     65.0
1     52.0
2     65.0
3     65.0
4     52.0
5     52.0
6     65.0
7     52.0
8     65.0
9     65.0
10    52.0
11    65.0
Name: Crowdedness, dtype: float64

In [23]:
# Target encoding: replace each 'Humidity' category by the mean 'Crowdedness'
# of the rows in that category. The mean is taken over all rows of df,
# including the row being encoded.
df['Humidity_Target'] = df['Crowdedness'].groupby(df['Humidity']).transform('mean')
df.head()

Unnamed: 0,Date,Weekday,Month,Temperature,Humidity,Wind,Crowdedness,Weekday_Label,Outlook_overcast,Outlook_rainy,Outlook_sunny,Wind_binary,Humidity_Target
0,03-25,Mon,Mar,High,Dry,No,85,0,0,0,1,0,65.0
1,03-26,Tue,Mar,Low,Humid,Yes,30,1,0,1,0,1,52.0
2,03-27,Wed,Mar,High,Dry,Yes,65,2,1,0,0,1,65.0
3,03-28,Thu,Mar,Extreme,Dry,Yes,45,3,0,0,1,1,65.0
4,03-29,Fri,Mar,Low,Humid,No,25,4,0,1,0,0,52.0


## 5. Ordinal Encoding

Ordinal Encoding assigns ordered integers to ordinal categories based on their inherent order.

It's used for ordinal variables where the order of categories is meaningful and you want to preserve this order information.

Example: the Temperature variable (Low < High < Extreme)

In [24]:
# Rank of each temperature level, from mildest (1) to most severe (3).
temp_order = dict(Low=1, High=2, Extreme=3)

# Look up every row's level in the ranking.
temperature_rank = df['Temperature'].map(temp_order)
df['Temperature_ordinal'] = temperature_rank
df.head()

Unnamed: 0,Date,Weekday,Month,Temperature,Humidity,Wind,Crowdedness,Weekday_Label,Outlook_overcast,Outlook_rainy,Outlook_sunny,Wind_binary,Humidity_Target,Temperature_ordinal
0,03-25,Mon,Mar,High,Dry,No,85,0,0,0,1,0,65.0,2
1,03-26,Tue,Mar,Low,Humid,Yes,30,1,0,1,0,1,52.0,1
2,03-27,Wed,Mar,High,Dry,Yes,65,2,1,0,0,1,65.0,2
3,03-28,Thu,Mar,Extreme,Dry,Yes,45,3,0,0,1,1,65.0,3
4,03-29,Fri,Mar,Low,Humid,No,25,4,0,1,0,0,52.0,1


## 6. Cyclic Encoding 
Cyclic Encoding transforms a cyclical categorical variable into two numerical features that preserve the variable's cyclical nature

It's used for categorical variables that have a natural cyclical order, such as days of the week, months of the year, or hours of the day. Cyclic encoding is particularly useful when the "distance" between categories matters and wraps around (e.g., the distance between December and January should be small, just like the distance between any other consecutive months)

In [25]:
# 1. Convert the month abbreviation to its calendar number (Jan=1 ... Dec=12).
month_abbreviations = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_order = {abbr: number for number, abbr in enumerate(month_abbreviations, start=1)}
df['Month_num'] = df['Month'].map(month_order)

# 2. Place the months on a circle with 12 positions (Jan at angle 0) and
#    describe each position by its sine and cosine.
month_angle = 2 * np.pi * (df['Month_num'] - 1) / 12
df['Month_sin'] = np.sin(month_angle)
df['Month_cos'] = np.cos(month_angle)

In [26]:
# Weekday number within the week (Mon=1 ... Sun=7).
days_order = dict(zip(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], range(1, 8)))
df['Cyclic_days'] = df['Weekday'].map(days_order)

# Place the days on a circle with 7 positions (Mon at angle 0) and describe
# each position by its sine and cosine.
day_angle = 2 * np.pi * (df['Cyclic_days'] - 1) / 7
df['Days_sin'] = np.sin(day_angle)
df['Days_cos'] = np.cos(day_angle)
df.head()

Unnamed: 0,Date,Weekday,Month,Temperature,Humidity,Wind,Crowdedness,Weekday_Label,Outlook_overcast,Outlook_rainy,Outlook_sunny,Wind_binary,Humidity_Target,Temperature_ordinal,Month_num,Month_sin,Month_cos,Cyclic_days,Days_sin,Days_cos
0,03-25,Mon,Mar,High,Dry,No,85,0,0,0,1,0,65.0,2,3,0.866025,0.5,1,0.0,1.0
1,03-26,Tue,Mar,Low,Humid,Yes,30,1,0,1,0,1,52.0,1,3,0.866025,0.5,2,0.781831,0.62349
2,03-27,Wed,Mar,High,Dry,Yes,65,2,1,0,0,1,65.0,2,3,0.866025,0.5,3,0.974928,-0.222521
3,03-28,Thu,Mar,Extreme,Dry,Yes,45,3,0,0,1,1,65.0,3,3,0.866025,0.5,4,0.433884,-0.900969
4,03-29,Fri,Mar,Low,Humid,No,25,4,0,1,0,0,52.0,1,3,0.866025,0.5,5,-0.433884,-0.900969


In [27]:
# Keep the target and the encoded features: every column from 'Crowdedness'
# onwards (the raw categorical columns sit to its left), then drop
# 'Outlook_sunny' as the reference level of the one-hot encoding, since the two
# remaining Outlook indicators already determine it.
# Written as one chained, non-inplace expression so a new frame is built
# instead of mutating a column slice in place.
df = df.loc[:, "Crowdedness":].drop(columns=['Outlook_sunny'])
df.head(10)

Unnamed: 0,Crowdedness,Weekday_Label,Outlook_overcast,Outlook_rainy,Wind_binary,Humidity_Target,Temperature_ordinal,Month_num,Month_sin,Month_cos,Cyclic_days,Days_sin,Days_cos
0,85,0,0,0,0,65.0,2,3,0.866025,0.5,1,0.0,1.0
1,30,1,0,1,1,52.0,1,3,0.866025,0.5,2,0.781831,0.62349
2,65,2,1,0,1,65.0,2,3,0.866025,0.5,3,0.974928,-0.222521
3,45,3,0,0,1,65.0,3,3,0.866025,0.5,4,0.433884,-0.900969
4,25,4,0,1,0,52.0,1,3,0.866025,0.5,5,-0.433884,-0.900969
5,90,5,1,0,0,52.0,2,3,0.866025,0.5,6,-0.974928,-0.222521
6,95,6,0,0,1,65.0,2,3,0.866025,0.5,7,-0.781831,0.62349
7,35,0,0,1,0,52.0,1,4,1.0,6.123234000000001e-17,1,0.0,1.0
8,70,1,0,0,1,65.0,2,4,1.0,6.123234000000001e-17,2,0.781831,0.62349
9,50,2,1,0,1,65.0,3,4,1.0,6.123234000000001e-17,3,0.974928,-0.222521


As we wrap up our encoding discussion, let's highlight some critical points to keep in mind:

- Information Loss: Some encoding methods can lead to loss of information. For example, label encoding might impose an unintended ordinal relationship.

<br>

- The New Category Issue: Most encoding techniques stumble when faced with categories in your test data that weren't present during training. Always have a strategy for handling these unexpected guests.

<br> 

- Curse of Dimensionality: Techniques like one-hot encoding can dramatically increase the number of features (imagine if you have hundreds of different categories like countries or cities!). You might want to encode only the categories that actually matter (for example, by grouping the rare ones as "Others").

<br>

- Document, Document, Document: Your future self (and your colleagues) will thank you for clearly recording your encoding decisions. This transparency is essential for reproducibility and for understanding any potential biases in your results.