# Encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
# Load the tips data; its categorical 'day' column is the one encoded below.
df = pd.read_csv('tips.csv')

# Preview the first five raw 'day' values before any encoding is applied.
day_preview = df['day'].head(n=5)
print("Original 'day' column:\n", day_preview)

Original 'day' column:
 0    Sun
1    Sun
2    Sun
3    Sun
4    Sun
Name: day, dtype: object


### One-Hot Encoding (OHE)

drop=None means keep all columns

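sparse_output=False means return a regular dense array that stores every value (sparse_output=True would instead return a compressed sparse matrix, i.e. store only the 1s, not the 0s)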

In [5]:
# Dense output (sparse_output=False) and no dropped category (drop=None).
ohe = OneHotEncoder(drop=None, sparse_output=False)
# Pass a one-column DataFrame (list selector) rather than a Series, so the input is 2-D.
ohe_array = ohe.fit_transform(df.loc[:, ['day']])

In [7]:
# Ask the fitted encoder for the names of the indicator columns it produced.
ohe_cols = ohe.get_feature_names_out(['day'])

# Wrap the encoded array in a frame that shares df's index, then attach it
# to the original data as extra columns.
ohe_frame = pd.DataFrame(ohe_array, index=df.index, columns=ohe_cols)
df_ohe = df.join(ohe_frame)

# Show the first and last rows of the indicator columns only.
for rows in (df_ohe[ohe_cols].head(), df_ohe[ohe_cols].tail()):
    print("\nOne-Hot Encoded 'day':\n", rows)


One-Hot Encoded 'day':
    day_Fri  day_Sat  day_Sun  day_Thur
0      0.0      0.0      1.0       0.0
1      0.0      0.0      1.0       0.0
2      0.0      0.0      1.0       0.0
3      0.0      0.0      1.0       0.0
4      0.0      0.0      1.0       0.0

One-Hot Encoded 'day':
      day_Fri  day_Sat  day_Sun  day_Thur
239      0.0      1.0      0.0       0.0
240      0.0      1.0      0.0       0.0
241      0.0      1.0      0.0       0.0
242      0.0      1.0      0.0       0.0
243      0.0      0.0      0.0       1.0


#### Drop one redundant category
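drop='first' drops the first category of the feature (here day_Fri); it is redundant because a row with 0 in every remaining column must be that category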

In [11]:
# Same encoding as before, but drop='first' omits the first category's indicator
# column, so the result has one column fewer.
ohe_drop = OneHotEncoder(drop='first', sparse_output=False)
ohe_array2 = ohe_drop.fit_transform(df.loc[:, ['day']])
ohe_cols2 = ohe_drop.get_feature_names_out(['day'])

# Attach the reduced set of indicator columns to the original data.
ohe_frame2 = pd.DataFrame(ohe_array2, index=df.index, columns=ohe_cols2)
df_ohe2 = df.join(ohe_frame2)

# Show the first and last rows of the remaining indicator columns.
for rows in (df_ohe2[ohe_cols2].head(), df_ohe2[ohe_cols2].tail()):
    print("\nOne-Hot Encoded 'day':\n", rows)


One-Hot Encoded 'day':
    day_Sat  day_Sun  day_Thur
0      0.0      1.0       0.0
1      0.0      1.0       0.0
2      0.0      1.0       0.0
3      0.0      1.0       0.0
4      0.0      1.0       0.0

One-Hot Encoded 'day':
      day_Sat  day_Sun  day_Thur
239      1.0      0.0       0.0
240      1.0      0.0       0.0
241      1.0      0.0       0.0
242      1.0      0.0       0.0
243      0.0      0.0       1.0


# Label Encoding

In [12]:
# Map each distinct 'day' value to a single integer code
# (in the output below: Sat -> 1, Sun -> 2, Thur -> 3).
label_encoder = LabelEncoder()
day_codes = label_encoder.fit_transform(df['day'])
df['day_label'] = day_codes

# Show the original value next to its code for the first and last rows.
label_view = df[['day', 'day_label']]
print("\nEncoded 'day':\n", label_view.head())
print("\nEncoded 'day':\n", label_view.tail())


Encoded 'day':
    day  day_label
0  Sun          2
1  Sun          2
2  Sun          2
3  Sun          2
4  Sun          2

Encoded 'day':
       day  day_label
239   Sat          1
240   Sat          1
241   Sat          1
242   Sat          1
243  Thur          3


# Frequency Encoding
normalize=True would use relative frequencies (proportions between 0 and 1) instead of raw counts

In [13]:
# Count how many rows each day has (normalize=False keeps raw counts),
# then replace every row's day with the count for that day.
day_freq = df['day'].value_counts(normalize=False)
df['day_freq'] = df['day'].map(day_freq)

# Show each day next to its frequency for the first and last rows.
freq_view = df[['day', 'day_freq']]
for rows in (freq_view.head(), freq_view.tail()):
    print("\nFrequency encoded 'day':\n", rows)


Frequency encoded 'day':
    day  day_freq
0  Sun        76
1  Sun        76
2  Sun        76
3  Sun        76
4  Sun        76

Frequency encoded 'day':
       day  day_freq
239   Sat        87
240   Sat        87
241   Sat        87
242   Sat        87
243  Thur        62
