In [1]:
import pandas as pd
import numpy as np

In [2]:
# Generate a random array of 55 elements with values 'm' and 'f'
gender_data = np.random.choice(['m', 'f'], size=55)

print(gender_data)

['m' 'm' 'm' 'm' 'm' 'm' 'm' 'm' 'f' 'f' 'f' 'f' 'f' 'f' 'm' 'f' 'm' 'm'
 'f' 'f' 'm' 'm' 'm' 'f' 'f' 'f' 'm' 'm' 'm' 'f' 'm' 'm' 'm' 'm' 'f' 'f'
 'f' 'f' 'f' 'm' 'm' 'f' 'm' 'f' 'f' 'm' 'm' 'f' 'm' 'f' 'f' 'f' 'm' 'm'
 'm']


In [3]:
df = pd.DataFrame({'gender': gender_data})
df.head()

Unnamed: 0,gender
0,m
1,m
2,m
3,m
4,m


# Label Encoding

Description: Label encoding is the process of converting each category into a unique integer label. This is suitable for ordinal data (data with a clear order).

## Manual Way

In [4]:
label_val = {
    'm': 1,
    'f': 2
}

In [5]:
label_val['m']

1

In [6]:
df['gender'].map(label_val).head()

0    1
1    1
2    1
3    1
4    1
Name: gender, dtype: int64

## Using Sklearn

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
encoder = LabelEncoder()

In [9]:
encoded_data = encoder.fit_transform(df['gender'])
encoded_data

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1])

- Pros: Simple and efficient, works well with ordinal data.
- Cons: For nominal data (no inherent order), this approach can introduce a misleading ordinal relationship (e.g., "1" is less than "2"), which might negatively affect models that assume a numerical order.

# Onehot Encoding

One-hot encoding creates binary columns for each category. Each column represents a unique category and has a 1 in the corresponding column for the observation that belongs to that category, and 0 elsewhere.

## Using Pandas

In [13]:
a = pd.get_dummies(df['gender'], dtype=np.uint8)
a

Unnamed: 0,f,m
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
5,0,1
6,0,1
7,0,1
8,1,0
9,1,0


## Using Sklearn

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
enc = OneHotEncoder(sparse_output=False)
enc = enc.fit(df[['gender']])

In [16]:
enc.categories_

[array(['f', 'm'], dtype=object)]

In [17]:
encoded_arr = enc.transform(df[['gender']])
encoded_df = pd.DataFrame(encoded_arr, columns=enc.categories_, dtype=np.uint8)

In [19]:
encoded_df.head()

Unnamed: 0,f,m
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [20]:
arr = np.array([
    [0, 1],
    [0, 1],
    [1, 0]
])
enc.inverse_transform(arr.reshape(-1, 2))

array([['m'],
       ['m'],
       ['f']], dtype=object)

- Pros: Suitable for nominal data, avoids introducing ordinality.
- Cons: Increases the dimensionality, especially for high-cardinality features (features with many unique categories). It may lead to sparse matrices and result in high memory usage.

# Ordinal Encoding

In [21]:
from sklearn.preprocessing import OrdinalEncoder

In [22]:
encoder = OrdinalEncoder()

In [23]:
data = [['low', 0], ['medium', 1], ['high', 2]]

In [24]:
encoded_data = encoder.fit_transform(data)
encoded_data

array([[1., 0.],
       [2., 1.],
       [0., 2.]])

- Pros: Retains the ordering information, appropriate for ordinal data.
- Cons: Like label encoding, it is inappropriate for nominal data, as it imposes an artificial order.

# Hash Encoding

In [25]:
import hashlib
def hash_category(x, num_buckets=10):
    return int(hashlib.md5(x.encode('utf-8')).hexdigest(), 16) % num_buckets

In [33]:
int(hashlib.md5("A".encode('utf-8')).hexdigest(), 16) 

169836834567204038179966570894283554345

In [34]:
data = pd.DataFrame({'category': ['A', 'B', 'A', 'B', 'A']})
data['encoded'] = data['category'].apply(hash_category)
print(data)

  category  encoded
0        A        5
1        B        3
2        A        5
3        B        3
4        A        5
