In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# pandas get_dummies: one 0/1 indicator column per distinct education level.
education_levels = ['High School', 'Bachelors', 'Masters', 'PhD']
df = pd.DataFrame({'education': education_levels})
dummies = pd.get_dummies(df.loc[:, 'education'], dtype=int)
print(dummies)

   Bachelors  High School  Masters  PhD
0          0            1        0    0
1          1            0        0    0
2          0            0        1    0
3          0            0        0    1


In [3]:
# Custom (ordinal) mapping: hand-picked integers keep the natural order of
# the education levels, which automatic encoders would not know about.
mapping = {
    'High School': 1,
    'Bachelors': 2,
    'Masters': 3,
    'PhD': 4,
}
df = pd.DataFrame({'education': ['High School', 'Bachelors', 'Masters', 'PhD']})
df = df.assign(new_col=df['education'].map(mapping))
print(df)

     education  new_col
0  High School        1
1    Bachelors        2
2      Masters        3
3          PhD        4


In [4]:
# Label encoder demo: a small column of height categories.
# NOTE(review): 'mediam' looks like a typo for 'medium'; it is kept as-is
# because the recorded outputs of the following cells show this spelling.
heights = ['long', 'mediam', 'short', 'long', 'long', 'short', 'short', 'short']
data = {'Height': heights}

df = pd.DataFrame(data)
df

Unnamed: 0,Height
0,long
1,mediam
2,short
3,long
4,long
5,short
6,short
7,short


In [5]:
# Encoder that assigns an integer code to each distinct label it is fitted on.
# NOTE(review): scikit-learn documents LabelEncoder for target labels (y); for an
# input feature such as 'Height', OrdinalEncoder is the documented choice — confirm.
label_encoder = LabelEncoder()

In [6]:
# Fit on the Height column and store the integer code of every row.
# classes_ lists the learned labels; a label's position is its code.
height_codes = label_encoder.fit_transform(df['Height'])
df['new_height'] = height_codes
print(label_encoder.classes_)

['long' 'mediam' 'short']


In [7]:
# Integer codes follow the order of label_encoder.classes_:
# 0 = long, 1 = mediam, 2 = short.
df

Unnamed: 0,Height,new_height
0,long,0
1,mediam,1
2,short,2
3,long,0
4,long,0
5,short,2
6,short,2
7,short,2


In [8]:
# One-hot encoding demo: a single categorical column with three distinct fruits.
fruits = ['apple', 'mango', 'apple', 'orange', 'apple', 'mango']
df = pd.DataFrame({'Fruits': fruits})
df

Unnamed: 0,Fruits
0,apple
1,mango
2,apple
3,orange
4,apple
5,mango


In [9]:
# sparse_output=False returns a dense array, so the encoded rows print directly.
fruit_column = df[['Fruits']]  # double brackets select a 2-D frame, not a Series
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(fruit_column)
print(one_hot_encoded)

[[1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]


In [10]:
# Wrap the encoded array in a labelled frame. The result gets its own name
# instead of overwriting `df`, so the earlier cell that reads df[['Fruits']]
# can still be re-run after this one.
fruit_columns = encoder.get_feature_names_out(['Fruits'])
fruits_encoded = pd.DataFrame(one_hot_encoded, columns=fruit_columns)
print(encoder.categories_)
print(fruits_encoded)

[array(['apple', 'mango', 'orange'], dtype=object)]
   Fruits_apple  Fruits_mango  Fruits_orange
0           1.0           0.0            0.0
1           0.0           1.0            0.0
2           1.0           0.0            0.0
3           0.0           0.0            1.0
4           1.0           0.0            0.0
5           0.0           1.0            0.0


In [11]:
# With handle_unknown='ignore', a category absent at fit time encodes as all zeros.
seen_colors = [['red'], ['blue']]
unseen_color = [['green']]  # 'green' wasn't seen in fit()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(seen_colors)
print(encoder.transform(unseen_color))

[[0. 0.]]


In [12]:
# Frequency encoding, step 1: count how many times each city occurs.
cities = ['Delhi', 'Mumbai', 'Delhi', 'Chennai', 'Mumbai', 'Delhi', 'Mumbai']
df = pd.DataFrame({'city': cities})
city_counts = df['city'].value_counts()
freq_map = city_counts.to_dict()
print(freq_map)

{'Delhi': 3, 'Mumbai': 3, 'Chennai': 1}


In [13]:
# Frequency encoding, step 2: replace each city by its count. This yields a
# single numeric column (unlike one-hot encoding), but cities with the same
# count become indistinguishable: Delhi and Mumbai both encode to 3 here.
city_frequency = df['city'].map(freq_map)
df['Encoded_city'] = city_frequency
print(df)

      city  Encoded_city
0    Delhi             3
1   Mumbai             3
2    Delhi             3
3  Chennai             1
4   Mumbai             3
5    Delhi             3
6   Mumbai             3


In [14]:
import category_encoders as ce

In [15]:
# Sample cities for the hashing-encoder demo.
hash_demo_cities = ['Delhi', 'Mumbai', 'Delhi', 'Chennai', 'Bangalore']
df = pd.DataFrame({'city': hash_demo_cities})

In [16]:
# Hashing encoder for the 'city' column; n_components=4 fixes the output at
# four columns (col_0 .. col_3) regardless of how many distinct cities exist.
encoder = ce.HashingEncoder(cols=['city'], n_components=4)

In [17]:
# Fit the hashing encoder and encode the frame in one step.
# In the recorded output, Delhi (rows 0 and 2) and Chennai (row 3) share col_2:
# distinct categories can collide when hashed into only four columns.
df_encoded = encoder.fit_transform(df)
print(df_encoded)

   col_0  col_1  col_2  col_3
0      0      0      1      0
1      0      1      0      0
2      0      0      1      0
3      0      0      1      0
4      1      0      0      0


In [18]:
# Values that were absent at fit time ('green', 'black') are still assigned
# to one of the four hashed columns when transformed.
fit_rows = [['red'], ['blue']]
new_rows = [['green'], ['black']]
encoder = ce.HashingEncoder(n_components=4)
encoder.fit(fit_rows)
print(encoder.transform(new_rows))

   col_0  col_1  col_2  col_3
0      1      0      0      0
1      0      1      0      0
