In [57]:
# Data encoding >> Converting categorical columns into numerical ones
# Type - 1. Nominal ohe, 2. Label and ordinal encoding, 3. Target guided ordinal encoding

## [1]. Nominal ohe (one hot encoder)

In [58]:
# Single, Married, Relation
# Single = [1,0,0]
# Married = [0,1,0]
# Relation = [0,0,1]

In [59]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [60]:

# Toy frame with a single nominal column: three marital-status labels, each appearing twice.
status_labels = ['single', 'married', 'relation']
df = pd.DataFrame({'status': status_labels * 2})

In [61]:
# Display the toy status frame (6 rows, one 'status' column).
df

Unnamed: 0,status
0,single
1,married
2,relation
3,single
4,married
5,relation


In [62]:
# Create a one-hot encoder with default settings; it has not seen any data yet.
encoder = OneHotEncoder()

In [63]:
# Display the encoder object created in the previous cell.
encoder

In [64]:
# Fit on the 'status' column. The double brackets select a one-column DataFrame
# rather than a Series. The following cell calls fit_transform, which fits again.
encoder.fit(df[['status']])

In [65]:
# Learn the categories and encode the column in one call, then convert the
# result to a dense array (via .toarray()) and keep it in `encoded`.
status_onehot = encoder.fit_transform(df[['status']])
encoded = status_onehot.toarray()

In [66]:
# Display the dense one-hot array: one row per input row, one column per category.
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [67]:
Column = encoder.get_feature_names_out() # Names of the generated one-hot columns, stored in Column

In [68]:
# Display the generated column names; they are used as headers for the encoded frame.
Column

array(['status_married', 'status_relation', 'status_single'], dtype=object)

#### Making dataframe

In [69]:
# Wrap the dense one-hot array in a DataFrame, labelling each column with its feature name.
encoder_df = pd.DataFrame(data=encoded, columns=Column)

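#### Fit >> learns parameters from the data (e.g. mu and sigma for a scaler; the list of categories for an encoder)
#### Transform >> applies the learned parameters to compute the output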

In [70]:
# New data >> single
# The encoder was fitted on a DataFrame, so it remembers the column name 'status'.
# Passing a bare nested list (no feature names) makes scikit-learn emit an
# "X does not have valid feature names" warning; wrapping the new value in a
# one-row DataFrame with the same column name avoids it.
encoder.transform(pd.DataFrame({'status': ['single']})).toarray()



array([[0., 0., 1.]])

In [71]:
# New data >> married
# Use a one-row DataFrame with the fitted column name 'status' instead of a bare
# nested list, so scikit-learn does not warn about missing feature names.
encoder.transform(pd.DataFrame({'status': ['married']})).toarray()



array([[1., 0., 0.]])

In [72]:
# Display the original frame again before merging it with the encoded columns.
df

Unnamed: 0,status
0,single
1,married
2,relation
3,single
4,married
5,relation


### Merging data

In [73]:
# Preview: place the original column and its one-hot columns side by side
# (column-wise concatenation, rows aligned on the index).
pd.concat([df, encoder_df], axis='columns')

Unnamed: 0,status,status_married,status_relation,status_single
0,single,0.0,0.0,1.0
1,married,1.0,0.0,0.0
2,relation,0.0,1.0,0.0
3,single,0.0,0.0,1.0
4,married,1.0,0.0,0.0
5,relation,0.0,1.0,0.0


### Making final_df

In [74]:
# Keep the side-by-side frame (original 'status' column plus its one-hot columns) as finaldf.
finaldf = pd.concat([df,encoder_df], axis = 1)

In [75]:
# Drop the raw 'status' column and one dummy column ('status_married'): with k
# categories, k-1 indicator columns are enough, because a row of all zeros
# identifies the dropped category.
# The frame is rebuilt from df and encoder_df and drop() returns a new frame,
# so re-running this cell gives the same result instead of raising a KeyError
# the way a repeated in-place drop would.
finaldf = pd.concat([df, encoder_df], axis=1).drop(columns=['status', 'status_married'])

In [76]:
# Display the final frame after dropping 'status' and 'status_married'.
finaldf

Unnamed: 0,status_relation,status_single
0,0.0,1.0
1,0.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,0.0
5,1.0,0.0


### Making dummydata using pandas

In [77]:
# One-hot encode directly with pandas: one indicator column per distinct value
# of 'status', with each column named after its category.
pd.get_dummies(df['status'])

Unnamed: 0,married,relation,single
0,False,False,True
1,True,False,False
2,False,True,False
3,False,False,True
4,True,False,False
5,False,True,False



### 2. Making encoder using tips dataset

In [78]:
import seaborn as sns

In [79]:
# Load seaborn's example 'tips' dataset into df2.
df2 = sns.load_dataset('tips')

In [80]:
# Display the tips frame (pandas shows a truncated view of the rows).
df2

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### Selecting 'Day' column in tips Dataset

In [81]:
# One-hot encode the 'day' column and convert the result to a dense array.
# NOTE(review): this re-fits the shared `encoder` object, replacing the categories
# it learned from 'status' earlier; the earlier status transform cells will not
# behave as before if re-run after this cell.
encoded2 = encoder.fit_transform(df2[['day']]).toarray()

In [82]:
# Names of the one-hot columns produced for 'day' by the encoder fitted in the previous cell.
column1 = encoder.get_feature_names_out()

In [83]:
# Display the generated 'day_*' column names.
column1

array(['day_Fri', 'day_Sat', 'day_Sun', 'day_Thur'], dtype=object)

In [84]:
# Build a DataFrame from the encoded 'day' array, using the feature names as column headers.
encoder_df2 = pd.DataFrame(data=encoded2, columns=column1)

In [85]:
# Display the one-hot encoded 'day' frame.
encoder_df2

Unnamed: 0,day_Fri,day_Sat,day_Sun,day_Thur
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
239,0.0,1.0,0.0,0.0
240,0.0,1.0,0.0,0.0
241,0.0,1.0,0.0,0.0
242,0.0,1.0,0.0,0.0


## [2]. Ordinal Encoding

In ordinal encoding we assign a numerical label to each category, following the categories' natural order.

In [86]:
from sklearn.preprocessing import OrdinalEncoder

In [87]:
# ordinal encoding
# high school>> 1
# graduation >> 2
# undergraduate >> 3
# PostGraduate >> 4

In [88]:
from sklearn.preprocessing import OrdinalEncoder

In [89]:
# Small frame of education levels used to demonstrate ordinal encoding.
# Note: this rebinds df2, which previously held the tips dataset.
qualification_levels = ['HS', 'GR', 'PG', 'HS', 'PHD', 'PG', 'HS']
df2 = pd.DataFrame({'Qualification': qualification_levels})

In [90]:
# Build an ordinal encoder with an explicit category order: HS < GR < PG < PHD.
# NOTE(review): the instance is only displayed here; it is not bound to a name.
OrdinalEncoder(categories=[["HS","GR","PG","PHD"]])

In [91]:
# Create and fit an actual OrdinalEncoder with the rank order HS < GR < PG < PHD.
# The name `encoder` still refers to the OneHotEncoder from the one-hot section,
# so fitting it here would one-hot encode the column and never use the ordinal
# encoder. fit_transform returns the integer-coded column (HS -> 0, GR -> 1,
# PG -> 2, PHD -> 3) as a float array.
ordinal_encoder = OrdinalEncoder(categories=[["HS", "GR", "PG", "PHD"]])
ordinal_encoder.fit_transform(df2[['Qualification']])

In [92]:
# Count how many rows fall into each qualification level.
df2['Qualification'].value_counts()

Qualification
HS     3
PG     2
GR     1
PHD    1
Name: count, dtype: int64

## [3]. Target Guided ordinal Encoding

In [93]:
# Relation with target Variable
# A lot of unique Categories
# Replace with mean and median

In [94]:
# Toy data for target-guided encoding: three meal times, each repeated three
# times, paired row by row with a bill amount.
meal_times = ['lunch', 'breakfast', 'dinner'] * 3
bill_amounts = [120, 120, 40, 150, 160, 130, 40, 150, 160]
df_target = pd.DataFrame({'time': meal_times, 'total_bill': bill_amounts})

In [95]:
# Average bill for each meal time; these means become the encoded values.
bills_by_time = df_target.groupby('time')['total_bill']
bills_by_time.mean()

time
breakfast    143.333333
dinner       110.000000
lunch        103.333333
Name: total_bill, dtype: float64

In [96]:
# Turn the per-category mean bill into a plain {category: mean} lookup table.
time_means = df_target.groupby('time')['total_bill'].mean()
mean_price = time_means.to_dict()


In [97]:
# Display the category -> mean total_bill mapping.
mean_price

{'breakfast': 143.33333333333334, 'dinner': 110.0, 'lunch': 103.33333333333333}

#### Making new column and merging in dataframe

In [98]:
# Replace each 'time' label with the mean total_bill of its category and store
# the result in a new column on df_target.
df_target['time_target_encoded'] = df_target['time'].map(mean_price)

In [99]:
# Display the frame with the new target-encoded column alongside the originals.
df_target

Unnamed: 0,time,total_bill,time_target_encoded
0,lunch,120,103.333333
1,breakfast,120,143.333333
2,dinner,40,110.0
3,lunch,150,103.333333
4,breakfast,160,143.333333
5,dinner,130,110.0
6,lunch,40,103.333333
7,breakfast,150,143.333333
8,dinner,160,110.0
