# Handling Categorical Features

### One Hot Encoding

In [2]:
import pandas as pd


df=pd.read_csv('titanic.csv',usecols=['Sex'])


In [3]:
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [4]:
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [5]:
df=pd.read_csv('titanic.csv',usecols=['Embarked'])


In [6]:
df['Embarked'].unique()


array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
df.dropna(inplace=True)


In [8]:
pd.get_dummies(df).head()


Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [9]:
pd.get_dummies(df,drop_first=True).head()


Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


#### Onehotencoding with many categories in a feature


In [10]:
df=pd.read_csv('mercedes.csv')

In [11]:
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df=pd.read_csv('mercedes.csv',usecols=["X0","X1","X2","X3","X4","X5","X6"])


In [13]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [14]:
for i in df.columns:
    print(len(df[i].unique()))

47
27
44
7
4
29
12


In [15]:
df.X1.value_counts().sort_values(ascending=False).head(10)


aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [16]:
lst_10=df.X1.value_counts().sort_values(ascending=False).head(10).index
lst_10=list(lst_10)

In [17]:
lst_10


['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [18]:
import numpy as np
for categories in lst_10:
    df[categories]=np.where(df['X1']==categories,1,0)

In [19]:
lst_10.append('X1')


In [20]:
df[lst_10]


Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r


# Ordinal Number Encoding

In [21]:
import datetime

In [22]:
today_date=datetime.datetime.today()

In [23]:
today_date

datetime.datetime(2021, 11, 25, 21, 1, 0, 598935)

In [24]:
today_date-datetime.timedelta(3)


datetime.datetime(2021, 11, 22, 21, 1, 0, 598935)

In [25]:
#### List Comprehension

days=[today_date-datetime.timedelta(x) for x in range(0,15)]

In [26]:
days

[datetime.datetime(2021, 11, 25, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 24, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 23, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 22, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 21, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 20, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 19, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 18, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 17, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 16, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 15, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 14, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 13, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 12, 21, 1, 0, 598935),
 datetime.datetime(2021, 11, 11, 21, 1, 0, 598935)]

In [27]:
import pandas as pd

data=pd.DataFrame(days)

data.columns=["Day"]


In [28]:
data.head()

Unnamed: 0,Day
0,2021-11-25 21:01:00.598935
1,2021-11-24 21:01:00.598935
2,2021-11-23 21:01:00.598935
3,2021-11-22 21:01:00.598935
4,2021-11-21 21:01:00.598935


In [30]:
data['weekday']=data['Day'].dt.weekday_name
data.head()

AttributeError: 'DatetimeProperties' object has no attribute 'weekday_name'

In [31]:
dictionary={'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7
    
    
}

In [32]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [33]:
data['weekday'].map(dictionary)


KeyError: 'weekday'

In [34]:
data

Unnamed: 0,Day
0,2021-11-25 21:01:00.598935
1,2021-11-24 21:01:00.598935
2,2021-11-23 21:01:00.598935
3,2021-11-22 21:01:00.598935
4,2021-11-21 21:01:00.598935
5,2021-11-20 21:01:00.598935
6,2021-11-19 21:01:00.598935
7,2021-11-18 21:01:00.598935
8,2021-11-17 21:01:00.598935
9,2021-11-16 21:01:00.598935


# Count Or Frequency Encoding


In [35]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [36]:
columns=[1,3,5,6,7,8,9,13]


In [37]:
train_set=train_set[columns]


In [38]:
train_set.columns=['Employment','Degree','Status','Designation','family_job','Race','Sex','Country']


In [39]:
train_set.head()


Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [40]:
for feature in train_set.columns[:]:
    print(feature,":",len(train_set[feature].unique()),'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [41]:
country_map=train_set['Country'].value_counts().to_dict()


In [42]:
train_set['Country']=train_set['Country'].map(country_map)
train_set.head(20)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


Advantages

Easy To Use
Not increasing feature space 

##### Disadvantages
It will provide same weight if the frequencies are same



# Target Guided Ordinal Encoding

Ordering the labels according to the target



Replace the labels by the joint probability of being 1 or 0

In [43]:
import pandas as pd


df=pd.read_csv('titanic.csv', usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [44]:
df['Cabin'].fillna('Missing',inplace=True)


In [46]:
df.head(20)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [47]:
df['Cabin']=df['Cabin'].astype(str).str[0]

In [48]:
df.head(20)

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
5,0,M
6,0,E
7,0,M
8,1,M
9,1,M


In [49]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [50]:
df.groupby(['Cabin'])['Survived'].mean()


Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [51]:
df.groupby(['Cabin'])['Survived'].mean().sort_values().index


Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [52]:
df.groupby(['Cabin'])['Survived'].mean().sort_values()

Cabin
T    0.000000
M    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [53]:
ordinal_labels=df.groupby(['Cabin'])['Survived'].mean().sort_values().index

ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [54]:
enumerate(ordinal_labels,0)


<enumerate at 0x2b027b25640>

In [55]:
ordinal_labels2={k:i for i,k in enumerate(ordinal_labels,0)}

ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [56]:
df['Cabin_ordinal_labels']=df['Cabin'].map(ordinal_labels2)


df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


# Mean Encoding

In [57]:
mean_ordinal=df.groupby(['Cabin'])['Survived'].mean().to_dict()


In [58]:
mean_ordinal


{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [59]:
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854
