## Handling Categorical features

### One hot encoding

In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('titanic_train.csv',usecols=['Sex'])

In [4]:
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [6]:
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [7]:
df = pd.read_csv('titanic_train.csv',usecols=['Embarked'])

In [8]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [9]:
df.dropna(inplace=True)

In [11]:
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


### Ordinal number Encoding

In [12]:
import datetime

In [13]:
today_date = datetime.datetime.today()

In [16]:
days = [today_date-datetime.timedelta(x) for x in range(0,15)]

In [18]:
import pandas as pd
df = pd.DataFrame(days)
df.columns=['Day']

In [19]:
df.head(15)

Unnamed: 0,Day
0,2021-08-20 10:37:59.788941
1,2021-08-19 10:37:59.788941
2,2021-08-18 10:37:59.788941
3,2021-08-17 10:37:59.788941
4,2021-08-16 10:37:59.788941
5,2021-08-15 10:37:59.788941
6,2021-08-14 10:37:59.788941
7,2021-08-13 10:37:59.788941
8,2021-08-12 10:37:59.788941
9,2021-08-11 10:37:59.788941


In [28]:
df['Weekday'] = df['Day'].dt.day_name()

In [29]:
df.head()

Unnamed: 0,Day,Weekday
0,2021-08-20 10:37:59.788941,Friday
1,2021-08-19 10:37:59.788941,Thursday
2,2021-08-18 10:37:59.788941,Wednesday
3,2021-08-17 10:37:59.788941,Tuesday
4,2021-08-16 10:37:59.788941,Monday


In [30]:
dictionary={
    "Monday":1,
    "Tuesday":2,
    "Wednesday":3,
    "Thursday":4,
    "Friday":5,
    "Saturday":6,
    "Sunday":7
}

In [31]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [34]:
df['Weekday_ordinal'] = df['Weekday'].map(dictionary)

In [35]:
df.head(10)

Unnamed: 0,Day,Weekday,Weekday_ordinal
0,2021-08-20 10:37:59.788941,Friday,5
1,2021-08-19 10:37:59.788941,Thursday,4
2,2021-08-18 10:37:59.788941,Wednesday,3
3,2021-08-17 10:37:59.788941,Tuesday,2
4,2021-08-16 10:37:59.788941,Monday,1
5,2021-08-15 10:37:59.788941,Sunday,7
6,2021-08-14 10:37:59.788941,Saturday,6
7,2021-08-13 10:37:59.788941,Friday,5
8,2021-08-12 10:37:59.788941,Thursday,4
9,2021-08-11 10:37:59.788941,Wednesday,3


### Count or Frequency Encoding

In [60]:
train_set = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None) 
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [61]:
columns = [1,3,5,6,7,8,9,13]

In [62]:
train_set = train_set[columns]

In [63]:
train_set.columns = ['Employment','Degree','Status','Designation','Family_pos','Race','Sex','Country']

In [64]:
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_pos,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [65]:
for feature in train_set.columns[:]:
    print(feature,":",len(train_set[feature].unique()),"labels")

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family_pos : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [66]:
train_set['Country'].value_counts()

 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [67]:
country_map = train_set['Country'].value_counts().to_dict()

In [68]:
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [69]:
train_set['Country'] = train_set['Country'].map(country_map)
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,Family_pos,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
