In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [26]:
# Load only the target (Survived) and the three categorical predictors
# used in this encoding demo.
ds = pd.read_csv(
    "train.csv",
    usecols=["Survived", "Sex", "Cabin", "Embarked"],
)

In [27]:
# Preview the first five rows to confirm the four selected columns loaded.
ds.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,,S
1,1,female,C85,C
2,1,female,,S
3,1,female,C123,S
4,0,male,,S


In [28]:
# Split into training and testing sets: 30% of the rows are held out for
# testing, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    ds[["Sex", "Embarked", "Cabin"]],  # categorical predictors
    ds["Survived"],  # target
    test_size=0.3,
    random_state=10,
)

In [8]:
# Let's check the cardinality of each variable in the training set

In [29]:
# Distinct values of Sex in the training set.
X_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [30]:
# Distinct values of Embarked in the training set; note that the result
# includes nan, i.e. this column has missing values.
X_train['Embarked'].unique()


array(['S', 'Q', 'C', nan], dtype=object)

In [31]:
# Distinct values of Cabin in the training set. This column has high
# cardinality and also contains nan and multi-cabin strings such as
# 'C23 C25 C27'.
X_train['Cabin'].unique()

array([nan, 'D11', 'T', 'F4', 'D50', 'D26', 'B35', 'E67', 'B101', 'C68',
       'C23 C25 C27', 'B57 B59 B63 B66', 'E77', 'D10 D12', 'E24', 'D48',
       'C148', 'A5', 'C118', 'C93', 'E101', 'C2', 'C110', 'D56', 'B19',
       'D35', 'B49', 'C101', 'C90', 'B41', 'B77', 'B82 B84', 'C82',
       'C124', 'D21', 'B58 B60', 'D17', 'D47', 'C46', 'C78', 'D36', 'E33',
       'E25', 'C126', 'B73', 'B28', 'E121', 'C123', 'C86', 'B96 B98',
       'B86', 'C103', 'G6', 'E63', 'B79', 'D45', 'C111', 'B5', 'D',
       'B51 B53 B55', 'D28', 'F33', 'F2', 'C91', 'A6', 'E8', 'B42', 'C70',
       'D30', 'E12', 'C125', 'C65', 'D7', 'C99', 'D19', 'F G63',
       'C22 C26', 'C54', 'A36', 'B69', 'C52', 'B102', 'E34', 'A16', 'E44',
       'E17', 'F38', 'B94', 'B22', 'D49', 'A10', 'C50', 'D9', 'B71',
       'F E69', 'A31', 'E40', 'D37', 'B20', 'D46', 'D20', 'A32', 'C83',
       'B18', 'E58', 'D33', 'F G73', 'E49', 'A24', 'A23', 'C92', 'B30',
       'C95'], dtype=object)

### One hot encoding to K dummies

In [32]:
# K dummies for Sex: one 0/1 indicator column per category (female, male).
tmp = pd.get_dummies(X_train['Sex'])
tmp.head()

Unnamed: 0,female,male
7,0,1
765,1,0
339,0,1
374,1,0
183,0,1


In [33]:
# K dummies for Embarked: one indicator column per category (C, Q, S).
# With the default arguments the missing values get no column of their own.
tmp = pd.get_dummies(X_train['Embarked'])
tmp.head()

Unnamed: 0,C,Q,S
7,0,0,1
765,0,0,1
339,0,0,1
374,0,0,1
183,0,0,1


In [34]:
# K dummies for Cabin: one indicator column per distinct cabin string,
# so this high-cardinality variable produces a very wide frame.
tmp = pd.get_dummies(X_train['Cabin'])
tmp.head()

Unnamed: 0,A10,A16,A23,A24,A31,A32,A36,A5,A6,B101,...,E8,F E69,F G63,F G73,F2,F33,F38,F4,G6,T
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
765,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
339,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
374,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
183,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [35]:
# One-hot encode every categorical column of the train set in a single call.
# Each dummy column is named "<original column>_<category>".

tmp = pd.get_dummies(X_train)
print(tmp.shape)
tmp.head()

(623, 117)


Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Cabin_A10,Cabin_A16,Cabin_A23,Cabin_A24,Cabin_A31,...,Cabin_E8,Cabin_F E69,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T
7,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
765,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
339,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
374,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
183,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [36]:
# One-hot encode every categorical column of the test set in a single call.
# The encoding is computed from X_test alone, so the resulting columns depend
# only on the categories that occur in the test set.

tmp = pd.get_dummies(X_test)
print(tmp.shape)
tmp.head()

(268, 62)


Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Cabin_A14,Cabin_A19,Cabin_A20,Cabin_A26,Cabin_A34,...,Cabin_E31,Cabin_E36,Cabin_E38,Cabin_E44,Cabin_E46,Cabin_E50,Cabin_E68,Cabin_F G73,Cabin_F2,Cabin_F33
590,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
628,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
195,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


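We can see that the number of columns in the train set and the test set is not equal. This is because `get_dummies()` creates one column per category it actually encounters, and some categories (for example, many `Cabin` values) appear in only one of the two sets.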

### One hot encoding to K-1 dummies

In [38]:
# K-1 dummies for Sex: drop_first=True drops the first category (female),
# leaving a single indicator column (male).
tmp = pd.get_dummies(X_train['Sex'], drop_first=True)
tmp.head()

Unnamed: 0,male
7,1
765,0
339,1
374,0
183,1


In [39]:
# K-1 dummies for Embarked: the first category (C) is dropped, leaving Q and S.
tmp = pd.get_dummies(X_train['Embarked'], drop_first=True)
tmp.head()

Unnamed: 0,Q,S
7,0,1
765,0,1
339,0,1
374,0,1
183,0,1


In [40]:
# K-1 encoding of all categorical columns of the train set in one call;
# drop_first=True removes the first category of each original column.
tmp = pd.get_dummies(X_train, drop_first=True)
print(tmp.shape)
tmp.head()

(623, 114)


Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Cabin_A16,Cabin_A23,Cabin_A24,Cabin_A31,Cabin_A32,Cabin_A36,Cabin_A5,...,Cabin_E8,Cabin_F E69,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T
7,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
765,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
339,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
374,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
183,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [42]:
# K-1 encoding of all categorical columns of the test set in one call;
# drop_first=True removes the first category of each original column.
# The columns are derived from X_test alone.
tmp = pd.get_dummies(X_test, drop_first=True)
print(tmp.shape)
tmp.head()

(268, 59)


Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Cabin_A19,Cabin_A20,Cabin_A26,Cabin_A34,Cabin_A7,Cabin_B18,Cabin_B20,...,Cabin_E31,Cabin_E36,Cabin_E38,Cabin_E44,Cabin_E46,Cabin_E50,Cabin_E68,Cabin_F G73,Cabin_F2,Cabin_F33
590,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
628,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Note: get_dummies() can handle missing values

In [44]:
# dummy_na=True appends an extra indicator column (labelled NaN) that flags
# rows where Embarked is missing, next to the K-1 category dummies.
pd.get_dummies(
    X_train["Embarked"],
    drop_first=True,
    dummy_na=True,
).head()

Unnamed: 0,Q,S,NaN
7,0,1,0
765,0,1,0
339,0,1,0
374,0,1,0
183,0,1,0


### Advantages
- quick
- returns pandas dataframe
- returns feature names for the dummy variables

### Limitations of pandas:
- it does not learn the categories from the train data and carry them over to the test data, so the train and test encodings can end up with different columns (as seen above)