In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Load only the target (Survived) and the two categorical predictors used below.
ds = pd.read_csv(
    'train.csv',
    usecols=['Sex', 'Embarked', 'Survived'],
)

In [22]:
# preview the first rows to confirm that only the requested columns were loaded
ds.head()

Unnamed: 0,Survived,Sex,Embarked
0,0,male,S
1,1,female,C
2,1,female,S
3,1,female,S
4,0,male,S


In [23]:
# Split the data into a training set and a testing set.
# 30% of the rows are held out for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    ds[['Sex', 'Embarked']],  # predictors
    ds['Survived'],  # target
    test_size=0.3,
    random_state=10,
)

In [24]:
# Let's check the cardinality of the categorical variables in the training set

In [25]:
# distinct labels of Sex present in the training split
X_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [26]:
# distinct labels of Embarked present in the training split
# (missing values show up as nan in the result)
X_train['Embarked'].unique()


array(['S', 'Q', 'C', nan], dtype=object)

In [27]:
# Create the encoder and fit it on the training set only.
#
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so we try the new name first and fall back to the old one on
# older releases. Either way the encoder returns a dense numpy array.
encoder_kwargs = dict(
    categories='auto',
    drop='first',  # return k-1 dummies per variable; use drop=None to return all k
    handle_unknown='error',  # raise if transform meets a category not seen during fit
)
try:
    encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    encoder = OneHotEncoder(sparse=False, **encoder_kwargs)

# Missing values are replaced by an explicit 'Missing' label so that they are
# encoded as a category of their own.
encoder.fit(X_train.fillna('Missing'))

OneHotEncoder(drop='first', sparse=False)

In [28]:
# let's see which categories the encoder learned for each variable
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Missing', 'Q', 'S'], dtype=object)]

In [29]:
# Encode the training set (missing values labelled 'Missing', exactly as when
# the encoder was fitted) and preview the resulting array as a DataFrame.
tmp = encoder.transform(
    X_train.fillna('Missing')
)
pd.DataFrame(data=tmp).head()

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0


In [30]:
# retrieve the names of the encoded features
# (`get_feature_names` was removed in scikit-learn 1.2; newer releases only
# provide `get_feature_names_out`, so fall back to it when the old method is gone)
(
    encoder.get_feature_names()
    if hasattr(encoder, 'get_feature_names')
    else encoder.get_feature_names_out()
)



array(['x0_male', 'x1_Missing', 'x1_Q', 'x1_S'], dtype=object)

In [31]:
# Now we transform the test set with the encoder fitted on the training set,
# then rebuild a pandas DataFrame from the resulting array and label its
# columns with the feature names derived by the encoder.
#
# `get_feature_names` was removed in scikit-learn 1.2; on releases where it no
# longer exists we fall back to its replacement, `get_feature_names_out`.
feature_names = (
    encoder.get_feature_names()
    if hasattr(encoder, 'get_feature_names')
    else encoder.get_feature_names_out()
)

tmp = pd.DataFrame(
    encoder.transform(X_test.fillna('Missing')),
    columns=feature_names,
)
tmp.head()



Unnamed: 0,x0_male,x1_Missing,x1_Q,x1_S
0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0


### Advantages
- quick
- Creates the same number of features in train and test set
### Limitations
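- `transform` returns a numpy array instead of a pandas dataframe
- the returned array does not carry the variable names (they have to be retrieved separately with `get_feature_names()` and re-attached), therefore it is inconvenient for variable exploration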