In [33]:
!pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
# for one hot encoding with feature-engine
from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder

In [35]:
# Read only the three columns this notebook works with.
ds = pd.read_csv(
    "train.csv",
    usecols=["Sex", "Embarked", "Survived"],
)

In [36]:
# Preview the first rows of the loaded frame.
ds.head()

Unnamed: 0,Survived,Sex,Embarked
0,0,male,S
1,1,female,C
2,1,female,S
3,1,female,S
4,0,male,S


In [37]:
# Split predictors and target into train and test partitions:
# 30% of the rows are held out, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    ds[["Sex", "Embarked"]],  # predictors
    ds["Survived"],  # target
    test_size=0.3,
    random_state=10,
)

In [38]:
# Let's check the cardinality of the categorical variables in the training set

In [39]:
# Distinct values of Sex in the training split.
X_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [40]:
# Distinct values of Embarked in the training split; missing values show up as nan.
X_train['Embarked'].unique()


array(['S', 'Q', 'C', nan], dtype=object)

In [42]:
# Build the one-hot encoder for an explicit list of columns, then learn the
# categories from the training split (NaN is replaced by the label 'Missing' first).
ohe_enc = fe_OneHotEncoder(
    variables=['Sex', 'Embarked'],  # columns to encode
    drop_last=True,  # k-1 dummies per variable; set False to keep all k
    top_categories=None,
)

ohe_enc.fit(X_train.fillna('Missing'))

OneHotEncoder(drop_last=True, variables=['Sex', 'Embarked'])

In [43]:
# Encode the training split, filling NaN with the same 'Missing' label
# that was used when the encoder was fitted, then preview the result.
tmp = X_train.fillna('Missing').pipe(ohe_enc.transform)
tmp.head()

Unnamed: 0,Sex_male,Embarked_S,Embarked_Q,Embarked_C
7,1,1,0,0
765,0,1,0,0
339,1,1,0,0
374,0,1,0,0
183,1,1,0,0


Notice that Feature-engine returns the dummy variables with descriptive names and drops the original variables, leaving the dataset ready for further exploration or for building machine learning models.

In [44]:
# Encode the test split with the encoder fitted on the training data
# (NaN is filled with 'Missing' first), then preview the result.
tmp = X_test.fillna('Missing').pipe(ohe_enc.transform)
tmp.head()

Unnamed: 0,Sex_male,Embarked_S,Embarked_Q,Embarked_C
590,1,1,0,0
131,1,1,0,0
628,1,1,0,0
195,0,0,0,1
230,0,1,0,0


In [45]:
# With no `variables` argument, Feature-engine's one-hot encoder selects
# all categorical variables automatically when it is fitted.
ohe_enc = fe_OneHotEncoder(
    drop_last=True,  # k-1 dummies per variable; set False to keep all k
    top_categories=None,
)

ohe_enc.fit(X_train.fillna('Missing'))

OneHotEncoder(drop_last=True)

In [46]:
# Variables the encoder selected during fit (no explicit list was passed above).
ohe_enc.variables_

['Sex', 'Embarked']

In [47]:
# Encode the training split with the automatically configured encoder
# (NaN is filled with 'Missing' first), then preview the result.
tmp = X_train.fillna('Missing').pipe(ohe_enc.transform)
tmp.head()

Unnamed: 0,Sex_male,Embarked_S,Embarked_Q,Embarked_C
7,1,1,0,0
765,0,1,0,0
339,1,1,0,0
374,0,1,0,0
183,1,1,0,0


In [48]:
# Encode the test split with the automatically configured encoder
# (NaN is filled with 'Missing' first), then preview the result.
tmp = X_test.fillna('Missing').pipe(ohe_enc.transform)
tmp.head()

Unnamed: 0,Sex_male,Embarked_S,Embarked_Q,Embarked_C
590,1,1,0,0
131,1,1,0,0
628,1,1,0,0
195,0,0,0,1
230,0,1,0,0


Note how the encoder also returns dummy variables for categories that are not present in the test set. Hence, the number of columns is the same for the train and test sets. This allows integration with a Scikit-learn pipeline and scoring of the test set by the fitted model.

### Advantages
- quick
- returns dataframe
- returns feature names
- allows to select features to encode