# Feature selection.

We use feature selection after wrangling for two reasons.

1. Obtain a set of good features that represents the current dataset.
2. Obtain the set of *not good* features that should be refined in the next wrangling step.

This happens in three steps.

1. A first preselection step removes obviously bad features.
2. A second preselection step removes features that have the same predictive capabilities, in order to prevent the final feature selection step to select.
3. A real feature selection step to make the final decision.

The following methods are implemented in this notebook.

1. A (baseline) random sampling based approach — done.
2. CHCGA — a genetic algorithm based approach — done.
3. SFFS — a forward selection based approach — done.
4. AdaBoost with decision stump approach — TODO.

Both (1) and (2) allow use to set a max running time.

In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from typing import Optional, List, Tuple, Callable
from tqdm.notebook import tqdm
from avatar.language import WranglingLanguage
from avatar.analysis import *

Load dataset.

In [2]:
titanic = pd.read_csv("../data/raw/demo/titanic.csv")
titanic.Survived = titanic.Survived.astype("category")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Find transformed columns. Don't use replacement.

In [3]:
language = WranglingLanguage()
expanded = language.expand(titanic, target="Survived")
expanded

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,OneHot()(Parch)_5,OneHot()(Parch)_6,OneHot()(Embarked)_C,OneHot()(Embarked)_Q,OneHot()(Embarked)_S,"NaN(Pernot, Mr. Rene)(Name)_Name","NaN(Somerton, Mr. Francis William)(Name)_Name",WordToNumber()(Ticket)_Ticket,ModeImputation()(Cabin)_Cabin,ModeImputation()(Embarked)_Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,0,0,0,0,1,"Braund, Mr. Owen Harris","Braund, Mr. Owen Harris",,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,1,0,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mrs. John Bradley (Florence Briggs Th...",,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,0,0,0,0,1,"Heikkinen, Miss. Laina","Heikkinen, Miss. Laina",,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,0,0,0,0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803.0,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,0,0,0,0,1,"Allen, Mr. William Henry","Allen, Mr. William Henry",373450.0,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,...,0,0,0,0,1,"Montvila, Rev. Juozas","Montvila, Rev. Juozas",211536.0,B96 B98,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,...,0,0,0,0,1,"Graham, Miss. Margaret Edith","Graham, Miss. Margaret Edith",112053.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,...,0,0,0,0,1,"Johnston, Miss. Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen ""Carrie""",,B96 B98,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,...,0,0,1,0,0,"Behr, Mr. Karl Howell","Behr, Mr. Karl Howell",111369.0,C148,C


## Pruning

Remove some features that are not appropriate and don't need more wrangling.

In [4]:
from avatar.selection import *


pruner = StackedFilter([ConstantFilter(),
                        IdenticalFilter()])
pruned = pruner.select(expanded)
pruned

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,OneHot()(Parch)_5,OneHot()(Parch)_6,OneHot()(Embarked)_C,OneHot()(Embarked)_Q,OneHot()(Embarked)_S,"NaN(Pernot, Mr. Rene)(Name)_Name","NaN(Somerton, Mr. Francis William)(Name)_Name",WordToNumber()(Ticket)_Ticket,ModeImputation()(Cabin)_Cabin,ModeImputation()(Embarked)_Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,0,0,0,0,1,"Braund, Mr. Owen Harris","Braund, Mr. Owen Harris",,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,1,0,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mrs. John Bradley (Florence Briggs Th...",,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,0,0,0,0,1,"Heikkinen, Miss. Laina","Heikkinen, Miss. Laina",,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,0,0,0,0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803.0,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,0,0,0,0,1,"Allen, Mr. William Henry","Allen, Mr. William Henry",373450.0,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,...,0,0,0,0,1,"Montvila, Rev. Juozas","Montvila, Rev. Juozas",211536.0,B96 B98,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,...,0,0,0,0,1,"Graham, Miss. Margaret Edith","Graham, Miss. Margaret Edith",112053.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,...,0,0,0,0,1,"Johnston, Miss. Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen ""Carrie""",,B96 B98,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,...,0,0,1,0,0,"Behr, Mr. Karl Howell","Behr, Mr. Karl Howell",111369.0,C148,C


## Preselection

Preselect features that will never be appropriate. These can still be wrangled.

* Remove columns with too many missing values.
* Columns consisting of unique, categorical features are removed.

In [5]:
from avatar.selection import *
    
preselector = StackedFilter([BijectiveFilter(),
                             UniqueFilter(),
                             MissingFilter()])
preselected = preselector.select(expanded)
preselected

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,...,OneHot()(Parch)_3,OneHot()(Parch)_4,OneHot()(Parch)_5,OneHot()(Parch)_6,OneHot()(Embarked)_C,OneHot()(Embarked)_Q,OneHot()(Embarked)_S,WordToNumber()(Ticket)_Ticket,ModeImputation()(Cabin)_Cabin,ModeImputation()(Embarked)_Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.2500,S,...,0,0,0,0,0,0,1,,B96 B98,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C,...,0,0,0,0,1,0,0,,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,S,...,0,0,0,0,0,0,1,,B96 B98,S
3,4,1,1,female,35.0,1,0,113803,53.1000,S,...,0,0,0,0,0,0,1,113803.0,C123,S
4,5,0,3,male,35.0,0,0,373450,8.0500,S,...,0,0,0,0,0,0,1,373450.0,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,211536,13.0000,S,...,0,0,0,0,0,0,1,211536.0,B96 B98,S
887,888,1,1,female,19.0,0,0,112053,30.0000,S,...,0,0,0,0,0,0,1,112053.0,B42,S
888,889,0,3,female,,1,2,W./C. 6607,23.4500,S,...,0,0,0,0,0,0,1,,B96 B98,S
889,890,1,1,male,26.0,0,0,111369,30.0000,C,...,0,0,0,0,1,0,0,111369.0,C148,C


We sample a subset of the data with at least one row containing no NaNs.

In [6]:
sampler = WeightedColumnSampler(preselected)
sampled = sampler.sample()
sampled

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Fare,Split(-)(Name)_0,Split(')(Name)_0,...,OneHot()(Embarked)_Q,OneHot()(Embarked)_S,ModeImputation()(Cabin)_Cabin,ModeImputation()(Embarked)_Embarked,Age,Embarked,Split( )(Name)_3,ExtractNumber()(Ticket)_0,"ExtractWord([Miss, Mr, Master, Dr, Rev, Mrs])(Name)_0",WordToNumber()(Ticket)_Ticket
0,1,0,3,male,1,0,A/5 21171,7.2500,"Braund, Mr. Owen Harris","Braund, Mr. Owen Harris",...,0,1,B96 B98,S,22.0,S,Harris,5.0,Mr,
1,2,1,1,female,1,0,PC 17599,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mrs. John Bradley (Florence Briggs Th...",...,0,0,C85,C,38.0,C,Bradley,17599.0,Mr,
2,3,1,3,female,0,0,STON/O2. 3101282,7.9250,"Heikkinen, Miss. Laina","Heikkinen, Miss. Laina",...,0,1,B96 B98,S,26.0,S,,2.0,Miss,
3,4,1,1,female,1,0,113803,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","Futrelle, Mrs. Jacques Heath (Lily May Peel)",...,0,1,C123,S,35.0,S,Heath,113803.0,Mr,113803.0
4,5,0,3,male,0,0,373450,8.0500,"Allen, Mr. William Henry","Allen, Mr. William Henry",...,0,1,B96 B98,S,35.0,S,Henry,373450.0,Mr,373450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,0,0,211536,13.0000,"Montvila, Rev. Juozas","Montvila, Rev. Juozas",...,0,1,B96 B98,S,27.0,S,,211536.0,Rev,211536.0
887,888,1,1,female,0,0,112053,30.0000,"Graham, Miss. Margaret Edith","Graham, Miss. Margaret Edith",...,0,1,B42,S,19.0,S,Edith,112053.0,Miss,112053.0
888,889,0,3,female,1,2,W./C. 6607,23.4500,"Johnston, Miss. Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen ""Carrie""",...,0,1,B96 B98,S,,S,Helen,6607.0,Miss,
889,890,1,1,male,0,0,111369,30.0000,"Behr, Mr. Karl Howell","Behr, Mr. Karl Howell",...,0,0,C148,C,26.0,C,Howell,111369.0,Mr,111369.0


Next, we look for features with the same predictive power using a wrapper approach. A decision stump is learned for each feature individually and the predictions for this stump are compared. Features that make the same predictions are pruned.

In [7]:
from avatar.selection import IterativeFilter


preselected = IterativeFilter().select(sampled, target="Survived")
preselected

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Fare,Split(-)(Name)_0,"Split(,)(Name)_1",...,OneHot()(Embarked)_Q,OneHot()(Embarked)_S,ModeImputation()(Cabin)_Cabin,ModeImputation()(Embarked)_Embarked,Age,Embarked,Split( )(Name)_3,ExtractNumber()(Ticket)_0,"ExtractWord([Miss, Mr, Master, Dr, Rev, Mrs])(Name)_0",WordToNumber()(Ticket)_Ticket
0,1,0,3,male,1,0,A/5 21171,7.2500,"Braund, Mr. Owen Harris",Mr. Owen Harris,...,0,1,B96 B98,S,22.0,S,Harris,5.0,Mr,
1,2,1,1,female,1,0,PC 17599,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs. John Bradley (Florence Briggs Thayer),...,0,0,C85,C,38.0,C,Bradley,17599.0,Mr,
2,3,1,3,female,0,0,STON/O2. 3101282,7.9250,"Heikkinen, Miss. Laina",Miss. Laina,...,0,1,B96 B98,S,26.0,S,,2.0,Miss,
3,4,1,1,female,1,0,113803,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs. Jacques Heath (Lily May Peel),...,0,1,C123,S,35.0,S,Heath,113803.0,Mr,113803.0
4,5,0,3,male,0,0,373450,8.0500,"Allen, Mr. William Henry",Mr. William Henry,...,0,1,B96 B98,S,35.0,S,Henry,373450.0,Mr,373450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,0,0,211536,13.0000,"Montvila, Rev. Juozas",Rev. Juozas,...,0,1,B96 B98,S,27.0,S,,211536.0,Rev,211536.0
887,888,1,1,female,0,0,112053,30.0000,"Graham, Miss. Margaret Edith",Miss. Margaret Edith,...,0,1,B42,S,19.0,S,Edith,112053.0,Miss,112053.0
888,889,0,3,female,1,2,W./C. 6607,23.4500,"Johnston, Miss. Catherine Helen ""Carrie""","Miss. Catherine Helen ""Carrie""",...,0,1,B96 B98,S,,S,Helen,6607.0,Miss,
889,890,1,1,male,0,0,111369,30.0000,"Behr, Mr. Karl Howell",Mr. Karl Howell,...,0,0,C148,C,26.0,C,Howell,111369.0,Mr,111369.0


### evaluation

Wrapping evaluation in a class saves the time of converting data for MERCS and allows us to reuse the same split in every iteration.

In [8]:
from avatar.analysis import FeatureEvaluator

mask = np.random.randint(2, size=len(sampled.columns))

evaluator = FeatureEvaluator(folds=1)
evaluator.fit(sampled, target="Survived")
evaluator.accuracy(mask)

0.6831460674157304

In [9]:
evaluator.importances(mask)

array([0.        , 0.        , 0.00512247, 0.29014086, 0.        ,
       0.00418681, 0.11167151, 0.08364107, 0.        , 0.0313011 ,
       0.03507748, 0.12918489, 0.04412734, 0.        , 0.02866417,
       0.        , 0.        , 0.        , 0.0337595 , 0.        ,
       0.03886733, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00421512,
       0.        , 0.        , 0.00535975, 0.        , 0.00214907,
       0.00190595, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.06259503, 0.        , 0.        , 0.08803056])

## Feature selection


Next, we can take a look at actual feature selection. Three wrapper methods are implemented.

* Randomly sampling columns, training a model and getting the feature relevances.
* A genetic approach, which is similar but should combine features slightly better. We perform a small experiment on whether to fix the genome size.
* Classic sequential and backwards sequential feature selection.

The idea is similar; the feature selection algorithms return sets of feature importances and the associated scores.

### Random

Randomly sample subsets of features, evaluate and get feature relevances.

In [15]:
from avatar.selection import SamplingSelector


ss = SamplingSelector(iterations=20)
ss.fit(sampled, target="Survived")

In [16]:
ss.scores()

array([3.44924082e-02, 0.00000000e+00, 1.33910103e-02, 3.17015378e-01,
       7.50273058e-03, 4.82208400e-03, 4.08166950e-02, 4.04877636e-02,
       1.83152679e-02, 1.04771342e-02, 3.03093306e-03, 1.45876877e-02,
       6.80776149e-02, 5.44970173e-03, 1.55130124e-02, 1.24034627e-02,
       4.98065926e-02, 1.26516102e-02, 3.14768677e-02, 2.36433776e-02,
       3.13726637e-02, 5.46410502e-03, 1.25850545e-02, 1.31918503e-03,
       4.59961665e-02, 2.06836665e-03, 1.24992409e-02, 1.57648652e-04,
       0.00000000e+00, 0.00000000e+00, 1.14636750e-03, 0.00000000e+00,
       9.15719326e-04, 1.60374740e-04, 4.66595921e-04, 0.00000000e+00,
       0.00000000e+00, 2.82485794e-04, 0.00000000e+00, 1.30965387e-04,
       2.31132768e-03, 7.05902687e-03, 7.14485801e-03, 7.89416595e-03,
       1.81211367e-02, 5.61461351e-03, 2.87865980e-02, 4.70413682e-02,
       1.61084656e-02, 2.13921675e-02])

In [17]:
ss.ordered()

Index(['Sex', 'Split( )(Name)_1', 'Split(,)(Name)_1',
       'ExtractNumber()(Ticket)_0', 'OneHot()(Pclass)_3', 'Ticket', 'Fare',
       'PassengerId', 'Split(/)(Ticket)_0', 'Split(./)(Ticket)_0',
       'Split( )(Name)_3', 'Split(.)(Ticket)_0',
       'WordToNumber()(Ticket)_Ticket', 'Split(.)(Name)_0', 'Age',
       'ExtractWord([Dr, Master, Miss, Rev, Mrs, Mr])(Name)_0',
       'Split(-)(Name)_0', 'Split( )(Name)_0', 'Pclass', 'Split( )(Ticket)_0',
       'OneHot()(Pclass)_1', 'OneHot()(SibSp)_1', 'Split(,)(Name)_0',
       'Split(.)(Name)_1', 'ModeImputation()(Embarked)_Embarked', 'SibSp',
       'ModeImputation()(Cabin)_Cabin', 'OneHot()(Embarked)_S', 'Embarked',
       'Split(. )(Ticket)_0', 'Split( )(Name)_2', 'Parch', 'Split(')(Name)_0',
       'OneHot()(Embarked)_Q', 'OneHot()(SibSp)_0', 'OneHot()(Pclass)_2',
       'OneHot()(SibSp)_5', 'OneHot()(Parch)_0', 'OneHot()(Parch)_2',
       'OneHot()(Parch)_5', 'OneHot()(Parch)_1', 'OneHot()(SibSp)_2',
       'OneHot()(Embarked)_C',

### Genetic

The CHC Genetic Algorithm for feature selection. Uses

* Cross-generational elitist selection
* Heterogeneous recombination
* and Cataclysmic mutation

for maintaining diversity and avoiding stagnation.

After the final population is obtained, combine importances from this population.

In [10]:
from avatar.selection import CHCGASelector, Population, Individual
    

gas = CHCGASelector(iterations=40)
gas.fit(preselected, target="Survived")

KeyboardInterrupt: 

In [19]:
gas.scores()

array([2.60595576e-03, 0.00000000e+00, 7.66667563e-02, 5.00365116e-01,
       6.74611830e-02, 1.88363097e-03, 1.89906182e-02, 2.88435918e-03,
       1.82362498e-02, 3.30708285e-03, 1.58330265e-02, 5.97178475e-05,
       6.61083587e-03, 1.08146542e-02, 2.03944971e-02, 8.16878293e-03,
       8.81785311e-03, 8.78311056e-03, 6.02337108e-03, 2.47739319e-02,
       0.00000000e+00, 6.67062813e-02, 2.69091666e-05, 6.61716210e-05,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.19842614e-03, 0.00000000e+00, 1.31540812e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 5.60196748e-03, 1.14148704e-02,
       1.63882755e-02, 2.72461257e-03, 3.42540000e-03, 1.99982710e-04,
       1.06825878e-02, 7.87522417e-02, 0.00000000e+00])

### SFFS

Sequential Forward Floating Selection. We don't use the adaptive version as there will often be many columns and that is too slow.

In [14]:
from avatar.selection import SFFSelector
    

sffs = SFFSelector(iterations=10)
sffs.fit(preselected, target="Survived")

[-1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1.]
Adding Split( )(Name)_1
Adding Split(.)(Ticket)_0
Adding OneHot()(Pclass)_2
Adding OneHot()(Parch)_0
Adding Sex
Adding OneHot()(Pclass)_3
Adding Split(.)(Ticket)_0
Adding PassengerId
Adding Fare
Adding PassengerId


In [None]:
sffs.scores()

### AdaBoost

In [31]:
selected.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Split(-)(Name)_0', 'Split(. )(Name)_0', 'Split(. )(Name)_1',
       'Split(')(Name)_0', 'Split(.)(Name)_1', 'Split(, )(Name)_0',
       'Split(, )(Name)_1', 'Split( )(Name)_0', 'Split( )(Name)_1',
       'Split( )(Name)_2', 'Split(,)(Name)_1', 'Split(./)(Ticket)_0',
       'Split(. )(Ticket)_0', 'Split(/)(Ticket)_0', 'Split(.)(Ticket)_0',
       'Split( )(Ticket)_0', 'Lowercase()(Ticket)_Ticket',
       'OneHot()(Pclass)_1', 'OneHot()(Pclass)_2', 'OneHot()(Pclass)_3',
       'OneHot()(Sex)_female', 'OneHot()(Sex)_male', 'OneHot()(SibSp)_0',
       'OneHot()(SibSp)_1', 'OneHot()(SibSp)_2', 'OneHot()(SibSp)_3',
       'OneHot()(SibSp)_4', 'OneHot()(SibSp)_5', 'OneHot()(SibSp)_8',
       'OneHot()(Parch)_0', 'OneHot()(Parch)_1', 'OneHot()(Parch)_2',
       'OneHot()(Parch)_3', 'OneHot()(Parch)_4', 'OneHot()(Parch)_5',
       'OneHot()(Parch)_6', 'OneHot()(Embarked)_C', 'OneHot()(Embarked)_Q',
  

In [32]:
sampled.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Split( (")(Name)_0', 'Split(" )(Name)_0',
       'Split(-)(Name)_0', 'Split( ()(Name)_0', 'Split(. )(Name)_0',
       'Split(. )(Name)_1', 'Split(/)(Name)_0', 'Split((")(Name)_0',
       'Split(")(Name)_0', 'Split(. ()(Name)_0', 'Split() )(Name)_0',
       'Split())(Name)_0', 'Split(')(Name)_0', 'Split(()(Name)_0',
       'Split() ()(Name)_0', 'Split(.)(Name)_0', 'Split(.)(Name)_1',
       'Split("))(Name)_0', 'Split() (")(Name)_0', 'Split(, )(Name)_0',
       'Split(, )(Name)_1', 'Split( ")(Name)_0', 'Split( )(Name)_0',
       'Split( )(Name)_1', 'Split( )(Name)_2', 'Split(,)(Name)_0',
       'Split(,)(Name)_1', 'Split(./)(Ticket)_0', 'Split(. )(Ticket)_0',
       'Split(/)(Ticket)_0', 'Split(.)(Ticket)_0', 'Split( )(Ticket)_0',
       'ExtractWord([male, female])(Sex)_0', 'Lowercase()(Name)_Name',
       'Lowercase()(Sex)_Sex', 'Lowercase()(Ticket)_Ticket',
       'OneHot()(Pclass)_