# Feature selection.

We use feature selection after wrangling for two reasons.

1. Obtain a set of good features that represents the current dataset.
2. Obtain the set of *not good* features that should be refined in the next wrangling step.

This happens in three steps.

1. A first preselection step removes obviously bad features.
2. A second preselection step removes features that have the same predictive capabilities, in order to prevent the final feature selection step to select.
3. A real feature selection step to make the final decision.

The following methods are implemented in this notebook.

1. A (baseline) random sampling based approach — done.
2. CHCGA — a genetic algorithm based approach — done.
3. SFFS — a forward selection based approach — done.
4. AdaBoost with decision stump approach — TODO.

Both (1) and (2) allow use to set a max running time.

In [40]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from typing import Optional, List, Tuple, Callable
from tqdm.notebook import tqdm
from avatar.language import WranglingLanguage
from avatar.analysis import *

Load dataset.

In [41]:
titanic = pd.read_csv("../data/raw/demo/titanic.csv")
titanic.Survived = titanic.Survived.astype("category")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Find transformed columns. Don't use replacement.

In [42]:
language = WranglingLanguage()
expanded = language.expand(titanic, target="Survived")
expanded

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,"Split( ("")(Name)_0","Split( ("")(Name)_1","Split( "")(Name)_0","Split( "")(Name)_1",Split())(Name)_0,Split())(Name)_1,Split())(Name)_2,Split(. ()(Name)_0,Split(. ()(Name)_1,"Split() ("")(Name)_0","Split() ("")(Name)_1",Split(')(Name)_0,Split(')(Name)_1,"Split(("")(Name)_0","Split(("")(Name)_1","Split("" )(Name)_0","Split("" )(Name)_1",Split() ()(Name)_0,Split() ()(Name)_1,Split(()(Name)_0,Split(()(Name)_1,Split(()(Name)_2,Split(-)(Name)_0,Split(-)(Name)_1,Split() )(Name)_0,Split() )(Name)_1,"Split(, )(Name)_0","Split(, )(Name)_1","Split("")(Name)_0","Split("")(Name)_1","Split("")(Name)_2",Split( ()(Name)_0,...,Split(. )(Ticket)_1,Split(./)(Ticket)_0,Split(./)(Ticket)_1,Split( )(Cabin)_0,Split( )(Cabin)_1,Split( )(Cabin)_2,Split( )(Cabin)_3,"ExtractWord([Mrs, Master, Dr, Rev, Mr, Miss])(Name)_0","ExtractWord([female, male])(Sex)_0","ExtractWord([LINE, SC, OQ, PC, C, CA, PP, P, A, O])(Ticket)_0","ExtractWord([C, G, E, B, A, F, D])(Cabin)_0","ExtractWord([Q, S, C])(Embarked)_0",Lowercase()(Name)_Name,Lowercase()(Sex)_Sex,Lowercase()(Ticket)_Ticket,Lowercase()(Cabin)_Cabin,Lowercase()(Embarked)_Embarked,OneHot()(Pclass)_1,OneHot()(Pclass)_2,OneHot()(Pclass)_3,OneHot()(Sex)_female,OneHot()(Sex)_male,OneHot()(SibSp)_0,OneHot()(SibSp)_1,OneHot()(SibSp)_2,OneHot()(SibSp)_3,OneHot()(SibSp)_4,OneHot()(SibSp)_5,OneHot()(SibSp)_8,OneHot()(Parch)_0,OneHot()(Parch)_1,OneHot()(Parch)_2,OneHot()(Parch)_3,OneHot()(Parch)_4,OneHot()(Parch)_5,OneHot()(Parch)_6,OneHot()(Embarked)_C,OneHot()(Embarked)_Q,OneHot()(Embarked)_S,"NaN(Pernot, Mr. Rene)(Name)_Name","NaN(Somerton, Mr. Francis William)(Name)_Name",WordToNumber()(Ticket)_Ticket,ModeImputation()(Cabin)_Cabin,ModeImputation()(Embarked)_Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,,"Braund, Mr. Owen Harris",,"Braund, Mr. Owen Harris",,Braund,Mr. Owen Harris,"Braund, Mr. Owen Harris",,,"Braund, Mr. Owen Harris",...,,A/5 21171,,,,,,Mr,male,A,,S,"braund, mr. owen harris",male,a/5 21171,,s,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,"Braund, Mr. Owen Harris","Braund, Mr. Owen Harris",,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley",Florence Briggs Thayer),,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,Cumings,Mrs. John Bradley (Florence Briggs Thayer),"Cumings, Mrs. John Bradley (Florence Briggs Th...",,,"Cumings, Mrs. John Bradley",...,,PC 17599,,C85,,,,Mrs,female,PC,C,C,"cumings, mrs. john bradley (florence briggs th...",female,pc 17599,c85,c,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mrs. John Bradley (Florence Briggs Th...",,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,,"Heikkinen, Miss. Laina",,"Heikkinen, Miss. Laina",,Heikkinen,Miss. Laina,"Heikkinen, Miss. Laina",,,"Heikkinen, Miss. Laina",...,3101282,STON/O2. 3101282,,,,,,Miss,female,O,,S,"heikkinen, miss. laina",female,ston/o2. 3101282,,s,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,"Heikkinen, Miss. Laina","Heikkinen, Miss. Laina",,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath (Lily May Peel",,,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath",Lily May Peel),,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,Futrelle,Mrs. Jacques Heath (Lily May Peel),"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,,"Futrelle, Mrs. Jacques Heath",...,,113803,,C123,,,,Mrs,female,,C,S,"futrelle, mrs. jacques heath (lily may peel)",female,113803,c123,s,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803.0,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,,"Allen, Mr. William Henry",,"Allen, Mr. William Henry",,Allen,Mr. William Henry,"Allen, Mr. William Henry",,,"Allen, Mr. William Henry",...,,373450,,,,,,Mr,male,,,S,"allen, mr. william henry",male,373450,,s,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,"Allen, Mr. William Henry","Allen, Mr. William Henry",373450.0,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,,"Montvila, Rev. Juozas",,"Montvila, Rev. Juozas",,Montvila,Rev. Juozas,"Montvila, Rev. Juozas",,,"Montvila, Rev. Juozas",...,,211536,,,,,,Rev,male,,,S,"montvila, rev. juozas",male,211536,,s,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,"Montvila, Rev. Juozas","Montvila, Rev. Juozas",211536.0,B96 B98,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,,"Graham, Miss. Margaret Edith",,"Graham, Miss. Margaret Edith",,Graham,Miss. Margaret Edith,"Graham, Miss. Margaret Edith",,,"Graham, Miss. Margaret Edith",...,,112053,,B42,,,,Miss,female,,B,S,"graham, miss. margaret edith",female,112053,b42,s,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,"Graham, Miss. Margaret Edith","Graham, Miss. Margaret Edith",112053.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen","Carrie""","Johnston, Miss. Catherine Helen ""Carrie""",,,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen ""Carrie""",,,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen ""Carrie""",,Johnston,"Miss. Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen",Carrie,,"Johnston, Miss. Catherine Helen ""Carrie""",...,6607,W,C. 6607,,,,,Miss,female,C,,S,"johnston, miss. catherine helen ""carrie""",female,w./c. 6607,,s,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,"Johnston, Miss. Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen ""Carrie""",,B96 B98,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,,"Behr, Mr. Karl Howell",,"Behr, Mr. Karl Howell",,Behr,Mr. Karl Howell,"Behr, Mr. Karl Howell",,,"Behr, Mr. Karl Howell",...,,111369,,C148,,,,Mr,male,,C,C,"behr, mr. karl howell",male,111369,c148,c,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,"Behr, Mr. Karl Howell","Behr, Mr. Karl Howell",111369.0,C148,C


## Preselection

Preselect features that will never be appropriate.

* Remove columns with too many missing values.
* Constant columns are removed.
* Columns consisting of unique, categorical features are removed.
* Identical columns are collapsed into one.


In [45]:
from avatar.selection import *
    
preselector = StackedPreselector([ConstantPreselector(),
                                  IdenticalPreselector(),
                                  UniquePreselector(),
                                  MissingPreselector()])
preselected = preselector.select(expanded)
preselected

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Split(')(Name)_0,Split(-)(Name)_0,"Split(, )(Name)_0","Split(, )(Name)_1",Split(.)(Name)_0,Split(.)(Name)_1,Split( )(Name)_0,Split( )(Name)_1,Split( )(Name)_2,Split( )(Name)_3,Split(. )(Name)_1,"Split(,)(Name)_1",Split(.)(Ticket)_0,Split( )(Ticket)_0,Split(/)(Ticket)_0,Split(. )(Ticket)_0,Split(./)(Ticket)_0,"ExtractWord([Mrs, Master, Dr, Rev, Mr, Miss])(Name)_0",Lowercase()(Ticket)_Ticket,Lowercase()(Embarked)_Embarked,OneHot()(Pclass)_1,OneHot()(Pclass)_2,OneHot()(Pclass)_3,OneHot()(Sex)_female,OneHot()(Sex)_male,OneHot()(SibSp)_0,OneHot()(SibSp)_1,OneHot()(SibSp)_2,OneHot()(SibSp)_3,OneHot()(SibSp)_4,OneHot()(SibSp)_5,OneHot()(SibSp)_8,OneHot()(Parch)_0,OneHot()(Parch)_1,OneHot()(Parch)_2,OneHot()(Parch)_3,OneHot()(Parch)_4,OneHot()(Parch)_5,OneHot()(Parch)_6,OneHot()(Embarked)_C,OneHot()(Embarked)_Q,OneHot()(Embarked)_S,WordToNumber()(Ticket)_Ticket,ModeImputation()(Cabin)_Cabin,ModeImputation()(Embarked)_Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.2500,S,"Braund, Mr. Owen Harris","Braund, Mr. Owen Harris",Braund,Mr. Owen Harris,"Braund, Mr",Owen Harris,"Braund,",Mr.,Owen,Harris,Owen Harris,Mr. Owen Harris,A/5 21171,A/5,A,A/5 21171,A/5 21171,Mr,a/5 21171,s,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,,B96 B98,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mrs. John Bradley (Florence Briggs Th...",Cumings,Mrs. John Bradley (Florence Briggs Thayer),"Cumings, Mrs",John Bradley (Florence Briggs Thayer),"Cumings,",Mrs.,John,Bradley,John Bradley (Florence Briggs Thayer),Mrs. John Bradley (Florence Briggs Thayer),PC 17599,PC,PC 17599,PC 17599,PC 17599,Mrs,pc 17599,c,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,S,"Heikkinen, Miss. Laina","Heikkinen, Miss. Laina",Heikkinen,Miss. Laina,"Heikkinen, Miss",Laina,"Heikkinen,",Miss.,Laina,,Laina,Miss. Laina,STON/O2,STON/O2.,STON,STON/O2,STON/O2. 3101282,Miss,ston/o2. 3101282,s,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,,B96 B98,S
3,4,1,1,female,35.0,1,0,113803,53.1000,S,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","Futrelle, Mrs. Jacques Heath (Lily May Peel)",Futrelle,Mrs. Jacques Heath (Lily May Peel),"Futrelle, Mrs",Jacques Heath (Lily May Peel),"Futrelle,",Mrs.,Jacques,Heath,Jacques Heath (Lily May Peel),Mrs. Jacques Heath (Lily May Peel),113803,113803,113803,113803,113803,Mrs,113803,s,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,113803.0,C123,S
4,5,0,3,male,35.0,0,0,373450,8.0500,S,"Allen, Mr. William Henry","Allen, Mr. William Henry",Allen,Mr. William Henry,"Allen, Mr",William Henry,"Allen,",Mr.,William,Henry,William Henry,Mr. William Henry,373450,373450,373450,373450,373450,Mr,373450,s,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,373450.0,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,211536,13.0000,S,"Montvila, Rev. Juozas","Montvila, Rev. Juozas",Montvila,Rev. Juozas,"Montvila, Rev",Juozas,"Montvila,",Rev.,Juozas,,Juozas,Rev. Juozas,211536,211536,211536,211536,211536,Rev,211536,s,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,211536.0,B96 B98,S
887,888,1,1,female,19.0,0,0,112053,30.0000,S,"Graham, Miss. Margaret Edith","Graham, Miss. Margaret Edith",Graham,Miss. Margaret Edith,"Graham, Miss",Margaret Edith,"Graham,",Miss.,Margaret,Edith,Margaret Edith,Miss. Margaret Edith,112053,112053,112053,112053,112053,Miss,112053,s,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,112053.0,B42,S
888,889,0,3,female,,1,2,W./C. 6607,23.4500,S,"Johnston, Miss. Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen ""Carrie""",Johnston,"Miss. Catherine Helen ""Carrie""","Johnston, Miss","Catherine Helen ""Carrie""","Johnston,",Miss.,Catherine,Helen,"Catherine Helen ""Carrie""","Miss. Catherine Helen ""Carrie""",W,W./C.,W.,W./C,W,Miss,w./c. 6607,s,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,,B96 B98,S
889,890,1,1,male,26.0,0,0,111369,30.0000,C,"Behr, Mr. Karl Howell","Behr, Mr. Karl Howell",Behr,Mr. Karl Howell,"Behr, Mr",Karl Howell,"Behr,",Mr.,Karl,Howell,Karl Howell,Mr. Karl Howell,111369,111369,111369,111369,111369,Mr,111369,c,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,111369.0,C148,C


We sample a subset of the data with at least one row containing no NaNs.

In [46]:
sampler = WeightedColumnSampler(preselected)
sampled = sampler.sample()
sampled

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Fare,Split(')(Name)_0,Split(-)(Name)_0,"Split(, )(Name)_0","Split(, )(Name)_1",Split(.)(Name)_0,Split(.)(Name)_1,Split( )(Name)_0,Split( )(Name)_1,Split( )(Name)_2,Split(. )(Name)_1,"Split(,)(Name)_1",Split(.)(Ticket)_0,Split( )(Ticket)_0,Split(/)(Ticket)_0,Split(. )(Ticket)_0,Split(./)(Ticket)_0,Lowercase()(Ticket)_Ticket,OneHot()(Pclass)_1,OneHot()(Pclass)_2,OneHot()(Pclass)_3,OneHot()(Sex)_female,OneHot()(Sex)_male,OneHot()(SibSp)_0,OneHot()(SibSp)_1,OneHot()(SibSp)_2,OneHot()(SibSp)_3,OneHot()(SibSp)_4,OneHot()(SibSp)_5,OneHot()(SibSp)_8,OneHot()(Parch)_0,OneHot()(Parch)_1,OneHot()(Parch)_2,OneHot()(Parch)_3,OneHot()(Parch)_4,OneHot()(Parch)_5,OneHot()(Parch)_6,OneHot()(Embarked)_C,OneHot()(Embarked)_Q,OneHot()(Embarked)_S,ModeImputation()(Cabin)_Cabin,ModeImputation()(Embarked)_Embarked,Age,Embarked,Split( )(Name)_3,"ExtractWord([Mrs, Master, Dr, Rev, Mr, Miss])(Name)_0",Lowercase()(Embarked)_Embarked,WordToNumber()(Ticket)_Ticket
0,1,0,3,male,1,0,A/5 21171,7.2500,"Braund, Mr. Owen Harris","Braund, Mr. Owen Harris",Braund,Mr. Owen Harris,"Braund, Mr",Owen Harris,"Braund,",Mr.,Owen,Owen Harris,Mr. Owen Harris,A/5 21171,A/5,A,A/5 21171,A/5 21171,a/5 21171,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,B96 B98,S,22.0,S,Harris,Mr,s,
1,2,1,1,female,1,0,PC 17599,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mrs. John Bradley (Florence Briggs Th...",Cumings,Mrs. John Bradley (Florence Briggs Thayer),"Cumings, Mrs",John Bradley (Florence Briggs Thayer),"Cumings,",Mrs.,John,John Bradley (Florence Briggs Thayer),Mrs. John Bradley (Florence Briggs Thayer),PC 17599,PC,PC 17599,PC 17599,PC 17599,pc 17599,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,C85,C,38.0,C,Bradley,Mrs,c,
2,3,1,3,female,0,0,STON/O2. 3101282,7.9250,"Heikkinen, Miss. Laina","Heikkinen, Miss. Laina",Heikkinen,Miss. Laina,"Heikkinen, Miss",Laina,"Heikkinen,",Miss.,Laina,Laina,Miss. Laina,STON/O2,STON/O2.,STON,STON/O2,STON/O2. 3101282,ston/o2. 3101282,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,B96 B98,S,26.0,S,,Miss,s,
3,4,1,1,female,1,0,113803,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","Futrelle, Mrs. Jacques Heath (Lily May Peel)",Futrelle,Mrs. Jacques Heath (Lily May Peel),"Futrelle, Mrs",Jacques Heath (Lily May Peel),"Futrelle,",Mrs.,Jacques,Jacques Heath (Lily May Peel),Mrs. Jacques Heath (Lily May Peel),113803,113803,113803,113803,113803,113803,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,C123,S,35.0,S,Heath,Mrs,s,113803.0
4,5,0,3,male,0,0,373450,8.0500,"Allen, Mr. William Henry","Allen, Mr. William Henry",Allen,Mr. William Henry,"Allen, Mr",William Henry,"Allen,",Mr.,William,William Henry,Mr. William Henry,373450,373450,373450,373450,373450,373450,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,B96 B98,S,35.0,S,Henry,Mr,s,373450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,0,0,211536,13.0000,"Montvila, Rev. Juozas","Montvila, Rev. Juozas",Montvila,Rev. Juozas,"Montvila, Rev",Juozas,"Montvila,",Rev.,Juozas,Juozas,Rev. Juozas,211536,211536,211536,211536,211536,211536,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,B96 B98,S,27.0,S,,Rev,s,211536.0
887,888,1,1,female,0,0,112053,30.0000,"Graham, Miss. Margaret Edith","Graham, Miss. Margaret Edith",Graham,Miss. Margaret Edith,"Graham, Miss",Margaret Edith,"Graham,",Miss.,Margaret,Margaret Edith,Miss. Margaret Edith,112053,112053,112053,112053,112053,112053,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,B42,S,19.0,S,Edith,Miss,s,112053.0
888,889,0,3,female,1,2,W./C. 6607,23.4500,"Johnston, Miss. Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen ""Carrie""",Johnston,"Miss. Catherine Helen ""Carrie""","Johnston, Miss","Catherine Helen ""Carrie""","Johnston,",Miss.,Catherine,"Catherine Helen ""Carrie""","Miss. Catherine Helen ""Carrie""",W,W./C.,W.,W./C,W,w./c. 6607,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,B96 B98,S,,S,Helen,Miss,s,
889,890,1,1,male,0,0,111369,30.0000,"Behr, Mr. Karl Howell","Behr, Mr. Karl Howell",Behr,Mr. Karl Howell,"Behr, Mr",Karl Howell,"Behr,",Mr.,Karl,Karl Howell,Mr. Karl Howell,111369,111369,111369,111369,111369,111369,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,C148,C,26.0,C,Howell,Mr,c,111369.0


## Evaluation

How do we evaluate a set of features?

* Which **model** do we use?
* what **max depth** do we use?

Wrapping evaluation in a class saves the time of converting data for MERCS and allows us to reuse the same split in every iteration.

### Timing

Small experiment on runtime for decision trees of different depths.

In [81]:
from avatar.analysis import FeatureEvaluator


evaluator = FeatureEvaluator(sampled, method=None)
evaluator.importances(np.random.randint(2, size=len(sampled.columns)))
# evaluator.accuracy(np.random.randint(2, size=len(sampled.columns)))

invalid value encountered in double_scalars
invalid value encountered in double_scalars
invalid value encountered in double_scalars
invalid value encountered in double_scalars


AssertionError: 

In [38]:
(_ == 1).sum()

55

In [10]:
# import time
# import itertools

# rng = np.random.RandomState(1337)

# classifiers = ["DT"]
# depths = [1, 4, 8, 16, 32]
# iterations = 50

# mask = rng.randint(2, size=len(sampled.columns)).astype(bool)

# results = list()
# for classifier, depth in tqdm(list(itertools.product(classifiers, depths))):
#     start = time.time()

#     evaluator = FeatureEvaluator(sampled,
#                                  target="Survived",
#                                  classifier_algorithm=classifier,
#                                  max_depth=depth,
#                                  random_state=depth)

#     accuracies = list()
#     for i in range(iterations):
#         acc, fis = evaluator.evaluate(mask)
#         accuracies.append(acc)
#     results.append((classifier, depth, time.time() - start, np.mean(accuracies)))

# for classifier, depth, t, score in results: 
#     print("{}\t{}\t{}\t{}".format(classifier, depth, t, score))

Next, we look for features with the same predictive power using a wrapper approach. A decision stump is learned for each feature individually and the predictions for this stump are compared. Features that make the same predictions are pruned.

In [12]:
from pandas.api.types import is_numeric_dtype


class IterativePreselector:
    """Train decision stump for every feature individually."""
    
    def __init__(self, threshold: float = 0.95):
        """
        
        Args:
            threshold: Features that make the same prediction for
                `threshold` percentage of rows are discarded.

        """
        self.threshold = 1 - threshold

    def predictions(self, df: pd.DataFrame, target) -> pd.DataFrame:
        """Make predictions.
        
        Returns:
            Dataframe with same shape as `df` containing predictions for
            every feature.
            
        """
        
        # prepare data for mercs
        data, nominal = to_mercs(df)
        data = data.values
        data_test = np.nan_to_num(data)

        # initialise mask and quer
        base_m_code = to_m_codes(df, target)
        base_q_code = np.copy(base_m_code[0])
        base_m_code[base_m_code == 0] = -1 

        # perform predictions
        predictions = pd.DataFrame(0, index=df.index, columns=df.columns)
        for i, column in enumerate(df.columns):
            if column == target:
                continue
            m_code = np.copy(base_m_code)
            m_code[:, i] = 0
            model = Mercs(classifier_algorithm="DT", max_depth=1)
            model.fit(data, nominal_attributes=nominal, m_codes=m_code)
            predictions[column] = model.predict(data_test, q_code=base_q_code)

        return predictions
    
    def select(self, df: pd.DataFrame, target) -> pd.DataFrame:
        """Perform selection.
        
        Returns:
            A dataframe containing only selected features.
    
        """
        
        # get predictions
        predictions = self.predictions(df, target)
        
        # get columns similar to another columns
        similar = set()
        for i, ci in enumerate(predictions.columns):
            if ci in similar:
                break
            column = predictions[ci]
            column_type = is_numeric_dtype(selected[ci])
            for cj in predictions.columns[i + 1:]:
                if cj in similar:
                    break
                other = predictions[cj]
                other_type = is_numeric_dtype(selected[cj])
                if ((    column_type and     other_type) or
                    (not column_type and not other_type)):
                    d = np.abs(column - other).sum() / len(column)
                    if d < self.threshold:
                        similar.add(cj)
        
        # similar ones and return
        return df.drop(similar, axis=1)


preselected = IterativePreselector().select(selected, "Survived")
preselected

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Parch,Ticket,Split(-)(Name)_0,Split(. )(Name)_0,Split(. )(Name)_1,Split(')(Name)_0,...,OneHot()(Pclass)_3,OneHot()(Sex)_female,OneHot()(Sex)_male,OneHot()(SibSp)_1,OneHot()(Parch)_0,OneHot()(Parch)_1,OneHot()(Embarked)_C,OneHot()(Embarked)_S,Embarked,Lowercase()(Embarked)_Embarked
0,1,0,3,male,0,A/5 21171,"Braund, Mr. Owen Harris","Braund, Mr",Owen Harris,"Braund, Mr. Owen Harris",...,1,0,1,1,1,0,0,1,S,s
1,2,1,1,female,0,PC 17599,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mrs",John Bradley (Florence Briggs Thayer),"Cumings, Mrs. John Bradley (Florence Briggs Th...",...,0,1,0,1,1,0,1,0,C,c
2,3,1,3,female,0,STON/O2. 3101282,"Heikkinen, Miss. Laina","Heikkinen, Miss",Laina,"Heikkinen, Miss. Laina",...,1,1,0,0,1,0,0,1,S,s
3,4,1,1,female,0,113803,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","Futrelle, Mrs",Jacques Heath (Lily May Peel),"Futrelle, Mrs. Jacques Heath (Lily May Peel)",...,0,1,0,1,1,0,0,1,S,s
4,5,0,3,male,0,373450,"Allen, Mr. William Henry","Allen, Mr",William Henry,"Allen, Mr. William Henry",...,1,0,1,0,1,0,0,1,S,s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,0,211536,"Montvila, Rev. Juozas","Montvila, Rev",Juozas,"Montvila, Rev. Juozas",...,0,0,1,0,1,0,0,1,S,s
887,888,1,1,female,0,112053,"Graham, Miss. Margaret Edith","Graham, Miss",Margaret Edith,"Graham, Miss. Margaret Edith",...,0,1,0,0,1,0,0,1,S,s
888,889,0,3,female,2,W./C. 6607,"Johnston, Miss. Catherine Helen ""Carrie""","Johnston, Miss","Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen ""Carrie""",...,1,1,0,1,0,0,0,1,S,s
889,890,1,1,male,0,111369,"Behr, Mr. Karl Howell","Behr, Mr",Karl Howell,"Behr, Mr. Karl Howell",...,0,0,1,0,1,0,1,0,C,c


## Feature selection


Next, we can take a look at actual feature selection. Three methods are implemented.

* Randomly sampling columns, training a model and getting the feature relevances.
* A genetic approach, which is similar but should combine features slightly better. We perform a small experiment on whether to fix the genome size.
* Classic sequential and backwards sequential feature selection.
* AdaBoost with decision stumps.

In [13]:
class Selector:
    
    def __init__(self, df: pd.DataFrame, target):
        self._df = df
        self._target = target
        self._importances = np.zeros_like(df.columns, dtype=float)
        self._evaluator = FeatureEvaluator(df, target)
    
    @abstractmethod
    def rank(self):
        """Rank features."""
        pass
    
    def ordered(self):
        return np.argsort(self._importances)
    
    def select(self, n: int):
        return self._df.columns[self.ordered()[:n]]

### Random

Randomly sample subsets of features, evaluate and get feature relevance. Weight relevances over different iterations.

In [14]:
def sample_features(df: pd.DataFrame, max_features: int):
    """Sample some features."""
    pass


class SamplingSelector(Selector):
    """Randomly sample sets of features and obtain feature relevances."""
    
    def __init__(self, df: pd.DataFrame, target, iterations: int = 20):
        super().__init__(df, target)
        self._iterations = iterations
        self._rng = np.random.RandomState(1336)
    
    def rank(self):
                        
        importances = np.zeros((self._iterations, len(self._df.columns)))
        counts = np.zeros_like(self._importances)
        
        for i in range(self._iterations):
            mask = self.generate_mask()
            importances[i] = self._evaluator.importances(mask)
            counts = counts + mask
    
        self._importances = np.divide(importances.sum(0), counts)
    
    def generate_mask(self):
        return self._rng.randint(2, size=len(self._df.columns))
            

ss = SamplingSelector(preselected, target="Survived", iterations=50)
ss.rank()
ss.ordered()

array([ 1, 30, 31, 28, 32, 29,  4, 23, 27, 33, 14, 18, 15, 24, 17,  5,  7,
        8, 11, 22,  6, 10, 21,  2, 20, 13,  9, 19, 12,  0, 16, 25, 26,  3])

### Genetic

The CHC Genetic Algorithm for feature selection. Uses

* Cross-generational elitist selection
* Heterogeneous recombination
* and Cataclysmic mutation

for maintaining diversity and avoiding stagnation.

In [15]:
import random


class Individual: 
    
    def __init__(self, genome: np.ndarray):
        self._genome = genome
    
    def cross(self, other: "Individual", threshold: int) -> Optional[Tuple["Individual"]]:
        """Perform Half-Uniform Crossover (HUX).
    
        Crosses half of the non-matching alleles.

        Args:
            threshold: Minimal number of different alleles
                in order to allow crossing two individuals.
        
        Returns:
            If at least threshold alleles are different,
            return a tuple of offspring. Else, return None.

        """
        assert threshold < len(self.genome)
        assert len(self.genome) == len(other.genome)
        # get locations where differ
        (different,) = np.where(self.genome != other.genome)
        # don't cross over
        if len(different) // 2 < threshold:
            return None
        # select chromosomes to swamp
        indices = np.random.choice(different, len(different) // 2, replace=False)
        # make children
        c1 = np.copy(self.genome)
        c1[indices] = other.genome[indices]
        c2 = np.copy(other.genome)
        c2[indices] = self.genome[indices]
        return Individual(c1), Individual(c2)
    
    def mutate(self, p: float) -> "Individual":
        """Mutation.
        
        Args:
            p: Percentage of bits to flip.
    
        """
        assert p < 1
        indices = np.random.choice(
            np.arange(len(self.genome)),
            int(p * len(self.genome)),
            replace=False
        )
        genome = np.copy(self.genome)
        genome[indices] = 1 - genome[indices]
        return Individual(genome)
        
    @property
    def genome(self):
        return self._genome
    
    @classmethod
    def generate(cls, n: int) -> "Individual":
        """Generate random genome."""
        return Individual(np.random.randint(2, size=n))

    @classmethod
    def generate_for(cls, df: pd.DataFrame) -> "Individual":
        """Generate for a dataframe."""
        return cls.generate(df.shape[1])
    
    def __len__(self):
        return len(self._genome)
    
    def __str__(self):
        return str(self._genome)
    
    def __hash__(self):
        return hash(self._genome.tobytes())

    
class Population:
    """Population."""
    
    def __init__(
            self, population: List[Individual], fitness: Callable[[Individual], float]
    ):
        """
        
        Args:
            population: Initial population.
            fitness: Fitness function.

        """
        self._population = {individual: fitness(individual.genome) for individual in population}
        self._n = len(self._population)
        self._fitness = fitness
        self._threshold = len(population[0]) // 4
    
    def evolve(self):
        """Evolve to next generation.
        
        Args:
            evaluate: Function that evaluates an individual.
    
        """ 
        
        # sample current population without replacement by shuffling
        # and popping two by two.
        shuffled = random.sample(list(self._population), len(self._population))
        
        # generate children
        children = list()
        while len(shuffled) > 0:
            p1 = shuffled.pop()
            p2 = shuffled.pop()
            offspring = p1.cross(p2, self._threshold)
            if offspring is not None:
                children.extend(offspring)
        
        print(children)
        
        # drop threshold if no offspring
        if len(children) == 0:
            self._threshold = self._threshold - 1
            
            # threshold drops to 0, perform cataclysmic mutation
            if self._theshold == 0:
                best = self.best()
                children = [best]
                while len(children) < self._n:
                    children.append(best.mutate())
        
        # collect all individuals of current iteration and
        # evaluate them
        combined = {child: self._fitness(child.genome) for child in children}
        combined.update(self._population)
        
        # elitist selection
        ranked = sorted(combined, key=combined.get, reverse=True)
        elites = ranked[:self._n]
        
        # replace current population
        self._population = {
            elite: combined[elite] for elite in elites
        }
    
    def evaluate(self, individual):
        return {individual: evaluator(individual) for individual in self._population}

    def best(self) -> Individual:
        return max(self._population, key=self._population.get)
    
    def best_n(self, n) -> List[Individual]:
        pass
    
    def average(self) -> float:
        return np.mean(list(self._population.values()))

    @property
    def individuals(self):
        return self._population
    
    @property
    def n(self):
        return self._n
        
    @classmethod
    def generate_for(self, df: pd.DataFrame, n: int, fitness) -> "Population":
        return Population([Individual.generate_for(df) for _ in range(n)], fitness)
    
    def __str__(self):
        return "\n".join("{} {}".format(i, s) for i, s in self._population.items())
    

    
class CHCGASelector(Selector):
    """Selector using GA."""
    
    def __init__(self, df, target):
        super().__init__(df, target)
    
    def rank(self):
        pass
    

evaluator = FeatureEvaluator(preselected, target="Survived")
population = Population.generate_for(preselected, n=20, fitness=evaluator.accuracy)

In [16]:
print(population)

[1 0 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1] 0.7528089887640449
[1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 1 1 0 1 0] 0.7752808988764045
[0 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0] 0.7640449438202247
[1 0 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 1] 0.8089887640449438
[0 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 0 0 0] 0.8089887640449438
[1 0 0 1 1 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 1 1 1 0 0 0] 0.7752808988764045
[0 0 1 0 1 1 1 1 0 1 1 1 0 0 0 1 1 0 1 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0] 0.8202247191011236
[1 1 1 1 0 0 1 1 0 0 1 1 1 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1] 0.7752808988764045
[1 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1] 0.7415730337078652
[0 0 0 1 0 1 1 0 0 1 0 1 0 0 0 1 0 1 1 1 1 0 1 1 0 1 0 1 0 1 0 1 1 0] 0.7191011235955056
[0 0 0 1 1 0 1 0 0 1 1 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 1] 0.7303370786516854
[1 1 0 1 1 1 0 0 0 0 

### SFFS

Sequential Forward Floating Selection. We don't use the adaptive version as there will often be many columns and that is too slow.

In [28]:
class SFFSelector(Selector):
    """Sequential Forward Floating Selection."""
    
    def __init__(self, df: pd.DataFrame, target, iterations: int = 20):
        super().__init__(df, target)
        self._iterations = iterations
        # initialise the mask to exclude everything
        self._mask = to_m_codes(df, target=target)[0]
        self._mask[self._mask == 0] = -1
        # mappings of number of features to best set found
        # with that number
        self._best = {}
        self._best_score = {}
    
    def rank(self, iterations=50):
        """Perform SFFS."""
        
        k = 0
        added = list()
        removed = list()
        
        for _ in range(iterations):
            
            # forward step (SFS)
            best, score = self.forward()
            self.add(best)
            added.append(best)
            k += 1
            print("Adding {}: {}".format(best, self._df.columns[best]))
            
            if k not in self._best:
                self._best[k] = self.mask()
                self._best_score[k] = score
    
            # don't perform backward if only one feature to remove
            if k < 2:
                continue
        
            # start conditional exclusion
            while True:
    
                # backward step (SBS)
                best, score = self.backward()
            
                # best (k - 1) subset so far
                if score > self._best_score[k - 1]:
                    self.remove(best)
                    removed.append(best)
                    k -= 1
                    self._best[k] = self.mask()
                    self._best_score[k] = score
                    print("Removing {}: {}".format(best, self._df.columns[best]))
                
                # back to start of algorithm
                else:
                    break
            
        print(pd.Series(added).value_counts())
        print(pd.Series(removed).value_counts())
        print(self._best)
        print(self._best_score)
        
        
                   
    def forward(self):
        """Perform forward step."""
        (to_add,) = np.where(self._mask == -1)
        scores = {f: self._evaluator.accuracy(self.mask(add=f)) for f in to_add}
        best = max(scores, key=scores.get)
        return best, scores[best]
        
    
    def backward(self):
        """Perform backward step."""
        (to_remove,) = np.where(self._mask == 0)
        scores = {f: self._evaluator.accuracy(self.mask(remove=f)) for f in to_remove}
        best = max(scores, key=scores.get)
        return best, scores[best]

    def add(self, feature):
        self._mask[feature] = 0
    
    def remove(self, feature):
        self._mask[feature] = -1
    
    def mask(self, add=None, remove=None):
        """Get mask with one update."""
        mask = np.copy(self._mask)
        if add is not None:
            mask[add] = 0
        if remove is not None:
            mask[remove] = -1
        return mask
    

sffs = SFFSelector(preselected, target="Survived")
sffs.rank(iterations=50)

Adding 14: Split( )(Name)_1
Adding 4: Parch
Adding 28: OneHot()(Parch)_0
Adding 29: OneHot()(Parch)_1
Adding 3: Sex
Adding 25: OneHot()(Sex)_female
Adding 26: OneHot()(Sex)_male
Adding 27: OneHot()(SibSp)_1
Adding 30: OneHot()(Embarked)_C
Adding 2: Pclass
Removing 27: OneHot()(SibSp)_1
Adding 31: OneHot()(Embarked)_S
Removing 3: Sex
Removing 4: Parch
Removing 25: OneHot()(Sex)_female
Removing 28: OneHot()(Parch)_0
Removing 30: OneHot()(Embarked)_C
Adding 3: Sex
Adding 4: Parch
Adding 23: OneHot()(Pclass)_1
Adding 24: OneHot()(Pclass)_3
Adding 25: OneHot()(Sex)_female
Adding 28: OneHot()(Parch)_0
Removing 2: Pclass
Adding 2: Pclass
Adding 30: OneHot()(Embarked)_C
Adding 32: Embarked
Adding 33: Lowercase()(Embarked)_Embarked
Adding 27: OneHot()(SibSp)_1
Adding 6: Split(-)(Name)_0
Adding 15: Split( )(Name)_2
Removing 2: Pclass
Removing 4: Parch
Removing 3: Sex
Adding 3: Sex
Adding 2: Pclass
Removing 25: OneHot()(Sex)_female
Adding 7: Split(. )(Name)_0
Adding 25: OneHot()(Sex)_female
Remov

In [29]:
preselected['Split( )(Name)_1']

0        Mr.
1       Mrs.
2      Miss.
3       Mrs.
4        Mr.
       ...  
886     Rev.
887    Miss.
888    Miss.
889      Mr.
890      Mr.
Name: Split( )(Name)_1, Length: 891, dtype: object

### AdaBoost

In [31]:
selected.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Split(-)(Name)_0', 'Split(. )(Name)_0', 'Split(. )(Name)_1',
       'Split(')(Name)_0', 'Split(.)(Name)_1', 'Split(, )(Name)_0',
       'Split(, )(Name)_1', 'Split( )(Name)_0', 'Split( )(Name)_1',
       'Split( )(Name)_2', 'Split(,)(Name)_1', 'Split(./)(Ticket)_0',
       'Split(. )(Ticket)_0', 'Split(/)(Ticket)_0', 'Split(.)(Ticket)_0',
       'Split( )(Ticket)_0', 'Lowercase()(Ticket)_Ticket',
       'OneHot()(Pclass)_1', 'OneHot()(Pclass)_2', 'OneHot()(Pclass)_3',
       'OneHot()(Sex)_female', 'OneHot()(Sex)_male', 'OneHot()(SibSp)_0',
       'OneHot()(SibSp)_1', 'OneHot()(SibSp)_2', 'OneHot()(SibSp)_3',
       'OneHot()(SibSp)_4', 'OneHot()(SibSp)_5', 'OneHot()(SibSp)_8',
       'OneHot()(Parch)_0', 'OneHot()(Parch)_1', 'OneHot()(Parch)_2',
       'OneHot()(Parch)_3', 'OneHot()(Parch)_4', 'OneHot()(Parch)_5',
       'OneHot()(Parch)_6', 'OneHot()(Embarked)_C', 'OneHot()(Embarked)_Q',
  

In [32]:
sampled.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Split( (")(Name)_0', 'Split(" )(Name)_0',
       'Split(-)(Name)_0', 'Split( ()(Name)_0', 'Split(. )(Name)_0',
       'Split(. )(Name)_1', 'Split(/)(Name)_0', 'Split((")(Name)_0',
       'Split(")(Name)_0', 'Split(. ()(Name)_0', 'Split() )(Name)_0',
       'Split())(Name)_0', 'Split(')(Name)_0', 'Split(()(Name)_0',
       'Split() ()(Name)_0', 'Split(.)(Name)_0', 'Split(.)(Name)_1',
       'Split("))(Name)_0', 'Split() (")(Name)_0', 'Split(, )(Name)_0',
       'Split(, )(Name)_1', 'Split( ")(Name)_0', 'Split( )(Name)_0',
       'Split( )(Name)_1', 'Split( )(Name)_2', 'Split(,)(Name)_0',
       'Split(,)(Name)_1', 'Split(./)(Ticket)_0', 'Split(. )(Ticket)_0',
       'Split(/)(Ticket)_0', 'Split(.)(Ticket)_0', 'Split( )(Ticket)_0',
       'ExtractWord([male, female])(Sex)_0', 'Lowercase()(Name)_Name',
       'Lowercase()(Sex)_Sex', 'Lowercase()(Ticket)_Ticket',
       'OneHot()(Pclass)_