# Pruning

In this notebook, we implement some unsupervised feature selection algorithms and hope that they're faster than learning the full model.

In [3]:
%load_ext autoreload
%autoreload 2

### Data

Load the titanic dataset and add some additional columns.

In [4]:
import avatar.language
import pandas as pd

# Read the raw Titanic demo data (the path is relative to the notebook's directory).
titanic = pd.read_csv("../data/raw/demo/titanic.csv")
# Project-specific language object; the next cell asks it for the
# transformations to apply to `titanic`.
language = avatar.language.WranglingLanguage()

In [5]:
transformations = language.transformations(titanic)

In [6]:
import tqdm

# Chain the transformations: each one receives the frame returned by the
# previous one, starting from the raw `titanic` frame.
titanic_new = titanic
for transformation in tqdm.tqdm(transformations):
    titanic_new = transformation(titanic_new)
# Display (rows, columns) of the fully transformed frame.
titanic_new.shape

100%|██████████| 67/67 [00:00<00:00, 148.83it/s]


(891, 974)

In [8]:
titanic_new

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,NaN(C78)(Cabin)_Cabin,NaN(E8)(Cabin)_Cabin,NaN(C93)(Cabin)_Cabin,NaN(B22)(Cabin)_Cabin,NaN(B18)(Cabin)_Cabin,NaN(B35)(Cabin)_Cabin,NaN(S)(Embarked)_Embarked,NaN(C)(Embarked)_Embarked,NaN(Q)(Embarked)_Embarked,WordToNumber()(Ticket)_Ticket
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,,,,,,,,S,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C85,C85,C85,C85,C85,C85,C,,C,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,,,,,,,,S,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,C123,C123,C123,C123,C123,C123,,S,S,113803.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,,,,,,,,S,S,373450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,...,,,,,,,,S,S,211536.0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,...,B42,B42,B42,B42,B42,B42,,S,S,112053.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,...,,,,,,,,S,S,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,...,C148,C148,C148,C148,C148,C148,C,,C,111369.0


## Categorical pruning

Prune columns that are categorical and contain too many different values.

In [9]:
from avatar.prune import CADPruner, NaNPruner

# Thresholds for the project pruners. Their exact meaning is defined in
# avatar.prune; presumably a distinct-value ratio for CADPruner and a NaN
# fraction for NaNPruner. TODO confirm.
cadp = CADPruner(0.90)
nanp = NaNPruner(0.75)

# NOTE(review): this rebinds titanic_new to its own pruned version, so
# re-running the cell prunes the already-pruned frame, not the original one.
titanic_new = nanp(cadp(titanic_new))
titanic_new

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,...,NaN(A/4 48871)(Ticket)_Ticket,NaN(113572)(Ticket)_Ticket,NaN(PC 17758)(Ticket)_Ticket,NaN(237736)(Ticket)_Ticket,NaN(31027)(Ticket)_Ticket,NaN(110465)(Ticket)_Ticket,NaN(S)(Embarked)_Embarked,NaN(C)(Embarked)_Embarked,NaN(Q)(Embarked)_Embarked,WordToNumber()(Ticket)_Ticket
0,1,0,3,male,22.0,1,0,A/5 21171,7.2500,S,...,A/5 21171,A/5 21171,A/5 21171,A/5 21171,A/5 21171,A/5 21171,,S,S,
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C,...,PC 17599,PC 17599,PC 17599,PC 17599,PC 17599,PC 17599,C,,C,
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,S,...,STON/O2. 3101282,STON/O2. 3101282,STON/O2. 3101282,STON/O2. 3101282,STON/O2. 3101282,STON/O2. 3101282,,S,S,
3,4,1,1,female,35.0,1,0,113803,53.1000,S,...,113803,113803,113803,113803,113803,113803,,S,S,113803.0
4,5,0,3,male,35.0,0,0,373450,8.0500,S,...,373450,373450,373450,373450,373450,373450,,S,S,373450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,211536,13.0000,S,...,211536,211536,211536,211536,211536,211536,,S,S,211536.0
887,888,1,1,female,19.0,0,0,112053,30.0000,S,...,112053,112053,112053,112053,112053,112053,,S,S,112053.0
888,889,0,3,female,,1,2,W./C. 6607,23.4500,S,...,W./C. 6607,W./C. 6607,W./C. 6607,W./C. 6607,W./C. 6607,W./C. 6607,,S,S,
889,890,1,1,male,26.0,0,0,111369,30.0000,C,...,111369,111369,111369,111369,111369,111369,C,,C,111369.0


In [91]:
# Encode every column as integer codes (pd.factorize numbers distinct values in
# order of first appearance; missing values get the sentinel -1), then compute
# pairwise correlations of those codes. Columns whose values are all distinct
# receive the codes 0..n-1, which is why PassengerId and Name correlate at 1.0.
titanic.apply(lambda x: pd.factorize(x)[0]).corr()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,1.0,-0.005007,0.018305,1.0,-0.042939,0.12015,-0.031023,0.001206,0.760875,0.299982,0.241918,-0.030323
Survived,-0.005007,1.0,0.247845,-0.005007,0.543351,0.042743,-0.136302,0.08621,-0.047298,0.191981,0.270495,0.101849
Pclass,0.018305,0.247845,1.0,0.018305,0.118133,0.180735,-0.14963,-0.000343,0.020431,0.079498,0.187437,-0.168809
Name,1.0,-0.005007,0.018305,1.0,-0.042939,0.12015,-0.031023,0.001206,0.760875,0.299982,0.241918,-0.030323
Sex,-0.042939,0.543351,0.118133,-0.042939,1.0,0.009117,-0.046152,0.242417,-0.132709,0.137725,0.082104,0.111249
Age,0.12015,0.042743,0.180735,0.12015,0.009117,1.0,0.003777,0.13616,0.070385,0.114674,0.165159,-0.152716
SibSp,-0.031023,-0.136302,-0.14963,-0.031023,-0.046152,0.003777,1.0,0.165255,-0.122426,-0.046556,-0.108709,-0.030612
Parch,0.001206,0.08621,-0.000343,0.001206,0.242417,0.13616,0.165255,1.0,-0.262607,0.074306,-0.002761,-0.082982
Ticket,0.760875,-0.047298,0.020431,0.760875,-0.132709,0.070385,-0.122426,-0.262607,1.0,0.303257,0.212438,-0.020135
Fare,0.299982,0.191981,0.079498,0.299982,0.137725,0.114674,-0.046556,0.074306,0.303257,1.0,0.379659,0.077933


## Unsupervised Feature Selection

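We focus on filter methods as they are faster than wrappers. These methods typically use intrinsic statistical properties of the data (e.g. variance, similarity or correlation between features) to score features, without training a predictive model.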

In [None]:
def to_numerical(df):
    """Make dataframe features numerical by one-hot encoding.

    Every ``object``-dtype column is expanded into one indicator column per
    distinct value (via ``pd.get_dummies``); every other column is copied
    through unchanged. Column order follows the order of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.

    Returns
    -------
    values : numpy.ndarray
        2-D array holding the encoded columns.
    n2o : dict of int -> int
        Maps the position of each encoded column in ``values`` to the
        position of the column of ``df`` it was derived from.
    """
    pieces = []
    n2o = dict()
    next_position = 0
    for old_position in range(df.shape[1]):
        # Positional access keeps this correct even if df has duplicate labels.
        series = df.iloc[:, old_position]
        if series.dtype == "object":
            # Float indicators: boolean dummies mixed with float columns would
            # turn `.values` into an object array on recent pandas versions.
            piece = pd.get_dummies(series, dtype=float)
        else:
            piece = series.to_frame()
        pieces.append(piece)
        # Record positions directly instead of looking labels up afterwards:
        # dummy labels are category values, which may repeat across columns
        # (e.g. the same ticket string in several derived columns), so a
        # label-based lookup is ambiguous.
        for _ in range(piece.shape[1]):
            n2o[next_position] = old_position
            next_position += 1
    if not pieces:
        # No columns at all: keep the row count, return an empty mapping.
        return pd.DataFrame(index=df.index).values, n2o
    # Single concat at the end instead of growing a frame inside the loop.
    ndf = pd.concat(pieces, axis=1)
    return ndf.values, n2o
    

# NOTE(review): titanic_new was already passed through cadp in the pruning
# cell; confirm whether applying it a second time here is intentional.
titanic_num, n2o = to_numerical(cadp(titanic_new))
titanic_num

### Principal Feature Analysis

Using [this implementation](https://stats.stackexchange.com/a/203978) based on the idea from [this paper](http://venom.cs.utsa.edu/dmz/techrep/2007/CS-TR-2007-011.pdf). The downside is that the number of features to keep must be chosen in advance and all data has to be numerical.

In [39]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from collections import defaultdict


class PFA(object):
    """Principal Feature Analysis (PFA) feature selector.

    The data is standardised and decomposed with PCA; the per-feature loading
    vectors are clustered with k-means and, for every cluster, the original
    feature whose loading vector lies closest to the cluster centre is kept.

    Parameters
    ----------
    n_features : int
        Number of features to select (equals the number of k-means clusters).
    q : int or None, optional
        Number of principal components. A falsy value (``None`` or ``0``)
        means "as many as the data allows", i.e. ``min(n_samples, n_columns)``.
    random_state : int, numpy RandomState or None, optional
        Forwarded to ``PCA`` and ``KMeans``. Set it to make the selection
        reproducible; the default ``None`` keeps the unseeded behaviour.

    Attributes
    ----------
    q_ : int
        Number of components actually used by the last ``fit``.
    indices_ : list of int
        Column indices of the selected features, one per non-empty cluster.
    features_ : numpy.ndarray
        The standardised, NaN-filled data restricted to ``indices_``.
    """

    def __init__(self, n_features, q=None, random_state=None):
        self.q = q
        self.n_features = n_features
        self.random_state = random_state

    def fit(self, X):
        """Select ``n_features`` representative columns of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Numerical data; NaNs are replaced by 0 before scaling.

        Returns
        -------
        self : PFA
            The fitted selector.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``n_features`` is not within
            ``1..n_columns``.
        """
        # Cast before filling: np.nan_to_num leaves object arrays untouched,
        # so NaNs in an object-dtype input would otherwise survive.
        X = np.nan_to_num(np.asarray(X, dtype=float))
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional, got %d dimension(s)" % X.ndim)
        n_samples, n_columns = X.shape
        if not 1 <= self.n_features <= n_columns:
            raise ValueError(
                "n_features must be between 1 and the number of columns (%d), got %r"
                % (n_columns, self.n_features)
            )

        # Resolve q into a fitted attribute instead of overwriting the
        # hyper-parameter, so the same instance can be refit on other data.
        # PCA cannot extract more than min(n_samples, n_columns) components.
        self.q_ = self.q if self.q else min(n_samples, n_columns)

        X = StandardScaler().fit_transform(X)

        pca = PCA(n_components=self.q_, random_state=self.random_state).fit(X)
        # Rows of A_q are features, columns are principal components.
        A_q = pca.components_.T

        kmeans = KMeans(n_clusters=self.n_features,
                        random_state=self.random_state).fit(A_q)
        clusters = kmeans.predict(A_q)

        # Distance of every feature's loading vector to its own cluster centre.
        dists = np.linalg.norm(A_q - kmeans.cluster_centers_[clusters], axis=1)

        # Keep, per cluster, the first feature with the smallest distance.
        # Clusters are reported in order of first appearance.
        nearest = {}
        for i, (c, dist) in enumerate(zip(clusters, dists)):
            if c not in nearest or dist < nearest[c][1]:
                nearest[c] = (i, dist)

        self.indices_ = [i for i, _ in nearest.values()]
        self.features_ = X[:, self.indices_]
        return self

In [44]:
pfa = PFA(n_features=50)
pfa.fit(titanic_num)

In [45]:
pfa.indices_

[472,
 61,
 17,
 54,
 306,
 181,
 506,
 296,
 439,
 16,
 151,
 146,
 387,
 129,
 366,
 214,
 596,
 233,
 33,
 156,
 475,
 288,
 58,
 243,
 64,
 384,
 282,
 74,
 76,
 643,
 800,
 112,
 257,
 118,
 136,
 344,
 687,
 204,
 582,
 681,
 287,
 621,
 298,
 747,
 404,
 813,
 457,
 561,
 589,
 822]

Next, we have some algorithms from the [`skfeature`](http://featureselection.asu.edu/index.php) package.

### SPEC

In [46]:
from skfeature.function.similarity_based import SPEC

In [50]:
# specify the second ranking function which uses all except the 1st eigenvalue
kwargs = {'style': 0}

# obtain the scores of features
# (np.nan_to_num replaces NaNs by 0 first, so missing entries are scored as
# zeros rather than being skipped)
score = SPEC.spec(np.nan_to_num(titanic_num), **kwargs)

In [52]:
idx = SPEC.feature_ranking(score, **kwargs)
idx

array([287, 293, 372, 357, 127, 236, 481, 363, 651, 482, 659, 653, 183,
        86, 382, 384, 181, 225, 495, 375, 374, 628, 489, 833, 458, 457,
       406, 343, 179, 281, 252, 201, 452, 366, 291, 370, 429, 280, 572,
       646, 475, 512, 220, 838, 469, 460, 184, 275, 500, 634, 380, 411,
         3,   4, 837, 534, 650, 839, 315, 157, 625, 165, 472, 361,  84,
       391, 214, 187, 526, 657, 282, 674,   1, 167, 832, 623, 519, 444,
       450, 456, 533, 499, 195, 503, 390, 351, 399, 479, 560, 638, 529,
       445, 501, 405,   6, 200, 368, 680, 133, 209, 564, 276, 491, 541,
       318, 115, 454, 258, 285, 350, 316, 386,   7, 570,   2, 296, 376,
         5, 354, 433, 829, 272, 667, 420, 627, 158, 125, 193, 319, 588,
       544, 573, 109, 440, 520, 810, 178, 575, 689, 441, 410,   0, 196,
       418, 655, 581, 266, 490, 486, 409, 455, 379, 295, 320, 547, 332,
       191, 154, 213, 367, 346, 328, 682, 221, 446, 340, 687, 113, 207,
       128, 640, 263, 447, 305, 349, 210, 333, 666, 253, 215, 10

## Supervised

In [2]:
import time
from mercs.core import Mercs
from avatar.analysis import *

We sample random columns to generate a dataset in which there exists at least one row without NaN in order to make MERCS work.

In [87]:
sampler = UniformColumnSampler(titanic_new)
sampler.sample()

Unnamed: 0,Embarked,Split( )(Ticket)_1,NaN(male)(Sex)_Sex,NaN(347088)(Ticket)_Ticket,NaN(3101295)(Ticket)_Ticket,NaN(349909)(Ticket)_Ticket,NaN(17421)(Ticket)_Ticket,NaN(19950)(Ticket)_Ticket,NaN(LINE)(Ticket)_Ticket,NaN(347742)(Ticket)_Ticket,...,NaN(PC 17477)(Ticket)_Ticket,NaN(WE/P 5735)(Ticket)_Ticket,NaN(367230)(Ticket)_Ticket,NaN(248738)(Ticket)_Ticket,NaN(26360)(Ticket)_Ticket,NaN(11767)(Ticket)_Ticket,NaN(17453)(Ticket)_Ticket,NaN(PC 17758)(Ticket)_Ticket,NaN(237736)(Ticket)_Ticket,NaN(31027)(Ticket)_Ticket
0,True,True,False,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,False,False,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,True,False,False,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
887,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
888,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
889,True,False,False,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [115]:
wsampler = WeightedColumnSampler(titanic_new)
wsampler.weights

PassengerId                      0.000000
Survived                         0.000000
Pclass                           0.000000
Sex                              0.000000
Age                              0.796357
                                   ...   
NaN(110465)(Ticket)_Ticket       0.106642
NaN(S)(Embarked)_Embarked        0.995540
NaN(C)(Embarked)_Embarked        0.790149
NaN(Q)(Embarked)_Embarked        0.672245
WordToNumber()(Ticket)_Ticket    0.836655
Length: 1030, dtype: float64

In [114]:
pd.Series(0, index=titanic_new.columns)

PassengerId                      0
Survived                         0
Pclass                           0
Sex                              0
Age                              0
                                ..
NaN(110465)(Ticket)_Ticket       0
NaN(S)(Embarked)_Embarked        0
NaN(C)(Embarked)_Embarked        0
NaN(Q)(Embarked)_Embarked        0
WordToNumber()(Ticket)_Ticket    0
Length: 1030, dtype: int64

In [117]:
wsampler.weights / 2

PassengerId                      0.000000
Survived                         0.000000
Pclass                           0.000000
Sex                              0.000000
Age                              0.398178
                                   ...   
NaN(110465)(Ticket)_Ticket       0.053321
NaN(S)(Embarked)_Embarked        0.497770
NaN(C)(Embarked)_Embarked        0.395074
NaN(Q)(Embarked)_Embarked        0.336122
WordToNumber()(Ticket)_Ticket    0.418327
Length: 1030, dtype: float64

In [14]:
ssampler = SmartColumnSampler(titanic_new)
ssampler.smart_weights

PassengerId                      0
Survived                         0
Pclass                           0
Name                             0
Sex                              0
                                ..
NaN(111426)(Ticket)_Ticket       0
NaN(19996)(Ticket)_Ticket        0
NaN(111369)(Ticket)_Ticket       0
NaN(C111)(Cabin)_Cabin           0
WordToNumber()(Ticket)_Ticket    0
Length: 974, dtype: int64
PassengerId                      0.000000
Survived                         0.000000
Pclass                           0.000000
Name                             0.000000
Sex                              0.000000
                                   ...   
NaN(111426)(Ticket)_Ticket       0.000000
NaN(19996)(Ticket)_Ticket        0.102065
NaN(111369)(Ticket)_Ticket       0.000000
NaN(C111)(Cabin)_Cabin           0.962093
WordToNumber()(Ticket)_Ticket    0.800751
Length: 974, dtype: float64


PassengerId                      0.000000
Survived                         0.000000
Pclass                           0.000000
Name                             0.000000
Sex                              0.000000
                                   ...   
NaN(111426)(Ticket)_Ticket       0.000000
NaN(19996)(Ticket)_Ticket        0.051033
NaN(111369)(Ticket)_Ticket       0.000000
NaN(C111)(Cabin)_Cabin           0.481047
WordToNumber()(Ticket)_Ticket    0.400376
Length: 974, dtype: float64

In [13]:
titanic_new.Survived[4]

0

In [14]:
# Convert the frame for MERCS. The second return value is what the cells below
# pass to Mercs as `nominal_attributes`.
titanic_mercs, nominal_attributes = to_mercs(titanic_new)
# Build the MERCS codes for the "Survived" column; presumably this marks it as
# the prediction target. TODO confirm against make_m_codes.
m_codes = make_m_codes(titanic_new, "Survived")
m_codes

array([[0., 1., 0., ..., 0., 0., 0.]])

In [11]:
# Full-depth MERCS configuration.
cfg_mercs_default = {
    # Induction
    "max_depth": 8,
    "selection_algorithm": "default",
    "nb_targets": 1,
    "nb_iterations": 1,
    "n_jobs": 1,
    # Inference
    "inference_algorithm": "own",
    "prediction_algorithm": "mi",
    "max_steps": 8,
    # Metadata
    "nominal_attributes": nominal_attributes,
}

# Stump variant: identical to the default configuration except that the
# trees are limited to depth 1.
cfg_mercs_stump = {**cfg_mercs_default, "max_depth": 1}

In [None]:
# Wall-clock timing of a MERCS fit with the full-depth configuration.
s = time.time()

# Train a MERCS-model with config above
clf = Mercs(**cfg_mercs_default)

# Nominal attributes have to be given at fit, all the rest can be done before
clf.fit(titanic_mercs,
        nominal_attributes=nominal_attributes,
        m_codes=m_codes)

e = time.time()
# Elapsed seconds, shown as the cell output.
e - s

In [73]:
# Wall-clock timing of a MERCS fit with the depth-1 (stump) configuration.
s = time.time()

# Train a MERCS-model with config above
clf = Mercs(**cfg_mercs_stump)

# Nominal attributes have to be given at fit, all the rest can be done before
# NOTE(review): the recorded run of this cell failed with
# "ValueError: Found array with 0 sample(s)". Unlike the default-config cell,
# no m_codes is passed here, and titanic_mercs contains NaNs; confirm which of
# the two leaves MERCS without training samples.
clf.fit(titanic_mercs, nominal_attributes=nominal_attributes)

e = time.time()
# Elapsed seconds, shown as the cell output.
e - s


        Only 0 samples available for training.
        min_nb_samples is set to 10.
        Therefore no training occured.
        


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [75]:
print(titanic_mercs)

[[1.00000e+00 0.00000e+00 3.00000e+00 ... 1.00000e+00 1.00000e+00
          nan]
 [2.00000e+00 1.00000e+00 1.00000e+00 ...         nan 0.00000e+00
          nan]
 [3.00000e+00 1.00000e+00 3.00000e+00 ... 1.00000e+00 1.00000e+00
          nan]
 ...
 [8.89000e+02 0.00000e+00 3.00000e+00 ... 1.00000e+00 1.00000e+00
          nan]
 [8.90000e+02 1.00000e+00 1.00000e+00 ...         nan 0.00000e+00
  1.11369e+05]
 [8.91000e+02 0.00000e+00 3.00000e+00 ... 0.00000e+00         nan
  3.70376e+05]]
