# Import Libraries

In [1]:
from generate_synthetic_data import *

import sklearn.ensemble as Ensemble
import sklearn.tree as Tree
import pandas as pd
import numpy as np

from feature_importance_factory import FeatureImportanceFactory
from feature_importance_controller import FeatureImportanceController
from feature_importance_strategy import FeatureImportanceStrategy

from feature_importance_mdi import FeatureImportanceMDI
from feature_importance_mda import FeatureImportanceMDA
from clustered_feature_importance_mdi import ClusteredFeatureImportanceMDI
from clustered_feature_importance_mda import ClusteredFeatureImportanceMDA


# Generate Synthetic Test Data

In [2]:
X, y = get_test_dataset(
    n_features=40,
    n_informative=5, 
    n_redundant=30,
    n_samples=10000,
    sigma_std=0.1
)

In [3]:
X.columns

Index(['I_0', 'I_1', 'I_2', 'I_3', 'I_4', 'N_0', 'N_1', 'N_2', 'N_3', 'N_4',
       'R_0', 'R_1', 'R_2', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9',
       'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', 'R_16', 'R_17', 'R_18',
       'R_19', 'R_20', 'R_21', 'R_22', 'R_23', 'R_24', 'R_25', 'R_26', 'R_27',
       'R_28', 'R_29'],
      dtype='object')

# MDI

In [4]:
classifier = Ensemble.RandomForestClassifier(
    n_estimators=10,
    max_features=1.0,
    max_samples=1.0,
    oob_score=False,
)

strategy = FeatureImportanceMDI(
    classifier, x=X, y=y,
)

results = FeatureImportanceFactory().\
        build(strategy).\
        get()

results

Unnamed: 0,Mean,StandardDeviation
I_0,0.006018,0.001564
I_1,0.085116,0.039256
I_2,0.121092,0.013242
I_3,0.004413,0.000905
I_4,0.041678,0.014187
N_0,0.008484,0.00076
N_1,0.010324,0.000789
N_2,0.009247,0.000749
N_3,0.009024,0.000767
N_4,0.008622,0.000722


# MDA

In [5]:
from feature_importance_mda import FeatureImportanceMDA

classifier = Ensemble.RandomForestClassifier(
    n_estimators=10,
    max_features=1.0,
    max_samples=1.0,
    oob_score=False,
)

strategy = FeatureImportanceMDA(
    classifier, x=X, y=y, n_splits=5, 
)

results = FeatureImportanceFactory().\
        build(strategy).\
        get()

results



Fold 0 start ...
Fold 1 start ...
Fold 2 start ...
Fold 3 start ...
Fold 4 start ...


Unnamed: 0,Mean,StandardDeviation
I_0,-0.071576,0.026545
I_1,-0.524957,0.065625
I_2,-0.130656,0.126695
I_3,-0.010264,0.024802
I_4,-0.174523,0.04283
N_0,-0.021184,0.018419
N_1,-0.015811,0.004653
N_2,-0.01547,0.014726
N_3,-0.013066,0.011061
N_4,-0.022642,0.010627


# SFI

In [6]:
from feature_importance_sfi import FeatureImportanceSFI

classifier = Ensemble.RandomForestClassifier(
    n_estimators=10,
    max_features=1.0,
    max_samples=1.0,
    oob_score=False,
)

strategy = FeatureImportanceSFI(
    classifier, x=X, y=y, n_splits=5, 
)

results = FeatureImportanceFactory().\
        build(strategy).\
        get()

results



  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.append(
  importances = importances.

Unnamed: 0,FeatureName,Mean,StandardDeviation
0,I_0,-8.234371,1.916671
1,I_1,-5.72725,0.290894
2,I_2,-7.989087,2.111712
3,I_3,-7.968208,0.827692
4,I_4,-8.111502,0.92564
5,N_0,-8.257195,1.039456
6,N_1,-8.011069,0.840469
7,N_2,-8.083236,0.860715
8,N_3,-8.163473,1.156772
9,N_4,-8.245054,0.771218


# Clustered MDI

In [7]:
import warnings
from clustering import cluster_kmeans_base
warnings.filterwarnings('ignore')

corr0, clusters, silh = cluster_kmeans_base(
    X.corr(),
    number_clusters=25,
    iterations=20
)

In [8]:
classifier = Ensemble.RandomForestClassifier(
    n_estimators=10,
    max_features=1.0,
    max_samples=1.0,
    oob_score=False,
)

strategy = ClusteredFeatureImportanceMDI(
    classifier, clusters=clusters, x=X, y=y, 
)

results = FeatureImportanceFactory().\
        build(strategy).\
        get()

results

Unnamed: 0,Mean,StandardDeviation
C_0,0.096197,0.00239
C_1,0.046431,0.001962
C_2,0.110366,0.004264
C_3,0.355468,0.002147
C_4,0.174482,0.003277
C_5,0.217056,0.003357


# Clustered MDA

In [9]:
import warnings
from clustering import cluster_kmeans_base
warnings.filterwarnings('ignore')

corr0, clusters, silh = cluster_kmeans_base(
    X.corr(),
    number_clusters=25,
    iterations=20
)

In [10]:
classifier = Ensemble.RandomForestClassifier(
    n_estimators=10,
    max_features=1.0,
    max_samples=1.0,
    oob_score=False,
)

strategy = ClusteredFeatureImportanceMDI(
    classifier, clusters=clusters, x=X, y=y, 
)

results = FeatureImportanceFactory().\
        build(strategy).\
        get()

results

Unnamed: 0,Mean,StandardDeviation
C_0,0.092062,0.003141
C_1,0.109309,0.002859
C_2,0.35822,0.003222
C_3,0.17584,0.0034
C_4,0.212848,0.003097
C_5,0.051721,0.001781


# Orthogonal

In [16]:
from orthogonal_features import orthogonal_features

orthogonal_features, eigen_dataframe = orthogonal_features(
    X,
    variance_threshold=0.95
)

orthogonal_features

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.966873,0.036157,0.173458,1.077774,-1.503938,0.130776,0.668909,-2.058400,-1.006863
1,-0.942325,0.077657,-0.353651,0.501068,-0.202194,0.613058,0.346068,-1.031550,0.131250
2,-0.716439,-0.100930,1.283325,0.848877,-0.707392,-0.737716,0.302418,-0.792607,0.121775
3,-1.134535,-0.250868,-0.106080,-0.126013,-2.322576,-0.998544,-0.297545,-0.326369,0.363300
4,-0.329927,1.214664,-0.432226,-0.428989,-0.210331,-0.518758,-0.687217,-0.505478,-0.652000
...,...,...,...,...,...,...,...,...,...
9995,1.376160,1.152348,1.887187,-0.651099,2.773032,1.152963,-1.127660,0.659086,0.587282
9996,1.462891,0.338382,0.231409,-0.444821,0.675293,-1.244900,-0.090487,1.449306,0.378862
9997,-0.418783,-1.451355,0.065706,0.890295,-1.736130,-0.795710,1.396982,0.312831,0.852758
9998,0.314944,-0.312340,1.215733,-0.166766,0.009004,0.138418,-0.108078,0.610226,1.498948


In [18]:
eigen_dataframe

Unnamed: 0,Index,EigenValue,EigenVector,CumulativeVariance
39,PC 40,109727.227777,"[-0.00025334576145934765, -0.00237133515422773...",0.274346
38,PC 39,100434.654131,"[-0.004061988096094889, -0.002819233026706929,...",0.525457
37,PC 38,67438.681324,"[-0.012750920515012032, -0.15538198082887494, ...",0.694071
36,PC 37,45755.859659,"[-0.11817532658153271, 0.013753531931629153, -...",0.808472
35,PC 36,25059.321881,"[-0.0007120315058640236, -0.000790528582073290...",0.871126
34,PC 35,10281.068949,"[0.0025267470907618847, -0.0012484949518017825...",0.896832
33,PC 34,10158.207601,"[-0.12141511085305613, 0.007933152646664545, -...",0.92223
32,PC 33,9969.954282,"[-0.0008416811806972855, 0.0034413089304787827...",0.947157
31,PC 32,9885.702847,"[-0.11697775455102725, 0.006973184401419617, -...",0.971874
