#### Imports


In [1]:
from zoish.feature_selectors.recursive_feature_addition import RecursiveFeatureAdditionFeatureSelector
import xgboost
from sklearn.model_selection import KFold, train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score,make_scorer
from feature_engine.imputation import MeanMedianImputer


In this module the default logging will be applied.


#### Example: Audiology (Standardized) Data Set
###### https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29


#### Read data


In [2]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data"
urlname = "https://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.names"
# column names
col_names = [
    "class",
    "lymphatics",
    "block of affere",
    "bl. of lymph. c",
    "bl. of lymph. s",
    "by pass",
    "extravasates",
    "regeneration of",
    "early uptake in",
    "lym.nodes dimin",
    "lym.nodes enlar",
    "changes in lym.",
    "defect in node",
    "changes in node",
    "special forms",
    "dislocation of",
    "exclusion of no",
    "no. of nodes in",

]

data = pd.read_csv(urldata,names=col_names)
data.head()


Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,special forms,dislocation of,exclusion of no,no. of nodes in
3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


#### Define labels and train-test split


In [3]:


data.loc[(data["class"] == 1) | (data["class"] == 2), "class"] = 0
data.loc[data["class"] == 3, "class"] = 1
data.loc[data["class"] == 4, "class"] = 2
data["class"] = data["class"].astype(int)

#### Train test split


In [4]:
X = data.loc[:, data.columns != "class"]
y = data.loc[:, data.columns == "class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50,  random_state=42
)
y_test=y_test.values.ravel()
y_train=y_train.values.ravel()


#### Define feature selector step 


In [5]:

recursive_addition_feature_selector_factory = (
    RecursiveFeatureAdditionFeatureSelector.recursive_addition_feature_selector_factory.set_model_params(
        X=X_train,
        y=y_train,
        verbose=0,
        random_state=0,
        estimator=xgboost.XGBClassifier(),
        estimator_params={
            "max_depth": [4, 5],
            "gamma":[0.0,1.0],
            "learning_rate":[0.01,0.1]
        },
        fit_params = {
            "eval_set":None, 
            "eval_metric":None,
        },
        method="gridsearch",
        threshold=0.01,
        list_of_obligatory_features_that_must_be_in_model=[],
        list_of_features_to_drop_before_any_selection=[],
    ).set_recursive_addition_feature_params(
        cv=2,
        variables=None,
        scoring='roc_auc_ovr',
        confirm_variables=False,
        
    )
    .set_gridsearchcv_params(
        measure_of_accuracy=make_scorer(f1_score, greater_is_better=True, average='micro'),
        verbose=10,
        n_jobs=-1,
        cv=KFold(3),
    )
)

#### Find feature type for later use


In [6]:
int_cols = X_train.select_dtypes(include=["int"]).columns.tolist()
print(int_cols)

['lymphatics', 'block of affere', 'bl. of lymph. c', 'bl. of lymph. s', 'by pass', 'extravasates', 'regeneration of', 'early uptake in', 'lym.nodes dimin', 'lym.nodes enlar', 'changes in lym.', 'defect in node', 'changes in node', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in']


#### Define pipeline

In [7]:
pipeline = Pipeline(
    [
        # int missing values imputers
        (
            "intimputer",
            MeanMedianImputer(imputation_method="median", variables=int_cols),
        ),
        ("rafsf", recursive_addition_feature_selector_factory),
        # classification model
        ("logistic", LogisticRegression(solver='liblinear',max_iter=100)),
    ]
)

#### Run Pipeline


In [8]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3; 1/8] START gamma=0.0, learning_rate=0.01, max_depth=4..................
[CV 1/3; 1/8] END gamma=0.0, learning_rate=0.01, max_depth=4;, score=0.520 total time=   0.6s
[CV 2/3; 1/8] START gamma=0.0, learning_rate=0.01, max_depth=4..................
[CV 2/3; 1/8] END gamma=0.0, learning_rate=0.01, max_depth=4;, score=0.480 total time=   0.7s
[CV 3/3; 1/8] START gamma=0.0, learning_rate=0.01, max_depth=4..................
[CV 3/3; 1/8] END gamma=0.0, learning_rate=0.01, max_depth=4;, score=0.375 total time=   0.7s
[CV 1/3; 2/8] START gamma=0.0, learning_rate=0.01, max_depth=5..................
[CV 1/3; 2/8] END gamma=0.0, learning_rate=0.01, max_depth=5;, score=0.520 total time=   0.7s
[CV 2/3; 2/8] START gamma=0.0, learning_rate=0.01, max_depth=5..................
[CV 2/3; 2/8] END gamma=0.0, learning_rate=0.01, max_depth=5;, score=0.440 total time=   0.6s
[CV 3/3; 2/8] START gamma=0.0, learning_rate=0.01, max_depth=5...

#### Check performance of the Pipeline


In [9]:

print("Confusion matrix : ")
print(confusion_matrix(y_test, y_pred))



Confusion matrix : 
[[26  5  0]
 [19  4  0]
 [20  0  0]]


#### Get access to feature selector instance

In [10]:
print(RecursiveFeatureAdditionFeatureSelector.recursive_addition_feature_selector_factory.get_feature_selector_instance())

RecursiveFeatureAddition(cv=KFold(n_splits=3, random_state=None, shuffle=False),
                         estimator=XGBClassifier(base_score=0.5,
                                                 booster='gbtree',
                                                 callbacks=None,
                                                 colsample_bylevel=1,
                                                 colsample_bynode=1,
                                                 colsample_bytree=1,
                                                 early_stopping_rounds=None,
                                                 enable_categorical=False,
                                                 eval_metric=None, gamma=1.0,
                                                 gpu_id=-1,
                                                 grow_policy='depthwise',
                                                 importance_type=None,
                                                 interaction_constraints='',
  