#### Imports


In [1]:
from zoish.feature_selectors.recursive_feature_elimination import RecursiveFeatureEliminationFeatureSelector
import xgboost
from sklearn.model_selection import KFold, train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score,make_scorer
from feature_engine.imputation import MeanMedianImputer


2023-02-24 21:22:04,546 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/zoish/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-02-24 21:22:04,555 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-02-24 21:22:04,561 :: matplotlib :: interactive is False
2023-02-24 21:22:04,562 :: matplotlib :: platform is darwin
2023-02-24 21:22:04,763 :: matplotlib :: CACHEDIR=/Users/hjavedani/.matplotlib
2023-02-24 21:22:04,768 :: matplotlib.font_manager :: Using fontManager instance from /Users/hjavedani/.matplotlib/fontlist-v330.json
2023-02-24 21:22:09,022 :: graphviz._tools :: deprecate positional args: graphviz.backend.piping.pipe(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-02-24 21:22:09,026 :: graphviz._tools :: deprecate positional args: graphviz.backend.rendering.render(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-02-24 21:22:09,032 :: graphviz._tools :: deprecate positional args: graphviz.backend.unflattening.unflatten(['stagger'

#### Example: Audiology (Standardized) Data Set
###### https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29


#### Read data


In [2]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data"
urlname = "https://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.names"
# column names
col_names = [
    "class",
    "lymphatics",
    "block of affere",
    "bl. of lymph. c",
    "bl. of lymph. s",
    "by pass",
    "extravasates",
    "regeneration of",
    "early uptake in",
    "lym.nodes dimin",
    "lym.nodes enlar",
    "changes in lym.",
    "defect in node",
    "changes in node",
    "special forms",
    "dislocation of",
    "exclusion of no",
    "no. of nodes in",

]

data = pd.read_csv(urldata,names=col_names)
data.head()


Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,special forms,dislocation of,exclusion of no,no. of nodes in
3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


#### Define labels and train-test split


In [3]:


data.loc[(data["class"] == 1) | (data["class"] == 2), "class"] = 0
data.loc[data["class"] == 3, "class"] = 1
data.loc[data["class"] == 4, "class"] = 2
data["class"] = data["class"].astype(int)

#### Train test split


In [4]:
X = data.loc[:, data.columns != "class"]
y = data.loc[:, data.columns == "class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33,  random_state=42
)
y_test=y_test.values.ravel()
y_train=y_train.values.ravel()


#### Define feature selector step 


In [5]:

recursive_feature_elimination_feature_selector = (
    RecursiveFeatureEliminationFeatureSelector.recursive_elimination_feature_selector_factory.set_model_params(
        X=X_train,
        y=y_train,
        verbose=0,
        random_state=0,
        estimator=xgboost.XGBClassifier(),
        estimator_params={
            "max_depth": [4, 5],
        },
        fit_params = {
            "sample_weight": None,
        },
        method="tunegridsearch",
        threshold=0.02,
        list_of_obligatory_features_that_must_be_in_model=[],
        list_of_features_to_drop_before_any_selection=[],
    ).set_recursive_elimination_feature_params(
        cv=2,
        variables=None,
        scoring='roc_auc_ovr',
        confirm_variables=False,
        
    )
    .set_tunegridsearchcv_params(
        verbose=1,
        n_jobs=None,
        cv=3,
        early_stopping=None, 
        # if scoring is None, then measure_of_accuracy will be applied
        scoring=None, 
        refit=True, 
        error_score='raise', 
        return_train_score=False, 
        local_dir='~/ray_results', 
        name=None, 
        max_iters=1, 
        use_gpu=False, 
        loggers=None, 
        pipeline_auto_early_stop=True, 
        stopper=None, 
        time_budget_s=None, 
        measure_of_accuracy=make_scorer(f1_score, greater_is_better=True, average='macro'),
        mode=None

)
)

#### Find feature type for later use


In [6]:
int_cols = X_train.select_dtypes(include=["int"]).columns.tolist()
print(int_cols)

['lymphatics', 'block of affere', 'bl. of lymph. c', 'bl. of lymph. s', 'by pass', 'extravasates', 'regeneration of', 'early uptake in', 'lym.nodes dimin', 'lym.nodes enlar', 'changes in lym.', 'defect in node', 'changes in node', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in']


#### Define pipeline

In [7]:
pipeline = Pipeline(
    [
        # int missing values imputers
        (
        "intimputer",
        MeanMedianImputer(imputation_method="median", variables=int_cols),
        ),
        ("sfefs", recursive_feature_elimination_feature_selector),
        # classification model
        ("logistic", LogisticRegression(solver='liblinear',max_iter=100)),
    ]
)

#### Run Pipeline


In [8]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


0,1
Current time:,2023-02-24 21:22:52
Running for:,00:00:25.83
Memory:,17.4/32.0 GiB

Trial name,status,loc,max_depth,iter,total time (s),split0_test_score,split1_test_score,split2_test_score
_Trainable_6350e_00000,TERMINATED,127.0.0.1:74391,4,1,0.246514,0.393939,0.393939,0.545455
_Trainable_6350e_00001,TERMINATED,127.0.0.1:74399,5,1,0.262734,0.484848,0.424242,0.545455


2023-02-24 21:22:52,588	INFO tune.py:762 -- Total run time: 27.50 seconds (25.81 seconds for the tuning loop).


2023-02-24 21:22:53,371 :: dev :: The optimization will be based on make_scorer(f1_score, average=macro) metric!
2023-02-24 21:22:53,371 :: dev :: The optimization will be based on make_scorer(f1_score, average=macro) metric!
['bl. of lymph. c', 'early uptake in', 'defect in node', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in']
{'exclusion of no': -0.04040404040404033, 'by pass': 0.03030303030303022, 'extravasates': 0.03030303030303022, 'lymphatics': 0.07070707070707061, 'lym.nodes enlar': 0.05050505050505044, 'no. of nodes in': 0.0, 'defect in node': -0.02020202020202022, 'special forms': 0.0, 'changes in node': 0.04040404040404033, 'bl. of lymph. c': 0.010101010101010055, 'dislocation of': 0.010101010101010166, 'block of affere': 0.0606060606060605, 'changes in lym.': 0.05050505050505044, 'regeneration of': 0.04040404040404033, 'lym.nodes dimin': 0.08080808080808077, 'bl. of lymph. s': 0.07070707070707061, 'early uptake in': 0.010101010101010055}
dict_keys([

#### Check performance of the Pipeline


In [9]:

print("F1 score : ")
print(f1_score(y_test, y_pred,average='micro'))
print("Classification report : ")
print(classification_report(y_test, y_pred))
print("Confusion matrix : ")
print(confusion_matrix(y_test, y_pred))



F1 score : 
0.46938775510204084
Classification report : 
              precision    recall  f1-score   support

           0       0.47      0.81      0.60        21
           1       0.45      0.36      0.40        14
           2       0.50      0.07      0.12        14

    accuracy                           0.47        49
   macro avg       0.48      0.41      0.37        49
weighted avg       0.48      0.47      0.41        49

Confusion matrix : 
[[17  4  0]
 [ 8  5  1]
 [11  2  1]]


#### Get access to feature selector instance

In [10]:
print(RecursiveFeatureEliminationFeatureSelector.recursive_elimination_feature_selector_factory.get_feature_selector_instance())

RecursiveFeatureElimination(estimator=XGBClassifier(base_score=0.5,
                                                    booster='gbtree',
                                                    callbacks=None,
                                                    colsample_bylevel=1,
                                                    colsample_bynode=1,
                                                    colsample_bytree=1,
                                                    early_stopping_rounds=None,
                                                    enable_categorical=False,
                                                    eval_metric=None, gamma=0,
                                                    gpu_id=-1,
                                                    grow_policy='depthwise',
                                                    importance_type=None,
                                                    interaction_constraints='',
                                              