#### Imports


In [1]:
from zoish.feature_selectors.shap_selectors import ShapFeatureSelector
import pandas as pd
import xgboost
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer

In this module, the default logging will be applied. The error is [Errno 2] No such file or directory: 'zoish/config.yaml' which will be skipped!
2023-02-24 18:25:16,446 :: graphviz._tools :: deprecate positional args: graphviz.backend.piping.pipe(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-02-24 18:25:16,448 :: graphviz._tools :: deprecate positional args: graphviz.backend.rendering.render(['renderer', 'formatter', 'neato_no_op', 'quiet'])
2023-02-24 18:25:16,452 :: graphviz._tools :: deprecate positional args: graphviz.backend.unflattening.unflatten(['stagger', 'fanout', 'chain', 'encoding'])
2023-02-24 18:25:16,454 :: graphviz._tools :: deprecate positional args: graphviz.backend.viewing.view(['quiet'])
2023-02-24 18:25:16,462 :: graphviz._tools :: deprecate positional args: graphviz.quoting.quote(['is_html_string', 'is_valid_id', 'dot_keywords', 'endswith_odd_number_of_backslashes', 'escape_unescaped_quotes'])
2023-02-24 18:25:16,463 :: graphviz._tools :: deprecate posi

#### Example: Audiology (Standardized) Data Set
###### https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29


#### Read data


In [2]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data"
urlname = "https://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.names"
# column names
col_names = [
    "class",
    "lymphatics",
    "block of affere",
    "bl. of lymph. c",
    "bl. of lymph. s",
    "by pass",
    "extravasates",
    "regeneration of",
    "early uptake in",
    "lym.nodes dimin",
    "lym.nodes enlar",
    "changes in lym.",
    "defect in node",
    "changes in node",
    "special forms",
    "dislocation of",
    "exclusion of no",
    "no. of nodes in",

]

data = pd.read_csv(urldata,names=col_names)
data.head()


Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,special forms,dislocation of,exclusion of no,no. of nodes in
3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


#### Define labels and train-test split


In [3]:


data.loc[(data["class"] == 1) | (data["class"] == 2), "class"] = 0
data.loc[data["class"] == 3, "class"] = 1
data.loc[data["class"] == 4, "class"] = 2
data["class"] = data["class"].astype(int)

#### Train test split


In [4]:
X = data.loc[:, data.columns != "class"]
y = data.loc[:, data.columns == "class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33,  random_state=42
)

#### Define feature selector step 


In [5]:
shap_feature_selector_factory = (
    ShapFeatureSelector.shap_feature_selector_factory.set_model_params(
        X=X_train,
        y=y_train,
        verbose=0,
        random_state=0,
        estimator=xgboost.XGBClassifier(),
        estimator_params={
            "max_depth": [4, 5],
        },
        fit_params = {
            "sample_weight": None,
        },
        method="gridsearch",
        # if n_features=None only the threshold will be considered as a cut-off of features grades.
        # if threshold=None only n_features will be considered to select the top n features.
        # if both of them are set to some values, the threshold has the priority for selecting features.
        n_features=3,
        threshold = 0.4,
        list_of_obligatory_features_that_must_be_in_model=["defect in node"],
        list_of_features_to_drop_before_any_selection=["bl. of lymph. c"],
    )
    .set_shap_params(
        model_output="raw",
        feature_perturbation="interventional",
        algorithm="v2",
        shap_n_jobs=-1,
        memory_tolerance=-1,
        feature_names=None,
        approximate=False,
        shortcut=False,
    )
    .set_gridsearchcv_params(
        measure_of_accuracy=make_scorer(f1_score, greater_is_better=True, average='macro'),
        verbose=10,
        n_jobs=-1,
        cv=KFold(2),
    )
)

#### Find feature type for later use


In [6]:
int_cols = X_train.select_dtypes(include=["int"]).columns.tolist()
print(int_cols)

['lymphatics', 'block of affere', 'bl. of lymph. c', 'bl. of lymph. s', 'by pass', 'extravasates', 'regeneration of', 'early uptake in', 'lym.nodes dimin', 'lym.nodes enlar', 'changes in lym.', 'defect in node', 'changes in node', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in']


#### Define pipeline

In [7]:
pipeline = Pipeline(
    [
        # int missing values imputers
        (
            "intimputer",
            MeanMedianImputer(imputation_method="median", variables=int_cols),
        ),
        ("sfsf", shap_feature_selector_factory),
        # classification model
        ("logistic", LogisticRegression()),
    ]
)

#### Run Pipeline


In [8]:
pipeline.fit(X_train, y_train.values.ravel())
y_pred = pipeline.predict(X_test)


2023-02-24 18:25:19,411 :: root :: Building Best Estimator by GridSearchCV !
2023-02-24 18:25:19,413 :: dev :: The optimization will be based on make_scorer(f1_score, average=macro) metric!
2023-02-24 18:25:19,413 :: dev :: The optimization will be based on make_scorer(f1_score, average=macro) metric!
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2; 1/2] START max_depth=4.................................................
[CV 1/2; 1/2] END ..................max_depth=4;, score=0.461 total time=   0.3s
[CV 2/2; 1/2] START max_depth=4.................................................
[CV 2/2; 1/2] END ..................max_depth=4;, score=0.374 total time=   0.3s
[CV 1/2; 2/2] START max_depth=5.................................................
[CV 1/2; 2/2] END ..................max_depth=5;, score=0.427 total time=   0.3s
[CV 2/2; 2/2] START max_depth=5.................................................
[CV 2/2; 2/2] END ..................max_depth=5;, score=0.423 total tim

#### Check performance of the Pipeline


In [9]:

print("F1 score : ")
print(f1_score(y_test, y_pred,average='micro'))
print("Classification report : ")
print(classification_report(y_test, y_pred))
print("Confusion matrix : ")
print(confusion_matrix(y_test, y_pred))



F1 score : 
0.3877551020408163
Classification report : 
              precision    recall  f1-score   support

           0       0.45      0.71      0.56        21
           1       0.27      0.29      0.28        14
           2       0.00      0.00      0.00        14

    accuracy                           0.39        49
   macro avg       0.24      0.33      0.28        49
weighted avg       0.27      0.39      0.32        49

Confusion matrix : 
[[15  6  0]
 [ 9  4  1]
 [ 9  5  0]]


#### Get access to feature selector instance

In [10]:
print(ShapFeatureSelector.shap_feature_selector_factory.get_feature_selector_instance())

<fasttreeshap.explainers._tree.Tree object at 0x1092c02b0>
