#### Imports


In [2]:
from zoish.feature_selectors.feature_selectors import ShapFeatureSelector
import pandas as pd
import xgboost
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer

#### Example: Audiology (Standardized) Data Set
###### https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29


#### Read data


In [3]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data"
urlname = "https://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.names"
# column names
col_names = [
    "class",
    "lymphatics",
    "block of affere",
    "bl. of lymph. c",
    "bl. of lymph. s",
    "by pass",
    "extravasates",
    "regeneration of",
    "early uptake in",
    "lym.nodes dimin",
    "lym.nodes enlar",
    "changes in lym.",
    "defect in node",
    "changes in node",
    "special forms",
    "dislocation of",
    "exclusion of no",
    "no. of nodes in",

]

data = pd.read_csv(urldata,names=col_names)
data.head()


Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,special forms,dislocation of,exclusion of no,no. of nodes in
3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


#### Define labels and train-test split


In [4]:


data.loc[(data["class"] == 1) | (data["class"] == 2), "class"] = 0
data.loc[data["class"] == 3, "class"] = 1
data.loc[data["class"] == 4, "class"] = 2
data["class"] = data["class"].astype(int)

#### Train test split


In [5]:
X = data.loc[:, data.columns != "class"]
y = data.loc[:, data.columns == "class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33,  random_state=42
)

#### Define feature selector step 


In [6]:
shap_feature_selector_factory = (
    ShapFeatureSelector.shap_feature_selector_factory.set_model_params(
        X=X_train,
        y=y_train,
        verbose=0,
        random_state=0,
        estimator=xgboost.XGBClassifier(),
        estimator_params={
            "max_depth": [4, 5],
        },
        method="gridsearch",
        n_features=5,
        cut_of = None,
        list_of_obligatory_features_that_must_be_in_model=["defect in node"],
        list_of_features_to_drop_before_any_selection=["bl. of lymph. c"],
    )
    .set_shap_params(
        model_output="raw",
        feature_perturbation="interventional",
        algorithm="v2",
        shap_n_jobs=-1,
        memory_tolerance=-1,
        feature_names=None,
        approximate=False,
        shortcut=False,
    )
    .set_gridsearchcv_params(
        measure_of_accuracy=make_scorer(f1_score, greater_is_better=True, average='macro'),
        verbose=0,
        n_jobs=-1,
        cv=KFold(2),
    )
)

#### Find feature type for later use


In [7]:
int_cols = X_train.select_dtypes(include=["int"]).columns.tolist()
print(int_cols)

['lymphatics', 'block of affere', 'bl. of lymph. c', 'bl. of lymph. s', 'by pass', 'extravasates', 'regeneration of', 'early uptake in', 'lym.nodes dimin', 'lym.nodes enlar', 'changes in lym.', 'defect in node', 'changes in node', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in']


#### Define pipeline

In [8]:
pipeline = Pipeline(
    [
        # int missing values imputers
        (
            "intimputer",
            MeanMedianImputer(imputation_method="median", variables=int_cols),
        ),
        ("sfsf", shap_feature_selector_factory),
        # classification model
        ("logistic", LogisticRegression()),
    ]
)

#### Run Pipeline


In [9]:
pipeline.fit(X_train, y_train.values.ravel())
y_pred = pipeline.predict(X_test)


Building Best Estimator
                  0                                                  1
0        lymphatics  [0.2520607202040444, 0.1458518424614141, 0.235...
1   block of affere  [0.10655192131088616, 0.08087731446017737, 0.1...
2   bl. of lymph. c  [0.0, 0.0048165648469791775, 0.004563329406488...
3   bl. of lymph. s  [0.3640460234827288, 0.06843419038361111, 0.35...
4           by pass  [0.19803925344705653, 0.20754841122146475, 0.0...
5      extravasates   [0.0, 0.09833088283457457, 0.007779867161579489]
6   regeneration of  [0.12402549506548245, 0.08728508520392131, 0.1...
7   early uptake in    [0.09216977365799979, 0.04908975426161517, 0.0]
8   lym.nodes dimin  [0.5202825567235538, 0.20056062756644952, 0.32...
9   lym.nodes enlar  [0.24750754111067125, 0.5159602719673956, 0.25...
10  changes in lym.  [0.43591483647980095, 0.554517269601627, 0.258...
11   defect in node  [0.020108941741427695, 0.34144967942838406, 0....
12  changes in node  [0.5382229842283089, 0.881209564

#### Check performance of the Pipeline


In [10]:

print("F1 score : ")
print(f1_score(y_test, y_pred,average='micro'))
print("Classification report : ")
print(classification_report(y_test, y_pred))
print("Confusion matrix : ")
print(confusion_matrix(y_test, y_pred))



F1 score : 
0.42857142857142855
Classification report : 
              precision    recall  f1-score   support

           0       0.45      0.62      0.52        21
           1       0.40      0.43      0.41        14
           2       0.40      0.14      0.21        14

    accuracy                           0.43        49
   macro avg       0.42      0.40      0.38        49
weighted avg       0.42      0.43      0.40        49

Confusion matrix : 
[[13  6  2]
 [ 7  6  1]
 [ 9  3  2]]
