#### Imports


In [1]:
from zoish.feature_selectors.single_feature_selectors import SingleFeaturePerformanceFeatureSelector
import xgboost
from sklearn.model_selection import KFold, train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score,make_scorer
from feature_engine.imputation import  MeanMedianImputer


In this module the default logging will be applied.


#### Example: Audiology (Standardized) Data Set
###### https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29


#### Read data


In [2]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data"
urlname = "https://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.names"
# column names
col_names = [
    "class",
    "lymphatics",
    "block of affere",
    "bl. of lymph. c",
    "bl. of lymph. s",
    "by pass",
    "extravasates",
    "regeneration of",
    "early uptake in",
    "lym.nodes dimin",
    "lym.nodes enlar",
    "changes in lym.",
    "defect in node",
    "changes in node",
    "special forms",
    "dislocation of",
    "exclusion of no",
    "no. of nodes in",

]

data = pd.read_csv(urldata,names=col_names)
data.head()


Unnamed: 0,class,lymphatics,block of affere,bl. of lymph. c,bl. of lymph. s,by pass,extravasates,regeneration of,early uptake in,lym.nodes dimin,lym.nodes enlar,changes in lym.,defect in node,changes in node,special forms,dislocation of,exclusion of no,no. of nodes in
3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1


#### Define labels and train-test split


In [3]:


data.loc[(data["class"] == 1) | (data["class"] == 2), "class"] = 0
data.loc[data["class"] == 3, "class"] = 1
data.loc[data["class"] == 4, "class"] = 2
data["class"] = data["class"].astype(int)

#### Train test split


In [4]:
X = data.loc[:, data.columns != "class"]
y = data.loc[:, data.columns == "class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33,  random_state=42
)

#### Define feature selector step 


In [5]:

single_feature_performance_feature_selector_factory = (
    SingleFeaturePerformanceFeatureSelector.single_feature_performance_feature_selector_factory.set_model_params(
        X=X_train,
        y=y_train,
        verbose=0,
        random_state=0,
        estimator=xgboost.XGBClassifier(),
        estimator_params={
            "max_depth": [4, 5],
        },
        fit_params = {
            "sample_weight": None,
        },
        method="gridsearch",
        n_features=5,
        threshold=0.3,
        list_of_obligatory_features_that_must_be_in_model=[],
        list_of_features_to_drop_before_any_selection=[],
    )
    .set_single_feature_params(
        cv=3,
        variables=None,
        scoring='roc_auc_ovr',
        confirm_variables=False,
        
    )
    .set_gridsearchcv_params(
        measure_of_accuracy=make_scorer(f1_score, greater_is_better=True, average='macro'),
        verbose=10,
        n_jobs=-1,
        cv=KFold(3),
    )
)

#### Find feature type for later use


In [6]:
int_cols = X_train.select_dtypes(include=["int"]).columns.tolist()
print(int_cols)

['lymphatics', 'block of affere', 'bl. of lymph. c', 'bl. of lymph. s', 'by pass', 'extravasates', 'regeneration of', 'early uptake in', 'lym.nodes dimin', 'lym.nodes enlar', 'changes in lym.', 'defect in node', 'changes in node', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in']


#### Define pipeline

In [7]:
pipeline = Pipeline(
    [
        # int missing values imputers
        (
            "intimputer",
            MeanMedianImputer(imputation_method="median", variables=int_cols),
        ),
        ("sfpfsf", single_feature_performance_feature_selector_factory),
        # classification model
        ("logistic", LogisticRegression(solver='liblinear',max_iter=100)),
    ]
)

#### Run Pipeline


In [8]:
pipeline.fit(X_train, y_train.values.ravel())
y_pred = pipeline.predict(X_test)


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3; 1/2] START max_depth=4.................................................
[CV 1/3; 1/2] END ..................max_depth=4;, score=0.475 total time=   0.4s
[CV 2/3; 1/2] START max_depth=4.................................................
[CV 2/3; 1/2] END ..................max_depth=4;, score=0.259 total time=   0.5s
[CV 3/3; 1/2] START max_depth=4.................................................
[CV 3/3; 1/2] END ..................max_depth=4;, score=0.371 total time=   0.3s
[CV 1/3; 2/2] START max_depth=5.................................................
[CV 1/3; 2/2] END ..................max_depth=5;, score=0.496 total time=   0.3s
[CV 2/3; 2/2] START max_depth=5.................................................
[CV 2/3; 2/2] END ..................max_depth=5;, score=0.209 total time=   0.3s
[CV 3/3; 2/2] START max_depth=5.................................................
[CV 3/3; 2/2] END ..................max_depth=5;, 

#### Check performance of the Pipeline


In [9]:

print("F1 score : ")
print(f1_score(y_test, y_pred,average='micro'))
print("Classification report : ")
print(classification_report(y_test, y_pred))
print("Confusion matrix : ")
print(confusion_matrix(y_test, y_pred))



F1 score : 
0.36734693877551017
Classification report : 
              precision    recall  f1-score   support

           0       0.43      0.62      0.51        21
           1       0.29      0.36      0.32        14
           2       0.00      0.00      0.00        14

    accuracy                           0.37        49
   macro avg       0.24      0.33      0.28        49
weighted avg       0.27      0.37      0.31        49

Confusion matrix : 
[[13  8  0]
 [ 7  5  2]
 [10  4  0]]


#### Get access to feature selector instance

In [10]:
print(SingleFeaturePerformanceFeatureSelector.single_feature_performance_feature_selector_factory.get_feature_selector_instance())

SelectBySingleFeaturePerformance(cv=KFold(n_splits=3, random_state=None, shuffle=False),
                                 estimator=XGBClassifier(base_score=0.5,
                                                         booster='gbtree',
                                                         callbacks=None,
                                                         colsample_bylevel=1,
                                                         colsample_bynode=1,
                                                         colsample_bytree=1,
                                                         early_stopping_rounds=None,
                                                         enable_categorical=False,
                                                         eval_metric=None,
                                                         gamma=0, gpu_id=-1,
                                                         grow_policy='depthwise',
                                                        

#### Get list of features and grades

In [11]:
SingleFeaturePerformanceFeatureSelector.single_feature_performance_feature_selector_factory.get_info_of_features_and_grades()

list of selected features+list of obligatory features that must be in                 model-list of features to drop before any selection               ['dislocation of', 'defect in node', 'bl. of lymph. c', 'lym.nodes dimin', 'block of affere', 'extravasates', 'early uptake in', 'by pass', 'lymphatics', 'regeneration of', 'no. of nodes in', 'changes in node', 'exclusion of no', 'bl. of lymph. s', 'lym.nodes enlar', 'changes in lym.', 'special forms']
list of selected features and their grades
---------------------------------------------------------
        column_name feature_importance
12  changes in node           0.665388
3   bl. of lymph. s           0.581585
14   dislocation of           0.578844
8   lym.nodes dimin           0.576275
9   lym.nodes enlar           0.526024
6   regeneration of           0.522119
4           by pass           0.518349
5      extravasates           0.513608
2   bl. of lymph. c           0.506623
7   early uptake in            0.50514
10  changes in