### Imports

In [1]:
from zoish.feature_selectors.feature_selectors import SingleFeaturePerformanceFeatureSelector
import xgboost
from sklearn.model_selection import KFold, train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score,make_scorer
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from category_encoders import OrdinalEncoder


#### Example: Use Adult Data Set (a classification problem)
###### https://archive.ics.uci.edu/ml/datasets/Adult


#### Read data

In [2]:
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# column names
col_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label",
]
# read data
data = pd.read_csv(urldata, header=None, names=col_names, sep=",")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Define labels and train-test split


In [3]:
data.loc[data["label"] == "<=50K", "label"] = 0
data.loc[data["label"] == " <=50K", "label"] = 0

data.loc[data["label"] == ">50K", "label"] = 1
data.loc[data["label"] == " >50K", "label"] = 1

data["label"] = data["label"].astype(int)

# # Train test split

X = data.loc[:, data.columns != "label"]
y = data.loc[:, data.columns == "label"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y["label"], random_state=42
)


#### Define feature selector step 

In [4]:

single_feature_performance_feature_selector_factory = (
    SingleFeaturePerformanceFeatureSelector.single_feature_performance_feature_selector_factory.set_model_params(
        X=X_train,
        y=y_train,
        verbose=0,
        random_state=0,
        estimator=xgboost.XGBClassifier(),
        estimator_params={
            "max_depth": [4, 5],
        },
        method="randomsearch",
        n_features=5,
        list_of_obligatory_features_that_must_be_in_model=[],
        list_of_features_to_drop_before_any_selection=[],
    )
    .set_single_feature_params(
        threshold=0.6,
        cv=3,
        variables=None,
        scoring='roc_auc',
        
    )
    .set_randomsearchcv_params(
        measure_of_accuracy=make_scorer(f1_score, greater_is_better=True, average='macro'),
        verbose=0,
        n_jobs=-1,
        cv=KFold(5),
        n_iter=3,
    )
)

#### Find feature type for later use


In [5]:
int_cols = X_train.select_dtypes(include=["int"]).columns.tolist()
float_cols = X_train.select_dtypes(include=["float"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()


#### Define pipeline

In [6]:
pipeline = Pipeline(
    [
        # int missing values imputers
        (
            "intimputer",
            MeanMedianImputer(imputation_method="median", variables=int_cols),
        ),
        # category missing values imputers
        ("catimputer", CategoricalImputer(variables=cat_cols)),
        #
        ("catencoder", OrdinalEncoder()),
        # feature selection
        ("sfpfsf", single_feature_performance_feature_selector_factory),
        # classification model
        ("SVC", SVC()),
    ]
)

#### Run Pipeline

In [7]:
pipeline.fit(X_train, y_train.values.ravel())
y_pred = pipeline.predict(X_test)

Building Best Estimator


The total space of parameters 2 is smaller than n_iter=3. Running 2 iterations. For exhaustive searches, use GridSearchCV.
The total space of parameters 2 is smaller than n_iter=3. Running 2 iterations. For exhaustive searches, use GridSearchCV.
Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "/Users/hjavedani/Documents/zoish/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 358, in _score
    y_pred = method_caller(clf, "decision_function", X)
  File "/Users/hjavedani/Documents/zoish/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
AttributeError: 'RandomBestEstimator' object has no attribute 'decision_function'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/hjavedani/Documents/zoish/.venv/lib/python3.1

                 0               1
0              age             age
1        workclass       workclass
2           fnlwgt          fnlwgt
3        education       education
4    education-num   education-num
5   marital-status  marital-status
6       occupation      occupation
7     relationship    relationship
8             race            race
9              sex             sex
10    capital-gain    capital-gain
11    capital-loss    capital-loss
12  hours-per-week  hours-per-week
13  native-country  native-country
age
age


TypeError: '>=' not supported between instances of 'str' and 'float'

#### Check performance of the Pipeline

In [None]:

print("F1 score : ")
print(f1_score(y_test, y_pred))
print("Classification report : ")
print(classification_report(y_test, y_pred))
print("Confusion matrix : ")
print(confusion_matrix(y_test, y_pred))


#### Plot summary plot for selected features

In [None]:
ShapFeatureSelector.shap_feature_selector_factory.plot_features_all(
    type_of_plot="summary_plot",
    path_to_save_plot="../plots/random_search_classification_3_summary_plot"
)


#### Plot summary plot for full features

In [None]:
ShapFeatureSelector.shap_feature_selector_factory.plot_features_all(
    type_of_plot="summary_plot_full",
    path_to_save_plot="../plots/random_search_classification_3_summary_plot_full"
)


#### Bar plot for selected features

In [None]:
ShapFeatureSelector.shap_feature_selector_factory.plot_features_all(
    type_of_plot="bar_plot",
    path_to_save_plot="../plots/random_search_classification_3_bar_plot"
)

#### Bar plot for all features

In [None]:
ShapFeatureSelector.shap_feature_selector_factory.plot_features_all(
    type_of_plot="bar_plot_full",
    path_to_save_plot="../plots/random_search_classification_3_bar_plot_full"
)

#### Decision plot for selected features (not more than samples can be used !)

In [None]:
ShapFeatureSelector.shap_feature_selector_factory.plot_features_all(
    type_of_plot="decision_plot",
    path_to_save_plot="../plots/random_search_classification_3_decision_plot"
)

#### Get list of features and grades

In [None]:
ShapFeatureSelector.shap_feature_selector_factory.get_list_of_features_and_grades()