#### Install required libraries


In [1]:
! pip install git+https://github.com/TorkamaniLab/zoish.git 
! pip install feature-engine category-encoders scikit-learn ipywidgets numpy pandas xgboost --force-reinstall

[0mProcessing /Users/hjavedani/Documents/zoish
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: zoish
  Building wheel for zoish (setup.py) ... [?25ldone
[?25h  Created wheel for zoish: filename=zoish-5.0.3-py3-none-any.whl size=36370 sha256=c622ced7ce5e8f8976a20c1d64ae19b5894d7c075d45fa2daca812aad5f62532
  Stored in directory: /private/var/folders/v1/xbcjnd1x5rn7ct1m_rnsblk80000gp/T/pip-ephem-wheel-cache-bxdxfb6r/wheels/0d/32/30/eb3f109eb50e943841e5dc2a87f7bec30f99b39562aff8e67a
Successfully built zoish
[0m[33mDEPRECATION: xgbse 0.2.3 has a non-standard dependency specifier pandas>=1.0.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of xgbse or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: zoish
  Attempting uninstall: zoish


In [2]:
# Importing core libraries (standard library and third-party packages)
import pandas as pd  # For data manipulation and analysis
import sys  # For accessing system-specific parameters and functions
import zoish  # Feature-selection library demonstrated in this notebook
import sklearn  # For machine learning models
import xgboost  # For gradient-boosted decision trees
import numpy  as np # For numerical computations
from sklearn.datasets import make_classification, make_regression
# Importing scikit-learn utilities for various ML tasks
from sklearn.compose import ColumnTransformer  # For applying transformers to columns
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.impute import SimpleImputer  # For handling missing data
from sklearn.metrics import (  # For evaluating the model
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
)
from sklearn.model_selection import GridSearchCV, train_test_split  # For CV and splitting dataset
from sklearn.pipeline import Pipeline  # For creating ML pipelines
from sklearn.preprocessing import StandardScaler  # For feature scaling

from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    ExtraTreesClassifier,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from zoish.feature_selectors.shap_selectors import ShapFeatureSelector
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import KFold

# Importing other third-party libraries
from category_encoders import TargetEncoder  # For encoding categorical variables
from xgboost import XGBClassifier  # XGBoost classifier
from zoish.feature_selectors.shap_selectors import (  # For feature selection and visualization
    ShapFeatureSelector,
    ShapPlotFeatures,
)
import logging  # For logging events and errors

from zoish import logger  # Logger object exported by the zoish package

# Importing feature imputation library
from feature_engine.imputation import MeanMedianImputer  # For imputing mean/median

# Configure logging once, after all imports: only ERROR and above are emitted
# through the zoish logger.
logger.setLevel(logging.ERROR)

RANDOM_SEED = 42  # Random seed for reproducibility

# Printing versions of key libraries for debugging and documentation
print(f'Python version : {sys.version}')
print(f'zoish version : {zoish.__version__}')
print(f'sklearn version : {sklearn.__version__}')
print(f'pandas version : {pd.__version__}')  # Using the alias for pandas
print(f'numpy version : {np.__version__}')
print(f'xgboost version : {xgboost.__version__}')


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


Python version : 3.7.8 (default, Feb 27 2023, 18:11:31) 
[Clang 14.0.0 (clang-1400.0.29.202)]
zoish version : 5.0.3
sklearn version : 1.0.2
pandas version : 1.3.5
numpy version : 1.21.6
xgboost version : 1.6.2


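#### Example: Synthetic binary-classification data set (generated with scikit-learn's `make_classification`)
###### Note: the data below is synthetic; the Audiology (Standardized) data set named in the original heading is described at https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29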


#### Generate data


In [3]:
def binary_classification_dataset():
    """Build the synthetic binary-classification data used in this notebook.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is a DataFrame with columns
        ``feature_0`` ... ``feature_9`` (100 rows generated by
        ``make_classification``) and ``target`` is the matching label array.
        Gaussian noise (mean 0, std 1) is added to the first half of the
        columns before the DataFrame is built.

    Notes
    -----
    Seeds NumPy's *global* random state with ``RANDOM_SEED`` as a side effect.
    """
    # Seed the global NumPy RNG before the noise is drawn below.
    np.random.seed(RANDOM_SEED)

    features, target = make_classification(
        n_samples=100,
        n_features=10,
        n_informative=2,
        n_redundant=5,
        n_classes=2,
        random_state=RANDOM_SEED,
    )

    # Perturb the first 50% of the columns with standard-normal noise.
    n_noisy = int(features.shape[1] * 0.5)
    noise = np.random.normal(0, 1, (features.shape[0], n_noisy))
    features[:, :n_noisy] += noise

    column_names = [f"feature_{i}" for i in range(10)]
    return pd.DataFrame(features, columns=column_names), target

In [4]:

# Candidate binary classifiers, all seeded with RANDOM_SEED.
classifiers_binery = [
    RandomForestClassifier(n_estimators=10, random_state=RANDOM_SEED),
    ExtraTreesClassifier(n_estimators=10, random_state=RANDOM_SEED),
    # GradientBoostingClassifier(random_state=RANDOM_SEED),
    # ^ Optional; note that GradientBoostingClassifier is not imported in the
    #   import cell of this notebook, so import it before uncommenting.
    DecisionTreeClassifier(random_state=RANDOM_SEED),
    XGBClassifier(
        random_state=RANDOM_SEED,
        # Use 100% of the features in each tree
        colsample_bytree=1,
        # Use 100% of the features at each level of the tree
        colsample_bylevel=1,
        # Use 100% of the data (rows) in each tree
        subsample=1,
    ),
    LGBMClassifier(
        random_state=RANDOM_SEED,
        # Use 100% of the features in each tree
        colsample_bytree=1,
        # Use 100% of the data (rows) in each tree
        subsample=1,
    ),
    CatBoostClassifier(
        silent=True,
        thread_count=1,
        random_seed=RANDOM_SEED,
        # Use 100% of the features at each level of the tree
        colsample_bylevel=1,
        # Use 100% of the data (rows) in each tree
        subsample=1,
        bootstrap_type="Bernoulli",
    ),
]


In [5]:
def test_shap_feature_selector_binary_classification(
    model, binary_classification_dataset
):
    """Check that SHAP-based selection keeps half the features without hurting the score.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score``
        Fitted in place: first on all features, then again on the selected
        subset.
    binary_classification_dataset : tuple
        ``(X, y)`` pair; ``X`` must expose ``shape``.

    Raises
    ------
    AssertionError
        If the selector does not return exactly half of the columns, or if the
        training-set score on the selected columns is more than 0.1 below the
        training-set score on all columns.
    """
    X, y = binary_classification_dataset
    n_selected = int(X.shape[1] * 0.5)  # Select top 50% features

    model.fit(X, y)
    # Record the baseline right after the all-features fit, before the
    # estimator is handed to the selector, so the comparison below does not
    # depend on what the selector does with the estimator.
    original_score = model.score(X, y)

    selector = ShapFeatureSelector(
        model,
        num_features=n_selected,
        n_iter=5,
        direction="maximum",
        scoring="f1",
        # Seed the shuffle so the CV folds are identical on every run.
        cv=KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED),
    )
    selector.fit(X, y)
    X_transformed = selector.transform(X)
    assert X_transformed.shape[1] == n_selected, (
        f"expected {n_selected} selected features, got {X_transformed.shape[1]}"
    )

    # Refit on the reduced feature set and compare training-set scores.
    model.fit(X_transformed, y)
    selected_features_score = model.score(X_transformed, y)
    assert selected_features_score >= original_score - 0.1, (
        f"score dropped from {original_score} to {selected_features_score}"
    )


#### Check that performance is preserved after feature selection (LightGBM)

In [6]:
# Run the selection check with a default LightGBM classifier on a freshly
# generated synthetic data set.
test_shap_feature_selector_binary_classification(
    model=LGBMClassifier(),
    binary_classification_dataset=binary_classification_dataset(),
)

ERROR:zoish.feature_selectors.shap_selectors:Attempt 1: Shap TreeExplainer could not be used: argument of type 'NoneType' is not iterable
ERROR:zoish.feature_selectors.shap_selectors:Both TreeExplainer and KernelExplainer failed: predict() argument after ** must be a mapping, not NoneType


[LightGBM] [Info] Number of positive: 50, number of negative: 50
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 350
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Provided model function fails when applied to the provided data set.


TypeError: predict() argument after ** must be a mapping, not NoneType

In [None]:

# Print evaluation metrics for held-out predictions.
# NOTE(review): `y_test` and `y_test_pred` are not assigned in any cell visible
# in this notebook; they must be defined (true labels and model predictions)
# before this cell can run.
print("F1 score : ")
print(f1_score(y_test, y_test_pred,average='micro'))
print("Classification report : ")
print(classification_report(y_test, y_test_pred))
print("Confusion matrix : ")
print(confusion_matrix(y_test, y_test_pred))



#### SHAP-related plots

In [None]:

# Plot the feature importance
# NOTE(review): `shap_feature_selector` is not assigned in any cell visible in
# this notebook; a fitted ShapFeatureSelector must be bound to that name first.
plot_factory = ShapPlotFeatures(shap_feature_selector) 
plot_factory.summary_plot()


In [None]:
# Summary plot, "full" variant. Requires `plot_factory` from the cell that
# constructs ShapPlotFeatures.
plot_factory.summary_plot_full()

In [None]:

# Plot the feature importance as a bar chart. Requires `plot_factory` from the
# cell that constructs ShapPlotFeatures.
plot_factory.bar_plot()

In [None]:
# Bar plot, "full" variant. Requires `plot_factory` from the cell that
# constructs ShapPlotFeatures.
plot_factory.bar_plot_full()

In [None]:
# Dependence plot for a single feature.
# NOTE(review): the synthetic data set built in this notebook has columns
# feature_0 ... feature_9; 'special forms' is not one of them — confirm the
# intended feature name.
plot_factory.dependence_plot('special forms')

#### Feature importance data frame

In [None]:
# Display the feature-importance data frame.
# NOTE(review): `feature_selection` is not assigned in any cell visible in this
# notebook — confirm which object is meant to provide `importance_df`.
feature_selection.importance_df

In [None]:
# name of features
# NOTE(review): `X_train` is not assigned in any cell visible in this notebook;
# no train/test split is performed in the cells shown.
X_train.columns