#### Install required libraries


In [8]:
! pip install zoish
! pip install feature-engine gpboost category-encoders scikit-learn ipywidgets numpy pandas --force-reinstall

Collecting zoish==4.7.0
  Using cached zoish-4.7.0-py3-none-any.whl.metadata (13 kB)
Collecting jedi==0.19.0 (from zoish==4.7.0)
  Using cached jedi-0.19.0-py2.py3-none-any.whl.metadata (22 kB)
Collecting packaging==23.1 (from zoish==4.7.0)
  Using cached packaging-23.1-py3-none-any.whl (48 kB)
Collecting prompt-toolkit==3.0.39 (from zoish==4.7.0)
  Using cached prompt_toolkit-3.0.39-py3-none-any.whl.metadata (6.4 kB)
Collecting Pygments==2.16.1 (from zoish==4.7.0)
  Using cached Pygments-2.16.1-py3-none-any.whl.metadata (2.5 kB)
Collecting virtualenv==20.24.4 (from zoish==4.7.0)
  Using cached virtualenv-20.24.4-py3-none-any.whl.metadata (4.5 kB)
Collecting wcwidth==0.2.6 (from zoish==4.7.0)
  Using cached wcwidth-0.2.6-py2.py3-none-any.whl (29 kB)
INFO: pip is looking at multiple versions of zoish to determine which version is compatible with other requirements. This could take a while.
ERROR: Cannot install zoish==4.7.0 because these package versions have conflicting dependencies.

In [6]:
# Importing built-in libraries
import pandas as pd  # For data manipulation and analysis
import sys  # For accessing system-specific parameters and functions
import zoish  # Assuming it's a custom library for your project
import sklearn  # For machine learning models
import numpy  # For numerical computations
import gpboost

# Importing scikit-learn utilities for various ML tasks
from sklearn.compose import ColumnTransformer  # For applying transformers to columns
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.impute import SimpleImputer  # For handling missing data
from sklearn.metrics import (  # For evaluating the model
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
)
from sklearn.model_selection import GridSearchCV, train_test_split  # For CV and splitting dataset
from sklearn.pipeline import Pipeline  # For creating ML pipelines
from sklearn.preprocessing import StandardScaler  # For feature scaling

# Importing other third-party libraries
from category_encoders import TargetEncoder  # For encoding categorical variables
from zoish.feature_selectors.shap_selectors import (  # For feature selection and visualization
    ShapFeatureSelector,
    ShapPlotFeatures,
)
import logging  # For logging events and errors

# Configuring logging settings
from zoish import logger  # Assuming it's a custom logger from zoish
logger.setLevel(logging.ERROR)  # Set logging level to ERROR

# Importing feature imputation library
from feature_engine.imputation import MeanMedianImputer  # For imputing mean/median

# Apply the ERROR threshold to the zoish logger once more; the same call was
# already made earlier in this cell, before the feature_engine import.
logger.setLevel(logging.ERROR)

# Report the interpreter and library versions, one line per component, so the
# environment that produced the outputs below is recorded with the notebook.
print(
    f'Python version : {sys.version}',
    f'zoish version : {zoish.__version__}',
    f'sklearn version : {sklearn.__version__}',
    f'pandas version : {pd.__version__}',  # Using the alias for pandas
    f'numpy version : {numpy.__version__}',
    f'gpboost version : {gpboost.__version__}',
    sep='\n',
)


Python version : 3.7.17 (default, Aug 31 2023, 09:57:32) 
[Clang 14.0.3 (clang-1403.0.22.14.1)]
zoish version : 4.6.0
sklearn version : 1.0.2
pandas version : 1.3.5
numpy version : 1.21.6
gpboost version : 1.2.6


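#### Example: synthetic grouped (mixed-effects) classification data
###### Note: the Audiology (Standardized) data set (https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29) is not loaded in this notebook; the data are generated with `make_classification` below.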


#### Generate data


In [10]:
import gpboost as gpb
import numpy as np
from sklearn.datasets import make_classification

# Seeded generator for the group assignment and the group effects, so that a
# fresh "Restart & Run All" reproduces exactly the same data set.
group_rng = np.random.default_rng(42)

# Generate classification data for mixed effect model
X, y = make_classification(n_samples=100, n_features=10, n_informative=2, n_redundant=0, random_state=42)
X = pd.DataFrame(X)

# Assign every sample to one of `n_groups` groups
n_groups = 5 # Reduced number of groups for simplicity
groups = group_rng.choice(n_groups, size=X.shape[0])

# Draw one effect per group from N(0, 1) and look it up for every sample
group_effects = group_rng.normal(0, 1, n_groups)
random_effects = group_effects[groups]

# Shift the 0/1 labels by the per-sample group effect and re-binarise at zero
y = np.where(y + random_effects > 0, 1, 0)

#### Defining the feature pipeline steps:
Here, we use an untuned gpb.GPBoostClassifier model with the ShapFeatureSelector. In the next section, we will repeat the same process with a tuned estimator (an XGBClassifier selected by GridSearchCV). The aim is to demonstrate that a better estimator can yield improved results when used with the ShapFeatureSelector.


In [13]:
from sklearn.model_selection import KFold


def build_gpboost_classifier(group_data):
    """Return an unfitted, untuned binary GPBoostClassifier.

    Parameters
    ----------
    group_data : array-like
        Group label of every training row, forwarded unchanged as the
        ``group_data`` keyword of ``gpb.GPBoostClassifier``.
        NOTE(review): confirm that the scikit-learn wrapper honours this
        keyword when it is given to the constructor.
    """
    return gpb.GPBoostClassifier(
        boosting_type='gbdt',
        objective='binary',  # 'binary' is for binary classification
        n_estimators=100,  # Equivalent to num_boost_round in gpboost.train
        group_data=group_data,  # Pass the groups for random effects
    )


# Split features, labels and group labels in one call so the rows of
# X_train / y_train / groups_train stay aligned with each other.
X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
    X, y, groups, test_size=0.33, random_state=42
)

# Estimator handed to the SHAP-based feature selector; it is fitted on the
# training split only.
model = build_gpboost_classifier(groups_train)
model.fit(X_train, y_train)
# NOTE(review): the output recorded under this cell shows SHAP's TreeExplainer
# rejecting GPBoostClassifier (InvalidModelError); that failure is not addressed here.
shap_feature_selector = ShapFeatureSelector(
        model,
        n_iter=10,
        scoring="f1",
        direction="maximum",
        # Seeded so the shuffled folds are the same on every run
        cv=KFold(n_splits=2, shuffle=True, random_state=42),
        # for gpboost this should be False
        use_faster_algorithm=False,
        threshold=0.01,
        shap_fast_tree_explainer_kwargs={'algorithm':'v2'}
)


# Define pre-processing for numeric columns (float and integer types)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Define pre-processing for categorical features
categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder(handle_missing='return_nan'))])

# Combine preprocessing into one column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Feature Selection using ShapSelector
feature_selection = shap_feature_selector

# Classifier model (a second, separate instance from the selector's estimator)
classifier = build_gpboost_classifier(groups_train)

# Create a pipeline that combines the preprocessor with a feature selection and a classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('feature_selection', feature_selection),
                           ('classifier', classifier)])

# Fit on the training split only, so the test split stays unseen
pipeline.fit(X_train, y_train)

# Predict on test data; the evaluation cell below reads `y_test_pred`
y_test_pred = pipeline.predict(X_test)

# Output first 10 predictions
print(y_test_pred[:10])

ERROR:zoish.feature_selectors.shap_selectors:Shap TreeExplainer could not be used: Model type not yet supported by TreeExplainer: <class 'gpboost.sklearn.GPBoostClassifier'>


InvalidModelError: Model type not yet supported by TreeExplainer: <class 'gpboost.sklearn.GPBoostClassifier'>

#### Check performance of the Pipeline


In [None]:

# Compare y_test with y_test_pred: micro-averaged F1, the classification
# report and the confusion matrix, each printed under its own label line.
print("F1 score : ", f1_score(y_test, y_test_pred, average='micro'), sep="\n")
print("Classification report : ", classification_report(y_test, y_test_pred), sep="\n")
print("Confusion matrix : ", confusion_matrix(y_test, y_test_pred), sep="\n")




#### Use better estimator:
In this iteration, we will utilize the optimally tuned estimator with the ShapFeatureSelector, which is expected to yield improved results.

In [None]:
# Columns passed as `variables` to the MeanMedianImputer step of the pipeline
# defined below. All numeric columns are selected: selecting only 'int'
# columns would leave the list empty, because the features produced by
# make_classification are floating point. The name `int_cols` is kept because
# the later cell refers to it.
int_cols = X_train.select_dtypes(include=['number']).columns.tolist()


In [None]:

# XGBClassifier is not imported anywhere else in the notebook, so bring it
# into scope here before it is used.
from xgboost import XGBClassifier

# Base estimator whose hyper-parameters are tuned below
xgb_clf = XGBClassifier()

# Define the parameter grid for XGBClassifier
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [4, 5],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2],
}

# Define the scoring function
scoring = make_scorer(f1_score, average='micro')  # Use 'micro' average in case of multiclass target

# Set up GridSearchCV and fit it on the training split
grid_search = GridSearchCV(xgb_clf, param_grid, cv=5, scoring=scoring, verbose=1)
grid_search.fit(X_train, y_train)

# The best estimator found by the search drives the SHAP-based feature selection
estimator_for_feature_selector = grid_search.best_estimator_
shap_feature_selector = ShapFeatureSelector(
    model=estimator_for_feature_selector,
    num_features=5,
    scoring='accuracy',
    algorithm='auto',
    cv=5,
    n_iter=10,
    direction='maximum',
)


pipeline = Pipeline([
    # Mean-impute missing values in the columns listed in `int_cols`
    ('floatimputer', MeanMedianImputer(
        imputation_method='mean', variables=int_cols)),

    ('shap_feature_selector', shap_feature_selector),
    # The step name keeps its original spelling so lookups by name still work;
    # the forest is seeded so repeated runs give the same predictions.
    ('classfier', RandomForestClassifier(n_estimators=100, random_state=42)),
])


# Fit the model
pipeline.fit(X_train, y_train)

# Predict on test data
y_test_pred = pipeline.predict(X_test)

# Output first 10 predictions
print(y_test_pred[:10])
            

#### Performance has improved

In [None]:

# Compare y_test with y_test_pred: micro-averaged F1, the classification
# report and the confusion matrix, each printed under its own label line.
print("F1 score : ", f1_score(y_test, y_test_pred, average='micro'), sep="\n")
print("Classification report : ", classification_report(y_test, y_test_pred), sep="\n")
print("Confusion matrix : ", confusion_matrix(y_test, y_test_pred), sep="\n")



#### Shap related plots

In [None]:

# Build the zoish plotting helper from the SHAP feature selector used in the
# pipeline above, then draw its summary plot.
# NOTE(review): presumably the selector must already be fitted (it is a step of
# the pipeline fitted earlier) — confirm against the zoish documentation.
plot_factory = ShapPlotFeatures(shap_feature_selector) 
plot_factory.summary_plot()


In [None]:
# "Full" variant of the summary plot.
# NOTE(review): presumably covers all features rather than only the selected
# ones — confirm against the zoish documentation.
plot_factory.summary_plot_full()

In [None]:

# Draw the bar plot provided by the same plot factory
plot_factory.bar_plot()

In [None]:
# "Full" variant of the bar plot.
# NOTE(review): presumably covers all features rather than only the selected
# ones — confirm against the zoish documentation.
plot_factory.bar_plot_full()

In [None]:
# The feature frame built in this notebook has integer column labels (0-9), so
# the 'special forms' column of the Audiology data set does not exist here.
# Plot the dependence of the first feature column instead.
# NOTE(review): confirm how zoish's dependence_plot resolves integer labels.
plot_factory.dependence_plot(X_train.columns[0])