## Import packages

In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [2]:
# Load the input dataset; the path is resolved relative to the working directory.
INPUT_CSV = "input_data_to_SAG.csv"
data = pd.read_csv(INPUT_CSV)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,11,9,1,11,9,0,0,0,2,15,5,0


In [4]:
# Separate the label column from the predictors:
# y is the 'target' column, X is every other column in its original order.
y = data['target']
X = data.drop(columns='target')

## Build base estimators

In [5]:
# Train three base classifiers with default hyper-parameters on the full data.
# NOTE: all three are built with identical settings and see identical data.
xgb1, xgb2, xgb3 = [XGBClassifier().fit(X, y) for _ in range(3)]

# Last expression of the cell: show the fitted third estimator.
xgb3





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Custom pre-processing

In [6]:
# Columns of X that are forwarded unchanged to the final estimator.
extra_cols = ['A', 'B', 'C']
# Fitted first-level models whose class probabilities become meta-features.
base_estimators = [xgb1, xgb2, xgb3]

# Start the meta-feature frame with the pass-through columns (index reset to 0..n-1
# so it lines up with the freshly built probability frames below).
meta_parts = [X[extra_cols].reset_index(drop=True)]

for est_idx, estimator in enumerate(base_estimators):
    proba = estimator.predict_proba(X)
    # One column per estimator / probability-column pair.
    proba_cols = [f'pred_{est_idx}_probability_{cls}' for cls in range(proba.shape[1])]
    meta_parts.append(pd.DataFrame(proba, columns=proba_cols))

# Glue all parts together side by side.
X_meta = pd.concat(meta_parts, axis=1).reset_index(drop=True)

In [7]:
# Preview the first five rows of the stacked meta-feature frame.
X_meta.head(n=5)

Unnamed: 0,A,B,C,pred_0_probability_0,pred_0_probability_1,pred_1_probability_0,pred_1_probability_1,pred_2_probability_0,pred_2_probability_1
0,0,0,0,0.455779,0.544221,0.455779,0.544221,0.455779,0.544221
1,0,0,0,0.455779,0.544221,0.455779,0.544221,0.455779,0.544221
2,0,0,0,0.455779,0.544221,0.455779,0.544221,0.455779,0.544221
3,0,0,0,0.455779,0.544221,0.455779,0.544221,0.455779,0.544221
4,0,0,0,0.947595,0.052405,0.947595,0.052405,0.947595,0.052405


## Build final estimator model

In [8]:
# Fit the final (second-level) classifier on the stacked meta-features.
xgb4 = XGBClassifier().fit(X_meta, y)

# Last expression of the cell: show the fitted estimator.
xgb4





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Define custom transformer

In [9]:
# Custom stacking wrapper used as the final pipeline step
class Custom_Transformer(BaseEstimator, ClassifierMixin):
    """Stacked classifier: base-estimator probabilities feed a final estimator.

    For prediction, a meta-feature frame is assembled from the pass-through
    columns listed in ``extra_cols`` followed by one
    ``pred_{k}_probability_{c}`` column for every base estimator ``k`` and
    every column ``c`` of that estimator's ``predict_proba`` output. The
    frame is then handed to ``final_estimators.predict_proba``.

    Parameters
    ----------
    base_estimators : list
        Estimators exposing ``predict_proba``. ``fit`` on this class is a
        no-op, so they must already be fitted when passed in.
    final_estimators : estimator
        Estimator exposing ``predict_proba`` that consumes the meta-feature
        frame. It must already be fitted as well.
    extra_cols : list of str
        Column labels of ``X`` copied unchanged into the meta-feature frame.
    """

    def __init__(self, base_estimators, final_estimators, extra_cols):
        self.base_estimators = base_estimators
        self.final_estimators = final_estimators
        self.extra_cols = extra_cols

    def fit(self, X, y=None):
        """Do nothing and return ``self``; no training happens in this class."""
        return self

    def transform(self, X, y=None):
        """Return ``self``; ``X`` and ``y`` are not used."""
        return self

    def predict_proba(self, X):
        """Return the final estimator's class probabilities for ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input rows. Must contain the columns in ``self.extra_cols`` and
            whatever columns the base estimators expect.

        Returns
        -------
        The value returned by ``self.final_estimators.predict_proba`` for
        the assembled meta-feature frame.
        """
        # Read the configuration from the instance, not from same-named
        # notebook globals, so the object works with the arguments it was
        # actually constructed with.
        X_meta = [X[self.extra_cols].reset_index(drop=True)]

        for ei, e in enumerate(self.base_estimators):
            y_pred = e.predict_proba(X)
            X_meta.append(pd.DataFrame(y_pred, columns=[f'pred_{ei}_probability_{i}' for i in range(y_pred.shape[1])]))

        # Index was reset above so every part shares the default 0..n-1 index.
        X_meta = pd.concat(X_meta, axis=1).reset_index(drop=True)

        return self.final_estimators.predict_proba(X_meta)

In the column-name pattern 'pred_{ei}_probability_{i}':
- The '_probability_' part is mandatory and must be followed by the predicted class.
- The prefix 'pred' can be replaced by any other name (e.g. 'target').

In [10]:
# The fitted second-level classifier (xgb4) is used as the final estimator.
final_estimators = xgb4

In [11]:
# Wrap the stacked model in a single-step scikit-learn pipeline for export.
model_object = Custom_Transformer(
    base_estimators=base_estimators,
    final_estimators=final_estimators,
    extra_cols=extra_cols,
)
pipeline_steps = [("xgb", model_object)]
pipeline_obj = Pipeline(steps=pipeline_steps)

## Install new Nyoka version from GitHub

In [12]:
! pip install git+https://github.com/SoftwareAG/nyoka.git@custom-xgb

Collecting git+https://github.com/SoftwareAG/nyoka.git@custom-xgb
  Cloning https://github.com/SoftwareAG/nyoka.git (to revision custom-xgb) to /private/var/folders/f3/yvlftshx1fl8dgwnqypc9ww00000gp/T/pip-req-build-17dkdj_3
  Running command git clone -q https://github.com/SoftwareAG/nyoka.git /private/var/folders/f3/yvlftshx1fl8dgwnqypc9ww00000gp/T/pip-req-build-17dkdj_3
  Running command git checkout -b custom-xgb --track origin/custom-xgb
  Switched to a new branch 'custom-xgb'
  Branch 'custom-xgb' set up to track remote branch 'custom-xgb' from 'origin'.
Building wheels for collected packages: nyoka
  Building wheel for nyoka (setup.py) ... [?25ldone
[?25h  Created wheel for nyoka: filename=nyoka-6.0.0-py3-none-any.whl size=303811 sha256=cebb52c4370ca466d33226ee491b0041205525421e8d7d47cf8e6c2f763dc558
  Stored in directory: /private/var/folders/f3/yvlftshx1fl8dgwnqypc9ww00000gp/T/pip-ephem-wheel-cache-erab79v8/wheels/1e/23/52/1726ba530d45212e6fe4b8f86c2fada79c035ac5d743617a6a
Su

## Import Nyoka

In [13]:
from nyoka import pipeline_to_pmml

## Custom pipeline export to PMML

The only caveat is that the 'target_name' passed to the 'pipeline_to_pmml' exporter must be the same as the name used for the probability columns in the 'predict_proba' method of the custom transformer class (i.e. 'pred' in this example).

In [14]:
# Names of every input column except the label, in their original order.
features = data.columns.drop('target').tolist()

In [15]:
# Export the pipeline to PMML. `target_name` is "pred", the same prefix that
# predict_proba uses for its probability column names.
pipeline_to_pmml(
    pipeline=pipeline_obj,
    col_names=features,
    target_name="pred",
    pmml_f_name="xgb_pipeline.pmml",
)