# Export tabular model to ONNX format

In [53]:
import numpy as np
import onnxruntime as rt
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier, XGBRegressor, DMatrix, train as train_xgb
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType
from skl2onnx import convert_sklearn, to_onnx, update_registered_converter
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
    calculate_linear_regressor_output_shapes
)

from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
import joblib
import pickle
from scipy.special import softmax

## Preprocessors

Copied from [Tabular_Model_F_Adam.ipynb](../Tabular_Model_F_Adam.ipynb).

In [2]:
# Column groups: ordinal scores vs. ratio-scaled numeric columns
ordinal_cols = ['pain', 'acuity']
ratio_cols = [
    'temperature',
    'heartrate',
    'resprate',
    'o2sat',
    'sbp',
    'dbp',
]

# Building blocks used by the two sub-pipelines
impOrd = SimpleImputer(strategy='constant', fill_value=-1)  # missing ordinal value -> -1
impRatio = SimpleImputer(strategy='mean')                   # missing ratio value -> column mean
scale = RobustScaler(with_centering=False)                  # scaling only, no centering
encode = OrdinalEncoder()

# NOTE(review): the same `scale` object is placed in both sub-pipelines.
# ColumnTransformer presumably clones its transformers when fitting, so each
# branch should end up with its own fitted scaler -- confirm for the
# scikit-learn version in use.
ord_pp_steps = Pipeline(steps=[
    ('missing', impOrd),
    ('Ordinal', encode),
    ('Scale', scale),
])
ratio_pp_steps = Pipeline(steps=[
    ('mean', impRatio),
    ('Scale', scale),
])

# Preprocessor stage of the final pipeline: route each column group through
# its own sub-pipeline
t = [
    ("ordinal", ord_pp_steps, ordinal_cols),
    ('ratio', ratio_pp_steps, ratio_cols),
]
preprocessor = ColumnTransformer(transformers=t)

In [3]:
preprocessor

### Load training data to fit the preprocessor

In [4]:
def load_data(file_path):
    """
    Load data from a CSV file and cast the feature columns to fixed dtypes.

    The columns temperature, heartrate, resprate, o2sat, sbp and dbp are cast
    to float. The columns pain and acuity are cast to pandas' nullable "Int64"
    dtype, so missing entries stay missing instead of forcing a float column.
    All other columns in the file are returned as read by pandas.

    Parameters:
    - file_path: Path to the CSV file (anything accepted by pd.read_csv)
    Returns:
    - DataFrame with the loaded data and the feature columns cast as above.
    Raises:
    - KeyError if one of the eight expected feature columns is absent.
    """
    float_cols = ('temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp')
    int_cols = ('pain', 'acuity')

    # One dtype mapping, applied in a single astype call
    dtypes = {col: float for col in float_cols}
    dtypes.update({col: "Int64" for col in int_cols})

    return pd.read_csv(file_path).astype(dtypes)

In [5]:
# Training features used to fit the preprocessor; the relative path is
# resolved against the current working directory.
X_train = load_data('../../data/X_train_4_bal_s.csv')

In [6]:
X_train

Unnamed: 0,patient_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity
0,10000980,97.8,57.0,18.0,100.0,180.0,88.0,0,2
1,10004322,97.5,91.0,17.0,100.0,126.0,81.0,0,3
2,10030487,97.0,72.0,18.0,100.0,122.0,57.0,0,3
3,10032409,97.7,120.0,20.0,100.0,168.0,94.0,10,3
4,10039360,99.2,59.0,18.0,100.0,201.0,99.0,9,2
...,...,...,...,...,...,...,...,...,...
2060,16586729,103.3,118.0,22.0,100.0,145.0,68.0,2,3
2061,15845966,98.8,93.0,20.0,100.0,138.0,79.0,0,2
2062,18166516,98.2,86.0,18.0,99.0,135.0,91.0,5,2
2063,10882818,99.0,98.0,20.0,100.0,139.0,83.0,4,3


In [7]:
# Fit the imputers, encoder and scalers on the training data. Only the columns
# named in ordinal_cols / ratio_cols are assigned to a transformer; the other
# columns of X_train (e.g. patient_id) are not listed in any of them.
preprocessor.fit(X_train)

## Load XGBoost model

In [8]:
# Path to the pickled XGBoost model (presumably already trained elsewhere)
file_name = '../Saved_Models/xgb_tabular_model_4-1-24.pkl'

# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only load model files from a trusted source. Unpickling presumably also
# needs an xgboost version compatible with the one that saved it -- confirm.
with open(file_name, 'rb') as file:
    loaded_xgb = pickle.load(file)

In [9]:
loaded_xgb

In [10]:
# Chain the preprocessor fitted above with the loaded classifier. The pipeline
# itself is not fit in the cells shown here; it is used only for prediction
# and for ONNX conversion.
pipe = Pipeline([('preprocess',preprocessor), ('estimator', loaded_xgb)])

In [11]:
pipe

In [12]:
# Single-row sample (first training row with patient_id removed) used to
# compare the scikit-learn prediction with the ONNX prediction.
val = X_train.head(1).drop(columns=['patient_id'])
val

Unnamed: 0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity
0,97.8,57.0,18.0,100.0,180.0,88.0,0,2


In [13]:
# Reference class probabilities from the scikit-learn pipeline for the sample
out = pipe.predict_proba(val)
out

array([[0.3197859 , 0.08549188, 0.34794137, 0.09823816, 0.1485427 ]],
      dtype=float32)

In [14]:
out[0].sum()

1.0

In [15]:
# First conversion attempt, made before a converter is registered for
# XGBClassifier. The exception is printed instead of raised so the notebook
# keeps running; the return value of convert_sklearn is not kept.
# Each column is declared as its own named input with shape [None, 1].
try:
    convert_sklearn(
        pipe,
        "pipeline_tabular_model",
        [
            ('temperature', FloatTensorType([None, 1])),
            ('heartrate', FloatTensorType([None, 1])),
            ('resprate', FloatTensorType([None, 1])),
            ('o2sat', FloatTensorType([None, 1])),
            ('sbp', FloatTensorType([None, 1])),
            ('dbp', FloatTensorType([None, 1])),
            ('pain', Int64TensorType([None, 1])),
            ('acuity', Int64TensorType([None, 1]))
        ])
except Exception as e:
    print(e)

Unable to find a shape calculator for type '<class 'xgboost.sklearn.XGBClassifier'>'.
It usually means the pipeline being converted contains a
transformer or a predictor with no corresponding converter
implemented in sklearn-onnx. If the converted is implemented
in another library, you need to register
the converted so that it can be used by sklearn-onnx (function
update_registered_converter). If the model is not yet covered
by sklearn-onnx, you may raise an issue to
https://github.com/onnx/sklearn-onnx/issues
to get the converter implemented or even contribute to the
project. If the model is a custom model, a new converter must
be implemented. Examples can be found in the gallery.



  ar = np.array([op.missing_values]).astype(np.int64)


In [64]:
# Register the onnxmltools XGBoost converter with skl2onnx so that a pipeline
# containing an XGBClassifier can be converted (the unregistered attempt fails
# with "Unable to find a shape calculator").
update_registered_converter(
    XGBClassifier,
    'XGBoostXGBClassifier',
    calculate_linear_classifier_output_shapes,
    convert_xgboost,
    options={"nocl": [True, False], "zipmap": [False, True]})

In [66]:
# Declare one ONNX graph input per feature column, named after the column.
# Ratio columns become float tensors and ordinal columns int64 tensors, in the
# order ratio_cols followed by ordinal_cols.
# NOTE(review): each input is declared with shape [1], whereas the earlier
# attempt used [None, 1] -- confirm that a fixed single-value input is intended.
initial_types = [(col, FloatTensorType([1])) for col in ratio_cols]
initial_types += [(col, Int64TensorType([1])) for col in ordinal_cols]

model_onnx = convert_sklearn(pipe, "pipeline_tabular_model", initial_types)

# Serialize the converted model to disk
onnx_path = "../../onnx_models/pipeline_tabular_model.onnx"
with open(onnx_path, "wb") as f:
    f.write(model_onnx.SerializeToString())

In [67]:
val

Unnamed: 0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity
0,97.8,57.0,18.0,100.0,180.0,88.0,0,2


In [68]:
# Load the exported model on the CPU provider
sess = rt.InferenceSession(
    "../../onnx_models/pipeline_tabular_model.onnx",
    providers=["CPUExecutionProvider"],
)

# One single-element list per graph input, keyed by column name; the values
# are those of the sample row shown in `val`.
onnx_inputs = {
    "temperature": [97.8],
    "heartrate": [57.0],
    "resprate": [18.0],
    "o2sat": [100.0],
    "sbp": [180.0],
    "dbp": [88.0],
    "pain": [0],
    "acuity": [2],
}

# None requests every model output
pred_onx = sess.run(None, onnx_inputs)
pred_onx

[array([2], dtype=int64),
 [{0: 1.0796387195587158,
   1: -0.2395918369293213,
   2: 1.1640210151672363,
   3: -0.10061836242675781,
   4: 0.31285932660102844}]]

In [26]:
# Recompute the scikit-learn probabilities for comparison with the ONNX output
out = pipe.predict_proba(val)
out

array([[0.3197859 , 0.08549188, 0.34794137, 0.09823816, 0.1485427 ]],
      dtype=float32)

In [27]:
# The second ONNX output holds per-class scores that do not sum to 1. Applying
# softmax to them reproduces pipe.predict_proba(val) up to float precision.
softmax(list(pred_onx[1][0].values()))

array([0.31978591, 0.08549186, 0.34794139, 0.09823815, 0.14854268])