In [1]:
from pathlib import Path

import numpy as np
import onnx
import onnxruntime as rt
import pandas as pd
import xgboost as xgb
from scipy.stats import ks_2samp
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Model 1

In [2]:
# Load the train/test CSVs (edit the two paths below to point at other data files)
train_data = pd.read_csv('data/train_data_1.csv')
test_data = pd.read_csv('data/test_data_1.csv')

# Training split: every column except 'checked' is a feature (cast to float32,
# the dtype later fed to ONNX Runtime); 'checked' is the target.
X_train = train_data.drop(columns=['checked']).astype(np.float32)
y_train = train_data['checked']

# Test split, prepared exactly like the training split.
X_test = test_data.drop(columns=['checked']).astype(np.float32)
y_test = test_data['checked']

In [3]:
# Select important features: a class-weight-balanced random forest supplies the
# feature importances that SelectFromModel filters on.
# random_state is pinned so that the selected features (and therefore the
# cross-validation scores and the final refit) are reproducible across runs.
# NOTE(review): with SelectFromModel's default threshold, `max_features` (tuned
# in the grid search) is only an upper bound on the number of kept features;
# pass threshold=-np.inf if exactly the top-N features are intended - confirm.
selector = SelectFromModel(
    RandomForestClassifier(class_weight='balanced', random_state=42)
)

In [4]:
# XGBoost classifier with the binary logistic objective; every other
# hyper-parameter is left at the library default.
xgb_params = {'objective': 'binary:logistic'}
classifier = xgb.XGBClassifier(**xgb_params)

In [5]:
# Chain feature selection and classification into one estimator. The step names
# are the prefixes used to address hyper-parameters ('<step>__<param>').
pipeline_steps = [
    ('feature_selection', selector),
    ('classification', classifier),
]
pipeline = Pipeline(steps=pipeline_steps)

In [6]:
# Hyper-parameter search over the pipeline with 5-fold cross-validation.
# Candidate values, addressed as '<step name>__<parameter>':
param_grid = dict(
    feature_selection__max_features=[50, 75, 100],
    classification__learning_rate=[0.1, 0.2, 0.3],
)

# Every combination in the grid is scored by ROC AUC; verbose=2 logs each fit.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=2,
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   4.3s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   3.2s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   3.1s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   3.1s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=50; total time=   3.1s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=75; total time=   3.2s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=75; total time=   3.2s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=75; total time=   3.2s
[CV] END classification__learning_rate=0.1, feature_selection__max_features=75; total time=   3.2s
[CV] END classification__learning_rate=0.1, featu

In [7]:
# Update pipeline: apply every tuned hyper-parameter in one call.
# best_params_ uses the same '<step>__<param>' keys that Pipeline.set_params
# understands, so this stays in sync with param_grid if keys are added or
# removed (copying each value by hand would silently miss new ones).
# The trailing ';' suppresses the estimator repr in the cell output.
pipeline.set_params(**grid_search.best_params_);

In [8]:
# Train the tuned pipeline on the full training set. Pipeline.fit returns the
# pipeline itself, so `model` and `pipeline` refer to the same fitted object.
model = pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out test set
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

original_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of the original model: ', original_accuracy)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      2278
           1       0.86      0.56      0.68       251

    accuracy                           0.95      2529
   macro avg       0.91      0.78      0.83      2529
weighted avg       0.94      0.95      0.94      2529

Accuracy of the original model:  0.9474100434954528


In [9]:
from skl2onnx import update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost  # noqa

# Register a converter for the XGBoost step so skl2onnx can translate the whole
# pipeline: onnxmltools' XGBoost converter plus the linear-classifier output
# shape calculator.
update_registered_converter(
    xgb.XGBClassifier,
    "XGBoostClassifier",
    calculate_linear_classifier_output_shapes,
    convert_xgboost,
    options={"nocl": [True, False], "zipmap": [True, False, "columns"]}
)

# Convert the fitted pipeline to ONNX. The graph takes one float tensor named
# 'X' with a free batch dimension and one column per training feature.
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X_train.shape[1])))],
    target_opset=12)

# Check the accuracy of the converted model by running it with ONNX Runtime
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx = sess.run(None, {'X': X_test.values.astype(np.float32)})

# sess.run returns a list of outputs; the first one is used as the labels.
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

# Equal accuracy alone cannot prove a faithful conversion (disagreements can
# cancel out), so also report the fraction of test rows on which the ONNX
# labels match the original model's predictions (`y_pred`).
label_agreement = np.mean(
    np.asarray(y_pred_onnx[0]).ravel() == np.asarray(y_pred).ravel()
)
print('Label agreement between ONNX and original model: ', label_agreement)

Accuracy of the ONNX model:  0.9474100434954528


In [10]:
# Save the model. The target directory is created first so that a fresh
# checkout without a 'model/' folder does not make onnx.save fail.
model_path = Path("model/model_1.onnx")
model_path.parent.mkdir(parents=True, exist_ok=True)
onnx.save(onnx_model, str(model_path))

# Load the model back from disk to verify that the saved file is usable
new_session = rt.InferenceSession(str(model_path))

# Predict the target with the reloaded model
y_pred_onnx2 = new_session.run(None, {'X': X_test.values.astype(np.float32)})

# The first output is used as the predicted labels.
accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx2[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9474100434954528
