In [2]:
!pip install skl2onnx
!pip install onnxruntime

Collecting onnxruntime
  Using cached onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Using cached coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Using cached humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Using cached onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
Using cached coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
Using cached humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
Installing collected packages: humanfriendly, coloredlogs, onnxruntime
Successfully installed coloredlogs-15.0.1 humanfriendly-10.0 onnxruntime-1.20.1


In [3]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper

from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn

import onnxruntime as rt

from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer

import graphviz

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd /content/drive/My Drive/Colab Notebooks/
data = pd.read_csv("data_public.csv")

/content/drive/My Drive/Colab Notebooks


In [6]:
# Separate features and target
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

In [23]:
# Define the preprocessing steps for numeric features
numeric_features = X.columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
     ('scaler', StandardScaler()) ])

In [24]:
# Combine preprocessing steps in a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

In [25]:
# Define the pipeline with preprocessing, PCA, feature selection, and classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=10)),
    ('selector', SelectKBest(k=9)),
    ('classifier', RandomForestClassifier(
        random_state=42,
        n_estimators=200,
        max_depth=10,
        min_samples_split=10 ))
])

In [26]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [27]:
# Predictions and evaluation
print(classification_report(pipeline.predict(X_train),
                            y_train))

              precision    recall  f1-score   support

           1       0.45      0.56      0.50    130649
           2       1.00      0.75      0.86    539809
           3       0.63      0.78      0.70    409542

    accuracy                           0.74   1080000
   macro avg       0.69      0.70      0.68   1080000
weighted avg       0.79      0.74      0.75   1080000



In [28]:
pipeline.predict(X_test)

array([2, 2, 3, ..., 2, 3, 1])

In [29]:
y_test.values.ravel()

array([2, 3, 1, ..., 2, 3, 1])

In [30]:
print(classification_report(pipeline.predict(X_test),
                            y_test))

              precision    recall  f1-score   support

           1       0.41      0.50      0.45     14811
           2       1.00      0.75      0.86     60019
           3       0.61      0.76      0.68     45170

    accuracy                           0.72    120000
   macro avg       0.67      0.67      0.66    120000
weighted avg       0.78      0.72      0.74    120000



In [31]:
# Convert the pipeline to ONNX format
input_types = [(col, FloatTensorType([None, 1])) for col in X.columns]

try:
  model_onnx = convert_sklearn(pipeline, initial_types=input_types)
  with open("pipeline_model.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())
  print("Model successfully converted and saved to pipeline_model.onnx")
except Exception as e:
  print("Error during conversion:", e)

Model successfully converted and saved to pipeline_model.onnx


In [33]:
inputs_onnx = {k: np.array(v).astype(np.float32)[:, np.newaxis] for k, v in X_test.to_dict(orient='list').items()}

session_onnx = rt.InferenceSession("pipeline_model.onnx")
predict_onnx = session_onnx.run(None, inputs_onnx)
print("predict", predict_onnx[0])

predict [2 2 3 ... 2 3 1]
