# Converting sklearn UF model to onnx

----
## Overview

Test case of **LinearRegression and DecisionTreeRegressor models** fitted to UF data and converted to ONNX for use in APS.



 


## Log

* 27-April-2025: Initial notebook 
* 28-April-2025: Added export of other models and removed concentrate as an input


## TODO

* Evaluate other features
* ~~Convert permeate output to flux~~
* ~~Use ONNX to export model~~ https://scikit-learn.org/stable/model_persistence.html 


## Loading and Exploring the Data
Run the code cell below to load necessary Python libraries and load the data. 



In [77]:
###########################################
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


from pathlib import Path
from time import time

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.figure_factory as ff

from sklearn.base import clone
from sklearn.model_selection import cross_val_predict, learning_curve, GridSearchCV, cross_val_score, ShuffleSplit
from sklearn.metrics import fbeta_score, make_scorer, r2_score,mean_squared_error,accuracy_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesRegressor,RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso,LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split

import skl2onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.proto import get_latest_tested_opset_version
from skl2onnx.common._topology import get_default_opset_for_domain
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.helpers.onnx_helper import save_onnx_model
from skl2onnx.common.shape_calculator import calculate_linear_regressor_output_shapes


import onnxruntime as rt

# Widen pandas' console/HTML rendering so wide result tables are not truncated.
_DISPLAY_OPTIONS = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

In [78]:
 
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test, Average='micro', lin=True):
    '''
    Fit `learner` on the first `sample_size` training rows, predict on the
    full training set and the test set, and collect timings and scores.

    inputs:
       - learner: estimator exposing .fit() and .predict(); it is fitted in place
         and the same object is stored in the returned dict
       - sample_size: number of leading rows of the training set used for fitting
       - X_train: features training set
       - y_train: target training set
       - X_test: features testing set
       - y_test: target testing set
       - Average: `average` argument forwarded to fbeta_score; only used when
         `lin` is False. Defaults to 'micro' so regression callers may omit it.
         The parameter is required for multiclass/multilabel targets
         https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
       - lin: True  -> regression scores (r2_*, mse_*)
              False -> classification scores (acc_*, f_* with beta=0.5)

    returns:
       dict with the keys 'model', 'model_learner', 'sample_size', 'train_time',
       'pred_time', the score keys selected by `lin`, and the raw
       'predictions_test' / 'predictions_train' arrays.
    '''

    # Fit on the leading `sample_size` rows only and time the fit
    start = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    train_time = time() - start

    # Predict on the test set and on the *full* training set (not only the
    # sampled rows); both predictions are timed together
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    pred_time = time() - start

    # Key order is kept stable because it defines the column order of the
    # DataFrame built from these dicts
    results = {
        'model': learner.__class__.__name__,
        'model_learner': learner,
        'sample_size': sample_size,
        'train_time': train_time,
        'pred_time': pred_time,
    }

    if lin:
        # Regression: coefficient of determination and mean squared error
        results['r2_train'] = r2_score(y_train, predictions_train)
        results['r2_test'] = r2_score(y_test, predictions_test)
        results['mse_train'] = mean_squared_error(y_train, predictions_train)
        results['mse_test'] = mean_squared_error(y_test, predictions_test)
    else:
        # Classification: accuracy and F-beta (beta=0.5 weights precision higher)
        results['acc_train'] = accuracy_score(y_train, predictions_train)
        results['acc_test'] = accuracy_score(y_test, predictions_test)
        results['f_train'] = fbeta_score(y_train, predictions_train, beta=0.5, average=Average)
        results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5, average=Average)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    results["predictions_test"] = predictions_test
    results["predictions_train"] = predictions_train
    # Return the results
    return results



In [79]:
# Location of the UF dataset, relative to the notebook's working directory
DataSource = 'data/uf_data.csv'

# Read the raw measurements and preview the first five rows
df = pd.read_csv(filepath_or_buffer=DataSource)
df.head(n=5)


Unnamed: 0,TMP_average,flow_feed,conductivity,RTS,temp_average,flux
0,54004.766,0.009328,4.76787,7.992932,291.75406,1e-06
1,54116.473,0.009248,4.76637,7.993019,291.75214,1e-06
2,54228.18,0.009198,4.764869,8.012278,291.75024,1e-06
3,54339.887,0.00916,4.763368,8.037834,291.74832,1e-06
4,54516.883,0.00921,4.754021,8.065409,291.74643,1e-06


In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TMP_average   1100 non-null   float64
 1   flow_feed     1100 non-null   float64
 2   conductivity  1100 non-null   float64
 3   RTS           1100 non-null   float64
 4   temp_average  1100 non-null   float64
 5   flux          1100 non-null   float64
dtypes: float64(6)
memory usage: 51.7 KB


In [81]:
# Work on a copy so the raw frame `df` stays untouched
df_clean = df.copy()
# Names of all numeric columns in the working frame
numeric_features = list(df_clean.select_dtypes(include=[np.number]).columns)

In [82]:
# Scatterplot matrix of the data (histograms on the diagonal, points coloured by TMP_average)
fig = ff.create_scatterplotmatrix(
    df,
    diag='histogram',
    index="TMP_average",
    colormap='Blues',
    height=1200,
    width=1200,
)

# White background with light grid lines and black axis lines
fig.update_layout(
    plot_bgcolor="#ffffff",
    xaxis=dict(gridcolor="#DBDDDB", linecolor="#000000"),
    yaxis=dict(gridcolor="#DBDDDB", linecolor="#000000"),
)
fig.show()

In [83]:
# Scatter of flux against TMP with marginal histograms.
# The x values are TMP_average divided by 1e5 (presumably Pa -> bar; confirm the unit).
# The figure is built once; an earlier bare px.scatter call was immediately
# overwritten and has been removed.
fig = px.scatter(
    x=df_clean["TMP_average"]/1e5,
    y=df_clean["flux"],
    width=1200,
    height=700,
    marginal_x="histogram",
    marginal_y="histogram",
)
fig.update_layout(hovermode="x")
fig.update_xaxes(title_text="TMP_average")
fig.update_yaxes(title_text="flux")
fig.layout.yaxis.gridcolor = "#DBDDDB"
fig.layout.xaxis.gridcolor = "#DBDDDB"
fig.layout.yaxis.linecolor = "#000000"
fig.layout.xaxis.linecolor = "#000000"
fig.layout.plot_bgcolor = "#ffffff"
fig.show()

###  Data Exploration
A cursory investigation of the dataset to determine relevant statistics 

In [84]:
# Drop columns that contain no values at all, then show summary statistics
data = df_clean.dropna(axis="columns", how="all").copy()
data.describe()

Unnamed: 0,TMP_average,flow_feed,conductivity,RTS,temp_average,flux
count,1100.0,1100.0,1100.0,1100.0,1100.0,1100.0
mean,57012.260669,0.009271,4.679796,8.74821,291.657299,1.462937e-06
std,2438.752107,0.00025,0.051565,0.119011,0.040372,5.806463e-08
min,49513.133,0.008458,4.579077,7.992932,291.5897,1.24196e-06
25%,56205.678,0.009103,4.650512,8.645938,291.63888,1.424798e-06
50%,56873.396,0.009294,4.660717,8.773491,291.6533,1.465354e-06
75%,59101.5625,0.009498,4.684794,8.834388,291.66037,1.508713e-06
max,61500.453,0.009791,4.849904,8.926567,291.8098,1.607755e-06


### Check for correlations

Initially only the top n selected features are chosen. 

In [85]:
# Maximum number of rows kept when ranking by correlation with the target
k = 15

# Columns included in the correlation analysis (inputs plus the target)
features = ["TMP_average", "flow_feed", "conductivity", 'RTS', "temp_average", "flux"]
target_feature = 'flux'

# Absolute correlation matrix restricted to the (at most) k rows that are most
# strongly correlated with the target; the result is a DataFrame, not a list of names
cols = data[features].corr().abs().nlargest(k, target_feature)

In [86]:
# Pairwise correlation matrix of the selected columns (pandas default method)
data[features].corr()

Unnamed: 0,TMP_average,flow_feed,conductivity,RTS,temp_average,flux
TMP_average,1.0,0.83402,-0.609528,0.59044,-0.832074,0.788026
flow_feed,0.83402,1.0,-0.420403,0.285004,-0.606652,0.936443
conductivity,-0.609528,-0.420403,1.0,-0.613283,0.653598,-0.43393
RTS,0.59044,0.285004,-0.613283,1.0,-0.500498,0.282674
temp_average,-0.832074,-0.606652,0.653598,-0.500498,1.0,-0.579578
flux,0.788026,0.936443,-0.43393,0.282674,-0.579578,1.0


In [87]:
# Heatmap of the correlation matrix with the coefficient printed in every cell
correlation_matrix = data[features].corr()
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    aspect="auto",
    color_continuous_scale='Blues',
)
fig.update_traces(hovertemplate=None)

# Hover per column, white background, light grid lines and black axis lines
fig.update_layout(
    hovermode="x",
    plot_bgcolor="#ffffff",
    xaxis=dict(gridcolor="#DBDDDB", linecolor="#000000"),
    yaxis=dict(gridcolor="#DBDDDB", linecolor="#000000"),
)
fig.show()

### Initial Model Evaluation

Several models are trained on 50 %, 80 % and 100 % of the training set, and their scores, timings and predictions are collected in a `pd.DataFrame`.


In [88]:
#########
# Scaler for the optional scaled fit (alternatives: RobustScaler, MinMaxScaler)
scaler = StandardScaler()

# Model inputs and target
features = ["TMP_average", "flow_feed", "conductivity", 'RTS', "temp_average"]
target_feature = 'flux'

X = data[features]
y = data[target_feature]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

# Set to True to train on standardised features instead of the raw values
fit_scaled = False

# Fit the scaler on the train set only, then transform train and test sets
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Choose the design matrices once, without overwriting X_train / X_test, so the
# cell stays idempotent when it is re-run
X_train_fit = X_train_scaled if fit_scaled else X_train
X_test_fit = X_test_scaled if fit_scaled else X_test

# Candidate learners; the tree-based ones are seeded so a re-run reproduces the scores
clf_A = DecisionTreeRegressor(random_state=42)
clf_B = RandomForestRegressor(random_state=42)
clf_C = Lasso()
clf_D = LinearRegression()

# Training-set sizes: 50 %, 80 % and 100 % of the training rows
samples_100 = len(y_train)
samples_80 = int(len(y_train)*80/100)
samples_50 = int(len(y_train)*50/100)

# Collect results on the learners (add clf_B / clf_C to the list to evaluate them too)
results = []
for clf in [clf_A, clf_D]:
    for samples in [samples_50, samples_80, samples_100]:
        # train_predict stores the estimator it fitted. Cloning gives every row
        # its own fitted model; without it all rows of one learner would point
        # at the same object, left in the state of the last (largest) fit.
        results.append(
            train_predict(clone(clf), samples, X_train_fit, y_train, X_test_fit, y_test, 'micro')
        )

df_results = pd.DataFrame(results)
df_results.sort_values("r2_test", ascending=False)

Training set has 770 samples.
Testing set has 330 samples.
DecisionTreeRegressor trained on 385 samples.
DecisionTreeRegressor trained on 616 samples.
DecisionTreeRegressor trained on 770 samples.
LinearRegression trained on 385 samples.
LinearRegression trained on 616 samples.
LinearRegression trained on 770 samples.


Unnamed: 0,model,model_learner,sample_size,train_time,pred_time,r2_train,r2_test,mse_train,mse_test,predictions_test,predictions_train
5,LinearRegression,LinearRegression(),770,0.001457,0.002451,0.886545,0.860385,3.875384e-16,4.548097e-16,"[1.4266211159820056e-06, 1.5131191483703274e-0...","[1.3240340865278215e-06, 1.404017968435881e-06..."
4,LinearRegression,LinearRegression(),616,0.002,0.00235,0.886082,0.859286,3.891212e-16,4.583893e-16,"[1.4260718731261094e-06, 1.5140189495240985e-0...","[1.32550198417024e-06, 1.406988259622831e-06, ..."
3,LinearRegression,LinearRegression(),385,0.003316,0.002861,0.885165,0.858457,3.922518e-16,4.610913e-16,"[1.426445298722432e-06, 1.5140961534286499e-06...","[1.3308550233840738e-06, 1.4090379530874904e-0..."
0,DecisionTreeRegressor,DecisionTreeRegressor(),385,0.003978,0.002009,0.888857,0.771623,3.796417e-16,7.439623e-16,"[1.4265437010962357e-06, 1.5415023279405876e-0...","[1.339901351307653e-06, 1.4125866597695966e-06..."
1,DecisionTreeRegressor,DecisionTreeRegressor(),616,0.002998,0.002004,0.92704,0.746554,2.49218e-16,8.256274e-16,"[1.42559871487119e-06, 1.5044732245722172e-06,...","[1.3376856610625154e-06, 1.3913560772466072e-0..."
2,DecisionTreeRegressor,DecisionTreeRegressor(),770,0.003996,0.002002,0.955145,0.743504,1.532172e-16,8.355626e-16,"[1.4242774089368822e-06, 1.4997109076263174e-0...","[1.340446290988662e-06, 1.4069910961863705e-06..."


## Select id to plot

In [89]:
# Row label in df_results of the model to inspect.
# NOTE: `id` shadows the Python builtin; the name is kept because later cells read it.
id = 5
model_name = df_results.loc[id, "model"]
r2 = df_results.loc[id, "r2_test"]

# Test-set predictions stored for the selected row, and their errors
y_predict = df_results.loc[id, "predictions_test"]
residuals = y_test-y_predict
per_error = residuals/y_test*100  # residual as a percentage of the actual value

# Parity plot: predicted against actual flux with box-plot marginals
fig = px.scatter(x=y_test, y=y_predict, width=700, height=700, marginal_x="box", marginal_y="box")
fig.layout.title = f"Parity Plot for {model_name} model with an r2 of {r2:0.4f}"
fig.layout.title.font.size = 16
fig.layout.yaxis.title = "Predicted flux [m3/m2/s]"
fig.layout.xaxis.title = "Actual flux [m3/m2/s]"
fig.layout.yaxis.gridcolor = "#DBDDDB"
fig.layout.xaxis.gridcolor = "#DBDDDB"
fig.layout.yaxis.linecolor = "#000000"
fig.layout.xaxis.linecolor = "#000000"
fig.layout.plot_bgcolor = "#ffffff"
fig.show()

In [90]:
# Distribution of the test-set residuals (actual - predicted) of the selected model
fig = px.histogram(x=residuals, width=700, height=700)
fig.layout.title = f"Histogram of residuals for {model_name} with an r2 of {r2:0.4f}"
fig.layout.title.font.size = 16
fig.layout.yaxis.title = "Count"
fig.layout.xaxis.title = "Residuals"
fig.layout.yaxis.gridcolor = "#DBDDDB"
fig.layout.xaxis.gridcolor = "#DBDDDB"
fig.layout.yaxis.linecolor = "#000000"
fig.layout.xaxis.linecolor = "#000000"
fig.layout.plot_bgcolor = "#ffffff"
fig.show()

## Test model

In [91]:
# Inputs
# NOTE: these values lie well outside the ranges shown by data.describe()
# (e.g. TMP_average), so the prediction is an extrapolation.
input_values = {
    "TMP_average": 401082.437485,
    "flow_feed": 0.030384,
    "conductivity": 5.778287,
    'RTS': 22.395956,
    "temp_average": 301.890964,
}

# Single-row frame with the columns in the order the model was trained on
df_inputs = pd.DataFrame([input_values], columns=features)

df_inputs_scaled = scaler.transform(df_inputs)

# Feed the model the same representation it was fitted on: scaled inputs when
# the models were trained with fit_scaled=True, raw inputs otherwise
model_inputs = df_inputs_scaled if fit_scaled else df_inputs
prediction = df_results.model_learner[id].predict(model_inputs)


print(f"{prediction[0]} ")

6.307264625500475e-06 


## Convert and save the model

In [92]:
# Perform prerequisites for model conversion

model = df_results.model_learner[id]
# Fixed shapes: one row of len(features) inputs in, one value out
input_types = [("input",FloatTensorType([1,len(features)]))]
output_types = [('output', FloatTensorType([1, 1]))]

file_name = f"uf_{model_name}_1.model"

# Convert the model
# target_opset=9 is what this notebook exports with. A target_opset of
# {'': 19, 'ai.onnx.ml': 3} has also worked before; if the conversion breaks,
# setting target_opset to `None` could fix the issue.
onnx_model = convert_sklearn(
    model,
    initial_types=input_types,
    final_types=output_types,
    target_opset=9
)

# Swap only the final extension for ".onnx" (robust to dots elsewhere in the name)
onnx_file_name = Path(file_name).with_suffix(".onnx").name
path = "models/"
# Create the output directory if needed so saving works on a fresh checkout
Path(path).mkdir(parents=True, exist_ok=True)
onnx_file_path = path+onnx_file_name
save_onnx_model(onnx_model, onnx_file_path)
print(f"The model has been saved to '{onnx_file_path}'")


The model has been saved to 'models/uf_LinearRegression_1.onnx'


## Test onnx model

In [93]:
# Load the onnx model
sess = rt.InferenceSession(onnx_file_path, providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name

# Setup input values
input_values = {
    "TMP_average": 401082.437485,
    "flow_feed":0.030384,
    "conductivity":5.778287,
    'RTS':22.395956,
    "temp_average":301.890964,
}

# One row with the values in training-column order. A plain float32 ndarray is
# used (np.matrix is discouraged by NumPy) and matches the model's
# FloatTensorType input.
input_data = np.array([[input_values[feature] for feature in features]], dtype=np.float32)

# Do the actual prediction
pred_onnx = sess.run([label_name], {input_name: input_data})
print(pred_onnx[0][0][0])

6.307264e-06


In [94]:
# Get the model's graph
graph = onnx_model.graph

# Print the name and type of every graph input and output
# (loop variables are named so they do not shadow the builtin `input`)
for graph_input in graph.input:
    print(f"Input name: {graph_input.name}, Type: {graph_input.type}")

for graph_output in graph.output:
    print(f"output name: {graph_output.name}, Type: {graph_output.type}")

Input name: input, Type: tensor_type {
  elem_type: 1
  shape {
    dim {
      dim_value: 1
    }
    dim {
      dim_value: 5
    }
  }
}

output name: output, Type: tensor_type {
  elem_type: 1
  shape {
    dim {
      dim_value: 1
    }
    dim {
      dim_value: 1
    }
  }
}



## Alternate export

In [95]:
## Working

file_name = f"uf_{model_name}_2.onnx"
# Define the initial type for the model: a `None` batch dimension lets the
# exported model accept any number of rows; the feature count comes from the
# `features` list, as in the first export
input_types = [('input', FloatTensorType([None, len(features)]))]
output_types = [('output', FloatTensorType([None, 1]))]

# Convert the model to ONNX format
onnx_model = convert_sklearn(
    model,
    initial_types=input_types,
    final_types=output_types,
    target_opset=9,
)

path = "models/"
# Create the output directory if needed so saving works on a fresh checkout
Path(path).mkdir(parents=True, exist_ok=True)
onnx_file_path = path+file_name
# Save the model to a file
with open(onnx_file_path, "wb") as f:
    f.write(onnx_model.SerializeToString())
print(f"The model has been saved to '{onnx_file_path}'")

The model has been saved to 'models/uf_LinearRegression_2.onnx'


In [96]:
# Load the onnx model
sess = rt.InferenceSession(onnx_file_path, providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name

# Setup input values
input_values = {
    "TMP_average": 401082.437485,
    "flow_feed":0.030384,
    "conductivity":5.778287,
    'RTS':22.395956,
    "temp_average":301.890964,
}

# One row with the values in training-column order. A plain float32 ndarray is
# used (np.matrix is discouraged by NumPy) and matches the model's
# FloatTensorType input.
input_data = np.array([[input_values[feature] for feature in features]], dtype=np.float32)

# Do the actual prediction
pred_onnx = sess.run([label_name], {input_name: input_data})
print(pred_onnx)

[array([[6.307264e-06]], dtype=float32)]


In [97]:
# Get the model's graph
graph = onnx_model.graph

# Print the name and type of every graph input and output
# (loop variables are named so they do not shadow the builtin `input`)
for graph_input in graph.input:
    print(f"Input name: {graph_input.name}, Type: {graph_input.type}")

for graph_output in graph.output:
    print(f"output name: {graph_output.name}, Type: {graph_output.type}")

Input name: input, Type: tensor_type {
  elem_type: 1
  shape {
    dim {
    }
    dim {
      dim_value: 5
    }
  }
}

output name: output, Type: tensor_type {
  elem_type: 1
  shape {
    dim {
    }
    dim {
      dim_value: 1
    }
  }
}

