In [1]:
import pandas as pd
from joblib import dump

from sklearn.pipeline import make_pipeline, make_union
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Pull the clinc_oos-plus CSV straight from GitHub. The displayed frame shows
# a `text` column (utterances), a `label` column and a `split` column.
# NOTE(review): this needs network access on every Restart & Run All —
# consider caching the file locally.
url = 'https://raw.githubusercontent.com/koaning/onnx-demo/main/clinc_oos-plus.csv'
df = pd.read_csv(url)
df

Unnamed: 0,text,label,split
0,what expression would i use to say i love you ...,61,train
1,can you tell me how to say 'i do not speak muc...,61,train
2,"what is the equivalent of, 'life is good' in f...",61,train
3,"tell me how to say, 'it is a beautiful morning...",61,train
4,"if i were mongolian, how would i say that i am...",61,train
...,...,...,...
20745,find my wallet,42,valid
20746,can you give me the gps location of harvey,42,valid
20747,where's my buddy steve right this second,42,valid
20748,locate jenny at her present position,42,valid


In [3]:
# Inputs are the raw utterance strings, targets are the `label` column.
# Every row is used for fitting; the `split` column is not consulted here.
X = df['text'].tolist()
y = df['label']

# Token counts feeding a logistic-regression classifier, chained as one estimator.
pipe = make_pipeline(CountVectorizer(), LogisticRegression())
pipe.fit(X, y)

# Persist the complete fitted pipeline to disk; the call's return value
# (the list of written file names) is what the cell displays.
dump(pipe, '7_save.joblib')

['7_save.joblib']

In [4]:
from joblib import load
# Round trip: restore the pipeline saved above and classify one utterance
# to confirm the reloaded object still predicts.
trained = load('7_save.joblib')
trained.predict(['hello'])

array([82], dtype=int64)

In [5]:
class Evil:
    """Stand-in "model" that ignores its input and always answers ``1``.

    It exposes only a ``predict`` method, which is enough for it to be used
    wherever the notebook calls ``.predict`` on a loaded object.
    """

    def predict(self, X):
        """Return a list containing the constant ``1`` once per item of ``X``.

        ``X`` may be any iterable; its contents are never inspected.
        """
        answers = []
        for _item in X:
            answers.append(1)
        return answers

In [6]:
# An impostor object standing in for a trained pipeline.
evil_pipe = Evil()

In [7]:
# Saved with the same `dump` call as the real pipeline, under a different file name.
dump(evil_pipe, '7_save_evil.joblib')

['7_save_evil.joblib']

In [8]:
# Loading gives back whatever object was saved: here the `Evil` instance, whose
# predict() returns 1 for every input. Nothing in `load` checks that the file
# holds a real model, so only load files from a source you trust.
pipe_loaded = load('7_save_evil.joblib')
pipe_loaded.predict(['hello'])

[1]

In [9]:
import hashlib

def calc_checksum(path, chunk_size=1 << 20):
    """Print the MD5 hex digest of the file at ``path``.

    The file is hashed in fixed-size chunks, so a large model artifact never
    has to be held in memory all at once.

    Parameters
    ----------
    path : str or path-like
        File to hash; opened in binary mode.
    chunk_size : int, optional
        Number of bytes read per iteration (default 1 MiB).

    Returns
    -------
    None
        The digest is printed, not returned.

    Notes
    -----
    MD5 is adequate for spotting that two files differ, but it is not
    collision-resistant; use a SHA-2 hash when the checksum must hold up
    against deliberate tampering.
    """
    md5_hash = hashlib.md5()
    with open(path, 'rb') as f:
        # iter(callable, sentinel) keeps reading until read() returns b'' at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5_hash.update(chunk)
    print(md5_hash.hexdigest())

# Fingerprint both saved files: the genuine pipeline and the impostor produce
# different digests, so a recorded checksum can tell them apart.
calc_checksum('7_save.joblib')
calc_checksum('7_save_evil.joblib')

3239ec92e23ec516a86bcf906c55b9eb
92315453ee4dcaf50814e4548622acd3


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_wine

# NOTE: from here on X and y refer to the wine data, not the text data used earlier.
X, y = load_wine(return_X_y=True)
clf = LogisticRegression(max_iter=10_000)

# Snapshot the estimator's attribute names on either side of fit().
prop_before = dir(clf)
clf.fit(X, y)
prop_after = dir(clf)

# Names that exist only after fitting. dir() yields a sorted list of unique
# names, so a sorted set difference reproduces the filtered listing.
sorted(set(prop_after) - set(prop_before))

['classes_', 'coef_', 'intercept_', 'n_features_in_', 'n_iter_']

In [11]:
# Inspect one of the fitted attributes: the output shows one intercept for each of the three classes.
clf.intercept_

array([-16.65933456,  22.52722423,  -5.86788967])

In [12]:
import h5py

def save_coef(classifier, filename):
    """Write a fitted classifier's learned arrays to an HDF5 file.

    Three datasets are created in ``filename`` (opened in ``'w'`` mode):
    ``'coef'``, ``'intercept'`` and ``'classes'``, taken from the
    classifier's ``coef_``, ``intercept_`` and ``classes_`` attributes.
    Only these arrays are stored; no other state of the estimator is saved.
    """
    dataset_sources = (
        ('coef', 'coef_'),
        ('intercept', 'intercept_'),
        ('classes', 'classes_'),
    )
    with h5py.File(filename, 'w') as hf:
        for dataset_name, attribute in dataset_sources:
            hf.create_dataset(dataset_name, data=getattr(classifier, attribute))

def load_coef(classifier, filename):
    """Attach arrays stored in an HDF5 file to ``classifier``, in place.

    Reads the ``'coef'``, ``'intercept'`` and ``'classes'`` datasets from
    ``filename`` (opened read-only) and assigns them to the classifier's
    ``coef_``, ``intercept_`` and ``classes_`` attributes. Returns ``None``.
    """
    dataset_names = ('coef', 'intercept', 'classes')
    with h5py.File(filename, 'r') as hf:
        # `[:]` copies each dataset out of the file before it is closed.
        restored = {name: hf[name][:] for name in dataset_names}
    for name in dataset_names:
        setattr(classifier, name + '_', restored[name])

In [13]:
# Store only the fitted arrays of the wine classifier, not the estimator object itself.
save_coef(clf, '7_clf.h5')

In [14]:
# A brand-new estimator that is never fitted: load_coef attaches coef_,
# intercept_ and classes_ from the file, after which predict() runs on the
# wine features. Constructor settings (e.g. the max_iter used for `clf`) are
# not part of the file, so `lr` keeps its defaults.
lr = LogisticRegression()
load_coef(lr, '7_clf.h5')
lr.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [15]:
# Show the text pipeline (CountVectorizer -> LogisticRegression) fitted earlier;
# it is the object converted to ONNX next.
pipe

In [16]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

# Declare the model's single input: a string tensor named 'text_input' with
# shape [None, 1].
initial_types = [('text_input', StringTensorType([None, 1]))]
onx = convert_sklearn(pipe, initial_types=initial_types)

# Write the serialized ONNX model to disk as raw bytes.
with open('7_onnx.onnx', 'wb') as f:
    f.write(onx.SerializeToString())

In [17]:
import numpy as np
import onnxruntime as rt

# Run the exported model without scikit-learn: open an inference session on
# the saved file and look up the name of its first (only declared) input.
session = rt.InferenceSession('7_onnx.onnx')
input_name = session.get_inputs()[0].name

# The query is wrapped as a 1x1 array of strings to match the declared
# [None, 1] input shape. run(None, ...) requests all outputs; two come back
# and the second is kept as `probas`.
# NOTE(review): assigning to `_` in IPython shadows the "last output" variable
# for the rest of the session — consider a descriptive name instead.
query = 'example'
_, probas = session.run(None, {input_name: np.array([[query]])})
# Per the displayed output, probas[0] is a dict mapping class id -> probability
# for the single query row.
probas[0]

{0: 0.004739089403301477,
 1: 0.0038543727714568377,
 2: 0.00090967578580603,
 3: 0.0005303092184476554,
 4: 0.0021704065147787333,
 5: 0.007713638246059418,
 6: 0.0008044824353419244,
 7: 0.0012734602205455303,
 8: 0.0023176090326160192,
 9: 0.005703362636268139,
 10: 0.010597948916256428,
 11: 0.0014304597862064838,
 12: 0.0017885735724121332,
 13: 0.012635118328034878,
 14: 0.008710585534572601,
 15: 0.0015316539211198688,
 16: 0.007715681102126837,
 17: 0.0018564617494121194,
 18: 0.003475870005786419,
 19: 0.0027919479180127382,
 20: 0.004640719387680292,
 21: 0.0025059967301785946,
 22: 0.0035873972810804844,
 23: 0.00037493507261388004,
 24: 0.0018336804350838065,
 25: 0.021399937570095062,
 26: 0.005533374845981598,
 27: 0.0013260310515761375,
 28: 0.0014045751886442304,
 29: 0.005757700651884079,
 30: 0.03491854667663574,
 31: 0.004703733138740063,
 32: 0.00686904601752758,
 33: 0.007320633623749018,
 34: 0.0030579655431210995,
 35: 0.0017219164874404669,
 36: 0.01048553362488