In [5]:
import tensorflow as tf
import numpy as np
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import edgetpu.conversion_tf as conv
from hummingbird.ml import convert

In [39]:
x = tf.constant([5, 5, 5, 5, 5, 5, 5, 5], shape=[1,8])
y = tf.constant([1, 2, 3, 4, 5, 6, 7, 8], shape=[1,8])

In [40]:
A = tf.constant([[1, 1, 1], [2, 2, 2], [3, 3, 3]], shape=[3,3])
B = tf.constant([[1, 2, 3], [1, 2, 3], [1, 2, 3]], shape=[3,3])

A_t = torch.tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
B_t = torch.tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3]])

In [41]:
tf.multiply(A_t, tf.transpose(B_t))

<tf.Tensor: shape=(3, 3), dtype=int64, numpy=
array([[1, 1, 1],
       [4, 4, 4],
       [9, 9, 9]])>

In [42]:
tf.multiply(A, tf.transpose(B))

<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 1, 1],
       [4, 4, 4],
       [9, 9, 9]], dtype=int32)>

In [43]:
tf.less_equal(A, B_t)

<tf.Tensor: shape=(3, 3), dtype=bool, numpy=
array([[ True,  True,  True],
       [False,  True,  True],
       [False, False,  True]])>

In [44]:
C = tf.constant([[1, 1], [2, 2], [3, 3]], shape=[3, 2])

In [45]:
C

<tf.Tensor: shape=(3, 2), dtype=int32, numpy=
array([[1, 1],
       [2, 2],
       [3, 3]], dtype=int32)>

In [46]:
for i, row in enumerate(A):
    for j, col in enumerate(tf.transpose(C)):
        added = tf.multiply(row, col)
        val = tf.reduce_sum(added)
        print(f'({i},{j}): {val}')

(0,0): 6
(0,1): 6
(1,0): 12
(1,1): 12
(2,0): 18
(2,1): 18


In [7]:
def representative_dataset():
    for _ in range(100):
      data = np.random.uniform(low=0., high=8., size=(1,8))
      yield [data.astype(np.float32)]
 
tf.config.run_functions_eagerly(True)

forest = RandomForestClassifier(n_estimators=105)
X, y = make_classification(n_samples=1300, n_features=8,
                           n_informative=4, n_redundant=1,
                           random_state=0, shuffle=True,
                           n_classes=4)

x_train, y_train = X[:1000], y[:1000]
x_test, y_test = X[1000:], y[1000:]

forest.fit(x_train, y_train)

X = tf.constant([1, 2, 3, 4, 5, 6, 7, 8], shape=[1, 8], dtype=tf.int32)
X_float = tf.constant([1., 2., 3., 4., 5., 6., 7., 8.], shape=[1, 8])
X_8 = tf.constant([1, 2, 3, 4, 5, 6, 7, 8], shape=[1, 8], dtype=tf.int8)

2023-03-20 11:27:27.005487: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-03-20 11:27:27.005606: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (XPS-9500): /proc/driver/nvidia/version does not exist
2023-03-20 11:27:27.013078: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [48]:
conv_model = convert(forest, 'torch', extra_config={"tree_implementation":"gemm"})

In [49]:
op = conv_model.model._operators[0]

In [6]:
model_gemm = conv.GEMMDecisionTreeImpl(forest)

y_mod_pred_gemm, y_mod_gemm = conv_model.model._operators[0].forward((torch.tensor([[1., 2., 3., 4., 5., 6., 7., 8.]])))

NameError: name 'forest' is not defined

In [51]:
y_mod_gemm

tensor([[0.2952, 0.4190, 0.0286, 0.2571]], grad_fn=<TBackward0>)

In [52]:
y_pred_gemm, y_gemm = model_gemm(X_float)

In [53]:
y_gemm

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0.2952381 , 0.41904762, 0.02857143, 0.25714287]], dtype=float32)>

In [54]:
tf.cast(y_gemm / 100, tf.uint8)

<tf.Tensor: shape=(1, 4), dtype=uint8, numpy=array([[0, 0, 0, 0]], dtype=uint8)>

### Testing casted __call__ run

In [55]:
x = tf.transpose(tf.constant([[1,2,3,4,5,6,7,8]], dtype=tf.int8))
xf = tf.transpose(tf.constant([[1.,2.,3.,4.,5.,6.,7.,8.]], dtype=tf.float32))
w1 = tf.cast(op.weight_1.detach(), tf.int8) 
b1 = tf.cast(op.bias_1.detach() * 10, tf.int8)

In [56]:
x = tf.less_equal(np.matmul(w1, x) * 10, b1)

In [57]:
xf = tf.less_equal(tf.linalg.matmul(op.weight_1.detach(), xf), op.bias_1.detach())

By mulitplying both sides by 10 we prevent values like 0.2 to lose their information by being rounded to 0 .This way we managed to make the comparisons equivalent but its now int32 based

In [58]:
x == xf

<tf.Tensor: shape=(21525, 1), dtype=bool, numpy=
array([[ True],
       [ True],
       [ True],
       ...,
       [ True],
       [ True],
       [ True]])>

In [59]:
x = tf.cast(x, tf.uint8)
xf = tf.cast(xf, tf.float32)

In [60]:
tf.math.less_equal(w1 * 10, b1) == tf.math.less_equal(op.weight_1.detach(), op.bias_1.detach())

<tf.Tensor: shape=(21525, 8), dtype=bool, numpy=
array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])>

In [61]:
x = tf.reshape(x, (op.n_trees, op.hidden_one_size, -1))
xf = tf.reshape(xf, (op.n_trees, op.hidden_one_size, -1))

In [62]:
w2 = tf.cast(op.weight_2.detach(), tf.uint8)
x = np.matmul(w2, x)
xf = tf.linalg.matmul(op.weight_2.detach(), xf)
x == tf.cast(xf, tf.int32)

<tf.Tensor: shape=(105, 206, 1), dtype=bool, numpy=
array([[[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       ...,

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]]])>

In [63]:
x = tf.reshape(x, (op.n_trees * op.hidden_two_size, -1)) == tf.cast(op.bias_2.detach(), tf.uint8)
xf = tf.reshape(xf, (op.n_trees * op.hidden_two_size, -1)) == op.bias_2.detach()

In [64]:
x = tf.reshape(x, (op.n_trees, op.hidden_two_size, -1))
xf = tf.reshape(xf, (op.n_trees, op.hidden_two_size, -1))

In [65]:
x = tf.cast(x, dtype=tf.uint8)
xf = tf.cast(xf, dtype=tf.float32)

In [66]:
x == tf.cast(xf, tf.uint8)

<tf.Tensor: shape=(105, 206, 1), dtype=bool, numpy=
array([[[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       ...,

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]],

       [[ True],
        [ True],
        [ True],
        ...,
        [ True],
        [ True],
        [ True]]])>

In [67]:
tf.cast(op.weight_3.detach() * 10000, tf.uint8)

<tf.Tensor: shape=(105, 4, 206), dtype=uint8, numpy=
array([[[ 0,  0,  0, ...,  0,  0,  0],
        [95,  0,  0, ...,  0,  0,  0],
        [ 0, 95, 95, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0, 95, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [95,  0,  0, ...,  0,  0,  0],
        [ 0, 95,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [95,  0,  0, ...,  0,  0,  0],
        [ 0,  0, 95, ...,  0,  0,  0],
        [ 0, 95,  0, ...,  0,  0,  0]],

       ...,

       [[95, 95,  0, ...,  0,  0,  0],
        [ 0,  0, 95, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0,  0,  0, ...,  0,  0,  0],
        [95,  0,  0, ...,  0,  0,  0],
        [ 0, 95, 95, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]],

       [[ 0, 95,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0, 95, ...,  0,  0,  0],
        [95,  0,  0, ...,  

In [68]:
concrete_func = model_gemm.__call__.get_concrete_function()

In [69]:
converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func], model_gemm)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8  
converter.inference_output_type = tf.int8
converter.allow_custom_ops = True

In [70]:
tflite_model_gemm = converter.convert()

INFO:tensorflow:Assets written to: /tmp/tmpire21tpy/assets


2023-02-07 19:33:16.493767: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2023-02-07 19:33:16.493830: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2023-02-07 19:33:16.494038: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpire21tpy
2023-02-07 19:33:16.507849: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2023-02-07 19:33:16.507891: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: /tmp/tmpire21tpy
2023-02-07 19:33:16.560733: I tensorflow/cc/saved_model/loader.cc:229] Restoring SavedModel bundle.
2023-02-07 19:33:16.647107: I tensorflow/cc/saved_model/loader.cc:213] Running initialization op on SavedModel bundle at path: /tmp/tmpire21tpy
2023-02-07 19:33:16.738874: I tensorflow/cc/saved_model/loader.cc:305] SavedModel load for tags { serve }; Status: success: OK. Took 244840 

Estimated count of arithmetic ops: 9.386 M  ops, equivalently 4.693 M  MACs
Estimated count of arithmetic ops: 9.386 M  ops, equivalently 4.693 M  MACs


fully_quantize: 0, inference_type: 6, input_inference_type: INT8, output_inference_type: INT8
2023-02-07 19:33:18.307182: I tensorflow/compiler/mlir/lite/flatbuffer_export.cc:1989] Estimated count of arithmetic ops: 9.386 M  ops, equivalently 4.693 M  MACs



In [71]:
interpreter = tf.lite.Interpreter(model_content=tflite_model_gemm)
interpreter.allocate_tensors()

output = interpreter.get_output_details()[0]  
input = interpreter.get_input_details()[0]  


interpreter.set_tensor(input['index'], X_8)
interpreter.invoke()
y_lite_gemm = interpreter.get_tensor(output['index'])

y_pred_lite_gemm = np.argmax(y_lite_gemm, axis=1)

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [72]:
y_lite_gemm

array([[ 98,  94, 123,  92]], dtype=int8)

In [73]:
with open('../saved_models/random_forest/int32/model.tflite', 'wb') as f:
    f.write(tflite_model_gemm)

# Current stats
### GEMM model
name | value |
|:---------|:---------|
Input model| model.tflite 
Input size | 4.87MiB 
Output model | model_edgetpu.tflite 
Output size | 19.75MiB 
On-chip memory used for caching model parameters | 7.62MiB 
On-chip memory remaining for caching model parameters | 259.75KiB 
Off-chip memory used for streaming uncached model parameters | 9.74MiB 
Number of Edge TPU subgraphs | 1 
Total number of operations | 437 
Number of operations that will run on Edge TPU | 212 
Number of operations that will run on CPU | 225 
