###  [RQ1] Goal : What encoding should we use? Is it better to use tristate features or dummies?

#### I - Tristate features:

<i>a-] With the current encoding</i>

- 'no' = 0,
- 'yes' = 1
- 'module' = 2

<i>b-] Considering the module values like 'soft yes'</i>

- 'no' = 0,
- 'module' = 1
- 'yes' = 2

<i>c-] Considering the module values like 'soft no'</i>

- 'yes' = 0,
- 'module' = 1
- 'no' = 2

#### II - Dummies :

<i>a-] 'module' $<=>$ 'no'</i>

- 'yes' = 1
- 'no' or 'module' = 0

<i>b-] 'module' $<=>$ 'yes'</i>

- 'yes' or 'module' = 1
- 'no' = 0

## Results => best : II- a-] Dummy yes vs not yes

#### Tristate results: 'yes' values contain more information about kernel sizes

#### Dummies' results : if we have to replace the module values with either yes or no, it's definitely no

#### Import

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tuxml

#### Test function

Best hyperparameters found - best neural network architecture

90% of the dataset in the training set


In [2]:
def compute_net(features, batch_size = 50, nb_epochs = 30, nb_node_layer1 = 200, nb_node_layer2 = 100, lr1 = 0.5, lr2 = 0.025, lr_switch_epoch = 20):
    """Train a two-hidden-layer regression network and return its test MAPE.

    The first 65000 rows of ``features`` are split 90% / 10% into a training
    and a test set. The ``'vmlinux'`` column is the regression target; every
    other column is an input. The network (ReLU hidden layers, linear output
    without bias) is trained with Adam on the MAPE loss.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a ``'vmlinux'`` column; all columns must be convertible
        to float32.
    batch_size : int
        Number of rows per batch (training and evaluation).
    nb_epochs : int
        Total number of passes over the training batches.
    nb_node_layer1, nb_node_layer2 : int
        Width of the first and second hidden layers.
    lr1 : float
        Adam learning rate used for epochs ``0 .. lr_switch_epoch - 1``.
    lr2 : float
        Adam learning rate used from epoch ``lr_switch_epoch`` onwards.
    lr_switch_epoch : int
        Epoch index at which training switches from ``lr1`` to ``lr2``.

    Returns
    -------
    float
        Mean of the per-batch MAPE over the full test batches.

    Raises
    ------
    ValueError
        If the test split holds fewer rows than ``batch_size``.
    """
    # Only the first n rows of the dataset are used.
    n = 65000
    sizes = np.array(features[0:n]['vmlinux'])
    x_train, x_test, y_train, y_test = train_test_split(features.drop('vmlinux', axis=1)[0:n], sizes, test_size = 0.1)

    x_train = np.array(x_train, dtype = np.float32)
    x_test =  np.array(x_test, dtype = np.float32)

    y_train = np.array(y_train, dtype = np.float32)
    y_test = np.array(y_test, dtype = np.float32)

    nb_features = x_train.shape[1]

    # Outputs are reshaped to [batch_size] below, so only full batches can be
    # consumed. The training count is one fewer than the number of full
    # batches (kept as in the original experiments).
    nb_batch_train = int(len(x_train)/batch_size)-1
    nb_batch_test = int(len(x_test)/batch_size)
    if nb_batch_test < 1:
        # Fail before training instead of dividing by zero at the very end.
        raise ValueError("test set has fewer rows than batch_size; cannot evaluate a full batch")

    # Build every run in its own graph: this function is called repeatedly,
    # and adding ops to the default graph on each call would make it (and its
    # memory footprint) grow with every run.
    graph = tf.Graph()
    with graph.as_default():
        # slice the datasets => feed_dict was very slow, so an iterator is used instead
        dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size)
        iterator_train = tf.compat.v1.data.make_initializable_iterator(dataset_train)
        xtr, ytr = iterator_train.get_next()

        dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)
        iterator_test = tf.compat.v1.data.make_initializable_iterator(dataset_test)
        xte, yte = iterator_test.get_next()

        with tf.device("/gpu:0"):
            # Layers training
            w_h1_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_features, nb_node_layer1)), name = "w_h1_tr")
            mat_h1_tr = tf.matmul(xtr, w_h1_tr)
            b_h1_tr = tf.Variable(tf.zeros(nb_node_layer1), name="b_h1_tr")
            out_h1_tr = tf.nn.relu(tf.add(mat_h1_tr, b_h1_tr))

            w_h2_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer1, nb_node_layer2)), name = "w_h2_tr")
            mat_h2_tr = tf.matmul(out_h1_tr, w_h2_tr)
            b_h2_tr = tf.Variable(tf.zeros(nb_node_layer2), name="b_h2_tr")
            out_h2_tr = tf.nn.relu(tf.add(mat_h2_tr, b_h2_tr))

            w_final_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer2, 1)), name = "w_final_tr")
            outputs_tr = tf.reshape(tf.matmul(out_h2_tr, w_final_tr), shape=[batch_size])
            ytr = tf.reshape(ytr, [batch_size])

            # Layers test: same weights and biases, fed from the test iterator
            mat_h1_te = tf.matmul(xte, w_h1_tr)
            out_h1_te = tf.nn.relu(tf.add(mat_h1_te, b_h1_tr))

            mat_h2_te = tf.matmul(out_h1_te, w_h2_tr)
            out_h2_te = tf.nn.relu(tf.add(mat_h2_te, b_h2_tr))

            outputs_te =  tf.reshape(tf.matmul(out_h2_te, w_final_tr), shape=[batch_size])
            yte = tf.reshape(yte, [batch_size])

            # Cost => MAPE
            train_cost = tf.keras.losses.MAPE(ytr, outputs_tr)
            test_cost = tf.keras.losses.MAPE(yte, outputs_te)

            # Convergence function => AdamOptimizer
            train_step = tf.train.AdamOptimizer(learning_rate=lr1).minimize(train_cost)

            # train step with a lower learning rate => gain few % at the end
            tiny_train_step = tf.train.AdamOptimizer(learning_rate=lr2).minimize(train_cost)

            # op that initializes all variables of this graph
            init = tf.global_variables_initializer()

    with tf.Session(graph=graph) as sess:
        sess.run(init)
        for epoch in range(nb_epochs):
            # Large learning rate first, then the smaller one for fine-tuning.
            step = train_step if epoch < lr_switch_epoch else tiny_train_step
            sess.run(iterator_train.initializer)
            for _ in range(nb_batch_train):
                sess.run(step)

        # Evaluate on the full test batches only.
        mape = 0
        sess.run(iterator_test.initializer)
        for _ in range(nb_batch_test):
            mape += sess.run(test_cost)
    return mape/nb_batch_test

### I- Tristate features:

#### <i>a-] With the current encoding</i>

- 'no' = 0,
- 'yes' = 1
- 'module' = 2

In [3]:
# Names of the selected input features, plus the 'vmlinux' target column.
selection = pd.read_csv("feature_net.csv", names = ['feat'], skiprows = 1, index_col = 0)['feat']
selection[selection.size] = 'vmlinux'

# Encoding I-a: the dataset values are used exactly as loaded.
features = tuxml.load_dataset()[selection]

# Five independent trainings, averaged to smooth out the random split/initialization.
res_tristate = [compute_net(features) for _ in range(5)]

mape1a = np.mean(res_tristate)
print("MAPE I- a] :", mape1a)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
MAPE I- a] : 8.541988591414231


#### <i>b-] Considering the module values like 'soft yes'</i>

- 'no' = 0,
- 'module' = 1
- 'yes' = 2

In [4]:
# Names of the selected input features, plus the 'vmlinux' target column.
selection = pd.read_csv("feature_net.csv", names = ['feat'], skiprows = 1, index_col = 0)['feat']
selection[selection.size] = 'vmlinux'

# Encoding I-b: swap the codes 1 and 2 so that 'module' sits between 'no' and
# 'yes'; the value -1 is mapped to 0.
features = tuxml.load_dataset()[selection].replace(to_replace = [1, 2, -1], value = [2, 1, 0])

# Five independent trainings, averaged to smooth out the random split/initialization.
res_tristate2 = [compute_net(features) for _ in range(5)]

mape1b = np.mean(res_tristate2)
print("MAPE I- b] :", mape1b)

MAPE I- b] : 6.78992106290964


#### <i>c-] Considering the module values like 'soft no'</i>

- 'yes' = 0,
- 'module' = 1
- 'no' = 2

In [7]:
# Names of the selected input features, plus the 'vmlinux' target column.
selection = pd.read_csv("feature_net.csv", names = ['feat'], skiprows = 1, index_col = 0)['feat']
selection[selection.size] = 'vmlinux'

# Encoding I-c: 0 -> 2, 1 -> 0, 2 -> 1, so that 'module' sits between 'yes'
# and 'no'; the value -1 is mapped to 2 as well.
features = tuxml.load_dataset()[selection].replace(to_replace = [0, 1, 2, -1], value = [2, 0, 1, 2])

# Five independent trainings, averaged to smooth out the random split/initialization.
res_tristate3 = [compute_net(features) for _ in range(5)]

mape1c = np.mean(res_tristate3)
print("MAPE I- c] :", mape1c)

MAPE I- c] : 9.194793124565711


### II - Dummies :

####  <i>a-] 'module' $<=>$ 'no'</i>

- 'yes' = 1
- 'no' or 'module' = 0

In [5]:
# Names of the selected input features, plus the 'vmlinux' target column.
selection = pd.read_csv("feature_net.csv", names = ['feat'], skiprows = 1, index_col = 0)['feat']
selection[selection.size] = 'vmlinux'

# Encoding II-a: the values 2 ('module') and -1 both become 0, leaving a
# binary "yes vs. everything else" feature.
features = tuxml.load_dataset()[selection].replace(to_replace = [2, -1], value = 0)

# Five independent trainings, averaged to smooth out the random split/initialization.
res_dummy = [compute_net(features) for _ in range(5)]

mape2a = np.mean(res_dummy)
print("MAPE II- a] :", mape2a)

MAPE II- a] : 5.939016503554124


#### <i>b-] 'module' $<=>$ 'yes'</i>

- 'yes' or 'module' = 1
- 'no' = 0

In [6]:
# Load the dataset and keep the selected input features plus the 'vmlinux' target.
features = tuxml.load_dataset()
selection = pd.read_csv("feature_net.csv", index_col = 0, skiprows = 1, names = ['feat'])['feat']
selection[len(selection)] = 'vmlinux'

# Encoding II-b: the value 2 ('module') becomes 1 and -1 becomes 0, leaving a
# binary "no vs. everything else" feature.
features = features[selection].replace([2, -1], [1, 0])

# Five independent trainings, averaged to smooth out the random split/initialization.
res_dummy2 = []
for i in range(5):
    res_dummy2.append(compute_net(features))

mape2b = np.mean(res_dummy2)
# Label fixed: this cell evaluates encoding II-b, not II-a.
print("MAPE II- b] :", mape2b)

MAPE II- a] : 8.345303451831523
