## Goal : Should we keep nbyes/nbno in the list of features?

Keep in mind that this conclusion must also be checked with interpretable algorithms.

## Results =>  It does not change the results

### Maybe they look important in the random forest algorithms because their values are much larger than those of the other features?

### Try to feed the trees with proportions instead of counts?

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import tuxml

Simple comparison over 10 launches:
- with the nbyes/nbno features
- without the nbyes/nbno features

Conclusions:
- If the test MAPE is significantly lower with nbyes/nbno, then we have to keep them.
- If the test MAPE does not change with/without nbyes/nbno, they do not matter much (at least for neural networks).
- If the test MAPE is significantly lower without nbyes/nbno, then we have to drop them.

## With

In [3]:
# Load the dataset and keep only the selected feature columns plus the target.
features = tuxml.load_dataset()
selection = pd.read_csv("feature_net_with_nb.csv", index_col = 0, skiprows = 1, names = ['feat'])['feat']
# Append the target column by position. The previous `selection[len(selection)] = ...`
# wrote to an index *label*, which silently overwrites an existing feature whenever
# the CSV index is not exactly 0..n-1.
selection = pd.concat([selection, pd.Series(['vmlinux'], name = 'feat')], ignore_index = True)
# Map the values 2 and -1 to 0 in every selected column.
# NOTE(review): this also applies to the count columns and to 'vmlinux' -- confirm
# that 2 / -1 cannot legitimately occur there.
features = features[selection].replace([2,-1],0)
# Divisor used to turn the count features into proportions.
# NOTE(review): presumably the total number of options in the dataset -- TODO confirm.
# The name `nb_features` is rebound to the network input width in the training cell.
nb_features = 12611
to_normalize = ['nbyes', 'nbno', 'nbmodule', 'nbyesmodule']
# Only rescale the count columns that are actually part of the selection.
for feat in to_normalize:
    if feat in features.columns:
        features[feat] = features[feat]/nb_features

In [4]:
# Number of leading rows of the dataset used for this experiment.
n = 65500
# Fixed seed for the shuffle done by train_test_split. Without it the partition
# changes on every execution, so the reported MAPE cannot be reproduced and
# experiments run on the same rows are not evaluated on the same test set.
split_seed = 0

subset = features[0:n]
sizes = np.array(subset['vmlinux'])  # regression target
x_train, x_test, y_train, y_test = train_test_split(
    subset.drop('vmlinux', axis=1), sizes, test_size = 0.2, random_state = split_seed)

# Column count including the target; not used by the cells visible in this notebook.
nbCol = len(features.columns)

# TensorFlow consumes float32 arrays.
x_train = np.array(x_train, dtype = np.float32)
x_test =  np.array(x_test, dtype = np.float32)

y_train = np.array(y_train, dtype = np.float32)
y_test = np.array(y_test, dtype = np.float32)

In [7]:
# Hyper-parameters of the two-hidden-layer regression network.
nb_features = x_train.shape[1]  # input width = number of feature columns
batch_size = 100
nb_epochs = 40
# NOTE(review): the trailing -1 leaves one full batch out of every training pass;
# kept as is to preserve the training schedule -- confirm whether it is intentional.
nb_batch_train = int(len(x_train)/batch_size)-1
nb_batch_test = int(len(x_test)/batch_size)
nb_node_layer1 = 200
nb_node_layer2 = 100

# Mean test MAPE of each launch.
res = []

for launch in range(10):
    # Build every launch in its own graph. Adding to the default graph on each
    # iteration would accumulate the variables, the optimizers and the dataset
    # constants of all previous launches.
    graph = tf.Graph()
    with graph.as_default():
        # slice the datasets => feed_dict was very slow, so an iterator is used instead
        dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size)
        iterator_train = tf.compat.v1.data.make_initializable_iterator(dataset_train)
        xtr, ytr = iterator_train.get_next()

        dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)
        iterator_test = tf.compat.v1.data.make_initializable_iterator(dataset_test)
        xte, yte = iterator_test.get_next()

        with tf.device("/gpu:0"):
            # Training branch: input -> relu(200) -> relu(100) -> linear output (no bias)
            w_h1_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_features, nb_node_layer1)), name = "w_h1_tr")
            mat_h1_tr = tf.matmul(xtr, w_h1_tr)
            b_h1_tr = tf.Variable(tf.zeros(nb_node_layer1), name="b_h1_tr")
            out_h1_tr = tf.nn.relu(tf.add(mat_h1_tr, b_h1_tr))

            w_h2_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer1, nb_node_layer2)), name = "w_h2_tr")
            mat_h2_tr = tf.matmul(out_h1_tr, w_h2_tr)
            b_h2_tr = tf.Variable(tf.zeros(nb_node_layer2), name="b_h2_tr")
            out_h2_tr = tf.nn.relu(tf.add(mat_h2_tr, b_h2_tr))

            w_final_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer2, 1)), name = "w_final_tr")
            outputs_tr =  tf.reshape(tf.matmul(out_h2_tr, w_final_tr), shape=[batch_size])
            ytr = tf.reshape(ytr, [batch_size])

            # Test branch: same weights and biases, fed from the test iterator
            mat_h1_te = tf.matmul(xte, w_h1_tr)
            out_h1_te = tf.nn.relu(tf.add(mat_h1_te, b_h1_tr))

            mat_h2_te = tf.matmul(out_h1_te, w_h2_tr)
            out_h2_te = tf.nn.relu(tf.add(mat_h2_te, b_h2_tr))

            outputs_te =  tf.reshape(tf.matmul(out_h2_te, w_final_tr), shape=[batch_size])
            yte = tf.reshape(yte, [batch_size])

            # Cost => MAPE
            train_cost = tf.keras.losses.MAPE(ytr, outputs_tr)
            test_cost = tf.keras.losses.MAPE(yte, outputs_te)

            # Optimizer step used during the first 20 epochs
            train_step = tf.train.AdamOptimizer(learning_rate=1).minimize(train_cost)

            # train step with a lower learning rate => gain few % at the end
            tiny_train_step = tf.train.AdamOptimizer(learning_rate=0.01).minimize(train_cost)

            # op that initializes every variable of this launch's graph
            init = tf.global_variables_initializer()

    # Soft placement lets the ops pinned to /gpu:0 fall back to the CPU when no GPU
    # is available instead of failing.
    with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(init)
        for epoch in range(nb_epochs):
            # large learning rate for the first 20 epochs, small one afterwards
            step_op = train_step if epoch < 20 else tiny_train_step
            sess.run(iterator_train.initializer)
            for _ in range(nb_batch_train):
                sess.run(step_op)

        # Mean MAPE over the full test batches.
        mape = 0
        sess.run(iterator_test.initializer)
        for _ in range(nb_batch_test):
            mape+=sess.run(test_cost)
        res.append(mape/nb_batch_test)
print("With :", np.mean(res))

With : 6.229020699653916


## Without

In [8]:
# Load the dataset and keep only the selected feature columns plus the target.
features = tuxml.load_dataset()
selection = pd.read_csv("feature_net.csv", index_col = 0, skiprows = 1, names = ['feat'])['feat']
# Append the target column by position. The previous `selection[len(selection)] = ...`
# wrote to an index *label*, which silently overwrites an existing feature whenever
# the CSV index is not exactly 0..n-1.
selection = pd.concat([selection, pd.Series(['vmlinux'], name = 'feat')], ignore_index = True)
# Map the values 2 and -1 to 0 in every selected column.
# NOTE(review): this also applies to 'vmlinux' -- confirm that 2 / -1 cannot
# legitimately occur there.
features = features[selection].replace([2,-1],0)

In [9]:
# Number of leading rows of the dataset used for this experiment.
n = 65500
# Fixed seed for the shuffle done by train_test_split. Without it the partition
# changes on every execution, so the reported MAPE cannot be reproduced and
# experiments run on the same rows are not evaluated on the same test set.
split_seed = 0

subset = features[0:n]
sizes = np.array(subset['vmlinux'])  # regression target
x_train, x_test, y_train, y_test = train_test_split(
    subset.drop('vmlinux', axis=1), sizes, test_size = 0.2, random_state = split_seed)

# Column count including the target; not used by the cells visible in this notebook.
nbCol = len(features.columns)

# TensorFlow consumes float32 arrays.
x_train = np.array(x_train, dtype = np.float32)
x_test =  np.array(x_test, dtype = np.float32)

y_train = np.array(y_train, dtype = np.float32)
y_test = np.array(y_test, dtype = np.float32)

In [10]:
# Hyper-parameters of the two-hidden-layer regression network.
nb_features = x_train.shape[1]  # input width = number of feature columns
batch_size = 100
nb_epochs = 40
# NOTE(review): the trailing -1 leaves one full batch out of every training pass;
# kept as is to preserve the training schedule -- confirm whether it is intentional.
nb_batch_train = int(len(x_train)/batch_size)-1
nb_batch_test = int(len(x_test)/batch_size)
nb_node_layer1 = 200
nb_node_layer2 = 100

# Mean test MAPE of each launch.
res = []

for launch in range(10):
    # Build every launch in its own graph. Adding to the default graph on each
    # iteration would accumulate the variables, the optimizers and the dataset
    # constants of all previous launches.
    graph = tf.Graph()
    with graph.as_default():
        # slice the datasets => feed_dict was very slow, so an iterator is used instead
        dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size)
        iterator_train = tf.compat.v1.data.make_initializable_iterator(dataset_train)
        xtr, ytr = iterator_train.get_next()

        dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)
        iterator_test = tf.compat.v1.data.make_initializable_iterator(dataset_test)
        xte, yte = iterator_test.get_next()

        with tf.device("/gpu:0"):
            # Training branch: input -> relu(200) -> relu(100) -> linear output (no bias)
            w_h1_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_features, nb_node_layer1)), name = "w_h1_tr")
            mat_h1_tr = tf.matmul(xtr, w_h1_tr)
            b_h1_tr = tf.Variable(tf.zeros(nb_node_layer1), name="b_h1_tr")
            out_h1_tr = tf.nn.relu(tf.add(mat_h1_tr, b_h1_tr))

            w_h2_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer1, nb_node_layer2)), name = "w_h2_tr")
            mat_h2_tr = tf.matmul(out_h1_tr, w_h2_tr)
            b_h2_tr = tf.Variable(tf.zeros(nb_node_layer2), name="b_h2_tr")
            out_h2_tr = tf.nn.relu(tf.add(mat_h2_tr, b_h2_tr))

            w_final_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer2, 1)), name = "w_final_tr")
            outputs_tr =  tf.reshape(tf.matmul(out_h2_tr, w_final_tr), shape=[batch_size])
            ytr = tf.reshape(ytr, [batch_size])

            # Test branch: same weights and biases, fed from the test iterator
            mat_h1_te = tf.matmul(xte, w_h1_tr)
            out_h1_te = tf.nn.relu(tf.add(mat_h1_te, b_h1_tr))

            mat_h2_te = tf.matmul(out_h1_te, w_h2_tr)
            out_h2_te = tf.nn.relu(tf.add(mat_h2_te, b_h2_tr))

            outputs_te =  tf.reshape(tf.matmul(out_h2_te, w_final_tr), shape=[batch_size])
            yte = tf.reshape(yte, [batch_size])

            # Cost => MAPE
            train_cost = tf.keras.losses.MAPE(ytr, outputs_tr)
            test_cost = tf.keras.losses.MAPE(yte, outputs_te)

            # Optimizer step used during the first 20 epochs
            train_step = tf.train.AdamOptimizer(learning_rate=1).minimize(train_cost)

            # train step with a lower learning rate => gain few % at the end
            tiny_train_step = tf.train.AdamOptimizer(learning_rate=0.01).minimize(train_cost)

            # op that initializes every variable of this launch's graph
            init = tf.global_variables_initializer()

    # Soft placement lets the ops pinned to /gpu:0 fall back to the CPU when no GPU
    # is available instead of failing.
    with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(init)
        for epoch in range(nb_epochs):
            # large learning rate for the first 20 epochs, small one afterwards
            step_op = train_step if epoch < 20 else tiny_train_step
            sess.run(iterator_train.initializer)
            for _ in range(nb_batch_train):
                sess.run(step_op)

        # Mean MAPE over the full test batches.
        mape = 0
        sess.run(iterator_test.initializer)
        for _ in range(nb_batch_test):
            mape+=sess.run(test_cost)
        res.append(mape/nb_batch_test)
print("Without :", np.mean(res))

Without : 6.210023446847464
