### Goal : Tests with GZIP compression

--

### Results => MAPE: less than 3% on train, 4% on test


In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tuxml

In [10]:
# Load the full feature table through the project's `tuxml` helper.
features = tuxml.load_dataset()

# Names of the columns to keep: the 'feat' column of feature_net.csv
# (first CSV column used as index, first row skipped).
selection = pd.read_csv("feature_net.csv", index_col = 0, skiprows = 1, names = ['feat'])['feat']

# Column holding the value to predict.
name_var = 'GZIP-vmlinux'

# Append the target by position. Writing to the label `len(selection)` would
# silently overwrite an existing entry whenever the index read from the CSV is
# not exactly 0..n-1, so the index is rebuilt while appending instead.
selection = pd.concat([selection, pd.Series([name_var], name='feat')], ignore_index=True)

# Keep only the selected columns and map the values 2 and -1 to 0.
# NOTE(review): the replacement is applied to the target column as well — confirm
# that this is intended.
features = features[selection].replace([2,-1],0)

In [11]:
# Number of leading rows of `features` used in this experiment.
n = 65000
# Fixed seed so that the train/test split (and therefore the reported scores)
# can be reproduced from one run to the next.
split_seed = 42

# Target values for the first n rows, and a 90% / 10% train/test split of the
# remaining columns.
sizes = np.array(features[0:n][name_var])
x_train, x_test, y_train, y_test = train_test_split(
    features.drop(name_var, axis=1)[0:n],
    sizes,
    test_size = 0.1,
    random_state = split_seed,
)

nbCol = len(features.columns)

# TensorFlow is fed float32 arrays.
x_train = np.array(x_train, dtype = np.float32)
x_test =  np.array(x_test, dtype = np.float32)

y_train = np.array(y_train, dtype = np.float32)
y_test = np.array(y_test, dtype = np.float32)

# Hyper-parameters.
nb_features = x_train.shape[1]
batch_size = 50
nb_epochs = 30
# One full batch is kept out of the per-epoch step count: after the training
# steps of an epoch, the loop evaluates the cost once more on the same
# iterator, which consumes one additional batch.
nb_batch_train = len(x_train) // batch_size - 1
nb_batch_test = len(x_test) // batch_size
nb_node_layer1 = 200
nb_node_layer2 = 300

# Feeding batches through feed_dict was very slow, so each split is served by
# an initializable tf.data iterator instead.
def _batched_iterator(inputs, targets):
    """Slice (inputs, targets) into batches of `batch_size`.

    Returns the batched dataset together with an initializable iterator over it.
    """
    batches = tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(batch_size)
    return batches, tf.compat.v1.data.make_initializable_iterator(batches)

# Training pipeline: xtr / ytr yield the next training batch on every run.
dataset_train, iterator_train = _batched_iterator(x_train, y_train)
xtr, ytr = iterator_train.get_next()

# Test pipeline: xte / yte yield the next test batch on every run.
dataset_test, iterator_test = _batched_iterator(x_test, y_test)
xte, yte = iterator_test.get_next()

# Graph definition: two ReLU hidden layers followed by a single linear output
# unit (no output bias). The same variables are applied to the training batch
# and to the test batch.
# NOTE(review): every op below is pinned to /gpu:0; on a machine without a GPU
# this presumably requires soft placement in the session config — confirm.
with tf.device("/gpu:0"):
    # Layers training
    # Hidden layer 1: nb_features -> nb_node_layer1, Glorot-uniform weights, zero bias.
    w_h1_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_features, nb_node_layer1)), name = "w_h1_tr")
    mat_h1_tr = tf.matmul(xtr, w_h1_tr)
    b_h1_tr = tf.Variable(tf.zeros(nb_node_layer1), name="b_h1_tr")
    out_h1_tr = tf.nn.relu(tf.add(mat_h1_tr, b_h1_tr))

    # Hidden layer 2: nb_node_layer1 -> nb_node_layer2.
    w_h2_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer1, nb_node_layer2)), name = "w_h2_tr")
    mat_h2_tr = tf.matmul(out_h1_tr, w_h2_tr)
    b_h2_tr = tf.Variable(tf.zeros(nb_node_layer2), name="b_h2_tr")
    out_h2_tr = tf.nn.relu(tf.add(mat_h2_tr, b_h2_tr))

    # Output layer: nb_node_layer2 -> 1, flattened to one prediction per row.
    # The reshape uses -1 instead of a hard-coded batch_size so that the graph
    # also accepts a batch that is not exactly batch_size rows long.
    w_final_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer2, 1)), name = "w_final_tr")
    outputs_tr =  tf.reshape(tf.matmul(out_h2_tr, w_final_tr), shape=[-1])
    ytr = tf.reshape(ytr, [-1])

    # Layers test
    # Same forward pass on the test batch, reusing the trained variables.
    mat_h1_te = tf.matmul(xte, w_h1_tr)
    out_h1_te = tf.nn.relu(tf.add(mat_h1_te, b_h1_tr))

    mat_h2_te = tf.matmul(out_h1_te, w_h2_tr)
    out_h2_te = tf.nn.relu(tf.add(mat_h2_te, b_h2_tr))

    outputs_te =  tf.reshape(tf.matmul(out_h2_te, w_final_tr), shape=[-1])
    yte = tf.reshape(yte, [-1])

    # Cost => MAPE (mean absolute percentage error) over the current batch
    train_cost = tf.keras.losses.MAPE(ytr, outputs_tr)
    test_cost = tf.keras.losses.MAPE(yte, outputs_te)

    # Convergence function => AdamOptimizer
    train_step = tf.train.AdamOptimizer(learning_rate=0.5).minimize(train_cost)

    # train step with a lower learning rate => gain few % at the end
    # (a second, independent Adam optimizer minimising the same cost)
    tiny_train_step = tf.train.AdamOptimizer(learning_rate=0.025).minimize(train_cost)

    # Op that initialises every variable created above (weights, biases and
    # the optimizers' internal variables); it has to be run once per session.
    init = tf.global_variables_initializer()

# Number of initial epochs trained with the high-learning-rate step; the
# remaining epochs use the low-learning-rate step.
nb_epochs_high_lr = 20

# The graph is pinned to /gpu:0. Soft placement lets TensorFlow fall back to
# an available device instead of raising when that device does not exist.
session_config = tf.ConfigProto(allow_soft_placement=True)

with tf.Session(config=session_config) as sess:
    sess.run(init)

    # Training: both phases share the same loop, only the step op differs.
    for j in range(nb_epochs):
        step = train_step if j < nb_epochs_high_lr else tiny_train_step
        # Rewind the training iterator at the start of every epoch.
        sess.run(iterator_train.initializer)
        for _ in range(nb_batch_train):
            sess.run(step)
        # Cost measured on the next batch of the iterator (a single batch).
        print("Cout entrainement epoch n°", j+1, ":",  sess.run(train_cost))

    # Final training cost: mean of the per-batch MAPE over nb_batch_train batches.
    sess.run(iterator_train.initializer)
    mape = 0
    for _ in range(nb_batch_train):
        mape += sess.run(train_cost)
    print("Cout entrainement final =", mape/nb_batch_train)

    # Final test cost: mean of the per-batch MAPE over nb_batch_test batches.
    sess.run(iterator_test.initializer)
    mape = 0
    for _ in range(nb_batch_test):
        mape += sess.run(test_cost)
    print("Cout test final =", mape/nb_batch_test)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Cout entrainement epoch n° 1 : 6.52307
Cout entrainement epoch n° 2 : 4.981721
Cout entrainement epoch n° 3 : 5.2815256
Cout entrainement epoch n° 4 : 5.232669
Cout entrainement epoch n° 5 : 5.479763
Cout entrainement epoch n° 6 : 5.5792484
Cout entrainement epoch n° 7 : 6.0072956
Cout entrainement epoch n° 8 : 4.9662614
Cout entrainement epoch n° 9 : 4.779847
Cout entrainement epoch n° 10 : 4.7741556
Cout entrainement epoch n° 11 : 4.8716993
Cout entrainement epoch n° 12 : 4.5085325
Cout entrainement epoch n° 13 : 4.2119102
Cout entrainement epoch n° 14 : 4.239505
Cout entrainement epoch n° 15 : 3.9789817
Cout entrainement epoch n° 16 : 3.9564903
Cout entrainement epoch n° 17 : 4.6349688
Cout entrainement epoch n° 18 : 4.7742043
Cout entrainement epoch n° 19 : 4.1841726
Cout entrainement epoch n° 20 : 4.5281725
Cout entrainement epoch n° 21 : 3.9010546
Cout entrainem