### Goal: select the set of features that gives the lowest test MAPE with neural networks

---

### Results => 


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tuxml

## [BEST] Original features (tree) => 6%

Could this be because the initial hyperparameters were tuned with these features?

At first, I would have bet on the random forest features.


In [4]:
# Start again from the complete dataset returned by the tuxml helper.
features = tuxml.load_dataset()

# Names of the columns to keep, taken from the 'feat' column of feature_net.csv.
selection = pd.read_csv(
    "feature_net.csv",
    names=['feat'],
    skiprows=1,
    index_col=0,
)['feat']

# 'vmlinux' is the column the test cell predicts, so it has to be kept as well.
selection[selection.size] = 'vmlinux'

# Restrict the dataset to those columns, then rewrite every 2 and -1 as 0.
features = features[selection].replace(to_replace=[2, -1], value=0)

## Random Forest => 9%
Tried the random forest importances -> bad results (best test MAPE around 9%).

In [16]:
# Load the dataset through the tuxml script.
features = tuxml.load_dataset()

# Random forest importances: the CSV is read as two columns, (feature name, importance).
df_selection = pd.read_csv('feature_importanceRF.csv', names = ['features', 'imp'], skiprows = 0)
selection = df_selection['features']
imp = df_selection['imp']

# Keep the most important features until their importances add up to at least
# alpha (a fraction of the total 'predictive power').
alpha = 0.98
sorted_imp = sorted(imp, reverse = True)
sum_imp = 0
i = 0

# The bound on i stops the loop when the list is exhausted before alpha is reached
# (the unbounded version raised an IndexError in that case).
while sum_imp < alpha and i < len(sorted_imp):
    sum_imp += sorted_imp[i]
    i += 1

# sorted_imp[i] is the largest importance that was NOT needed to reach alpha.
# When every feature was needed there is no such value, so keep everything.
threshold_imp = sorted_imp[i] if i < len(sorted_imp) else float('-inf')

# NOTE: the comparison is strict, so features tied with the threshold are dropped.
# concat + ignore_index appends 'vmlinux' (the target column) at a fresh label;
# writing final[len(final)] on the filtered Series could overwrite an existing
# feature, because the filtered Series keeps the original row labels.
final = pd.concat([selection[imp > threshold_imp], pd.Series(['vmlinux'])], ignore_index = True)

# selection (explicit copy so the normalization below writes to an independent frame)
features = features[final].copy()

# normalization of the nb yes/no/module counters
col = features.columns
# I assumed 12611 was (around) the number of real options we can change to compile the linux kernel => cf option_columns.json
nb_features = 12611
to_normalize = ['nbyes', 'nbno', 'nbmodule', 'nbyesmodule']
for feat in to_normalize:
    # only the counters that survived the selection are rescaled
    if feat in col:
        features[feat] = features[feat]/nb_features
# len(final) includes the 'vmlinux' target column
print(len(final), "features kept for", 100*alpha, "% of the predictive power")
#features.to_pickle("feature_rf_98.pkl")
# Rewrite every 2 and -1 as 0 (done after the normalization, as before).
features = features.replace([2,-1],0)

1248 features kept for 98.0 % of the predictive power


## features DT => 16%

In [33]:
# Start again from the complete dataset returned by the tuxml helper.
features = tuxml.load_dataset()

# Feature names from feature_importanceDT.csv (first line skipped, columns
# named 'features' and 'imp'); only the names are used.
selection = pd.read_csv(
    "feature_importanceDT.csv",
    names=['features', 'imp'],
    skiprows=1,
)['features']

# 'vmlinux' is the column the test cell predicts, so it has to be kept as well.
selection[selection.size] = 'vmlinux'

# Restrict the dataset to those columns, then rewrite every 2 and -1 as 0.
features = features[selection].replace(to_replace=[2, -1], value=0)

## features EN => 11%

I selected the 1200 most important features, to stay close to the number of features we used for the tree/RF.

In [38]:
# Start again from the complete dataset returned by the tuxml helper.
features = tuxml.load_dataset()

# Feature names from feature_importanceEN.csv (first line skipped, columns
# named 'features' and 'imp'); only the first 1200 names are kept.
selection = pd.read_csv(
    "feature_importanceEN.csv",
    names=['features', 'imp'],
    skiprows=1,
)['features'][:1200]

# 'vmlinux' is the column the test cell predicts, so it has to be kept as well.
selection[selection.size] = 'vmlinux'

# Restrict the dataset to those columns, then rewrite every 2 and -1 as 0.
features = features[selection].replace(to_replace=[2, -1], value=0)

## features GB => 12%

Impressive results for only 150 features + no major difference between train/test

In [40]:
# Start again from the complete dataset returned by the tuxml helper.
features = tuxml.load_dataset()

# Feature names from feature_importanceGB.csv (first line skipped, columns
# named 'features' and 'imp'); every listed name is kept.
selection = pd.read_csv(
    "feature_importanceGB.csv",
    names=['features', 'imp'],
    skiprows=1,
)['features']

# 'vmlinux' is the column the test cell predicts, so it has to be kept as well.
selection[selection.size] = 'vmlinux'

# Restrict the dataset to those columns, then rewrite every 2 and -1 as 0.
features = features[selection].replace(to_replace=[2, -1], value=0)

## features Lasso => 12%

In [43]:
# Start again from the complete dataset returned by the tuxml helper.
features = tuxml.load_dataset()

# Feature names from feature_importanceLasso.csv (first line skipped, columns
# named 'features' and 'imp'); only the first 1200 names are kept.
selection = pd.read_csv(
    "feature_importanceLasso.csv",
    names=['features', 'imp'],
    skiprows=1,
)['features'][:1200]

# 'vmlinux' is the column the test cell predicts, so it has to be kept as well.
selection[selection.size] = 'vmlinux'

# Restrict the dataset to those columns, then rewrite every 2 and -1 as 0.
features = features[selection].replace(to_replace=[2, -1], value=0)

## features LR => 26%

In [44]:
# Start again from the complete dataset returned by the tuxml helper.
features = tuxml.load_dataset()

# Feature names from feature_importanceLR.csv (first line skipped, columns
# named 'features' and 'imp'); only the first 1200 names are kept.
selection = pd.read_csv(
    "feature_importanceLR.csv",
    names=['features', 'imp'],
    skiprows=1,
)['features'][:1200]

# 'vmlinux' is the column the test cell predicts, so it has to be kept as well.
selection[selection.size] = 'vmlinux'

# Restrict the dataset to those columns, then rewrite every 2 and -1 as 0.
features = features[selection].replace(to_replace=[2, -1], value=0)

## features Ridge => 19%

Training MAPE is around 11-12%, so there is a large gap between train and test.

In [46]:
# Start again from the complete dataset returned by the tuxml helper.
features = tuxml.load_dataset()

# Feature names from feature_importanceRidge.csv (first line skipped, columns
# named 'features' and 'imp'); only the first 1200 names are kept.
selection = pd.read_csv(
    "feature_importanceRidge.csv",
    names=['features', 'imp'],
    skiprows=1,
)['features'][:1200]

# 'vmlinux' is the column the test cell predicts, so it has to be kept as well.
selection[selection.size] = 'vmlinux'

# Restrict the dataset to those columns, then rewrite every 2 and -1 as 0.
features = features[selection].replace(to_replace=[2, -1], value=0)

## Test cell

In [47]:
# Train a two-hidden-layer network on whichever `features` frame the last executed
# selection cell produced, then report the mean MAPE on the train and test splits.

# ---- Data preparation ----
# Only the first n rows of `features` are used.
n = 65000
# Regression target: the 'vmlinux' column.
sizes = np.array(features[0:n]['vmlinux'])
# Random 90% / 10% train/test split; every column except 'vmlinux' is an input.
x_train, x_test, y_train, y_test = train_test_split(features.drop('vmlinux', axis=1)[0:n], sizes, test_size = 0.1)

# Not read anywhere in this cell; kept in case another cell relies on it.
nbCol = len(features.columns)

x_train = np.array(x_train, dtype = np.float32)
x_test =  np.array(x_test, dtype = np.float32)

y_train = np.array(y_train, dtype = np.float32)
y_test = np.array(y_test, dtype = np.float32)

# ---- Hyper-parameters ----
nb_features = x_train.shape[1]
batch_size = 50
nb_epochs = 30
# Epochs trained with the large learning rate before switching to the small one.
nb_epochs_high_lr = 20
# One batch is held back per epoch: after the training steps, the printed cost
# pulls one more batch from the iterator.
nb_batch_train = int(len(x_train)/batch_size)-1
nb_batch_test = int(len(x_test)/batch_size)
nb_node_layer1 = 200
nb_node_layer2 = 100

# slice the datasets => feed_dict was very slow, so I choose an iterator solution
# drop_remainder=True: the graph below reshapes to a fixed [batch_size], so a
# trailing partial batch must never be produced.
dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size, drop_remainder=True)
iterator_train = tf.compat.v1.data.make_initializable_iterator(dataset_train)
xtr, ytr = iterator_train.get_next()

dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size, drop_remainder=True)
iterator_test = tf.compat.v1.data.make_initializable_iterator(dataset_test)
xte, yte = iterator_test.get_next()

with tf.device("/gpu:0"):
    # Layers training: input -> 200 ReLU units -> 100 ReLU units -> 1 linear output (no output bias)
    w_h1_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_features, nb_node_layer1)), name = "w_h1_tr")
    mat_h1_tr = tf.matmul(xtr, w_h1_tr)
    b_h1_tr = tf.Variable(tf.zeros(nb_node_layer1), name="b_h1_tr")
    out_h1_tr = tf.nn.relu(tf.add(mat_h1_tr, b_h1_tr))

    w_h2_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer1, nb_node_layer2)), name = "w_h2_tr")
    mat_h2_tr = tf.matmul(out_h1_tr, w_h2_tr)
    b_h2_tr = tf.Variable(tf.zeros(nb_node_layer2), name="b_h2_tr")
    out_h2_tr = tf.nn.relu(tf.add(mat_h2_tr, b_h2_tr))

    w_final_tr = tf.Variable(tf.glorot_uniform_initializer()((nb_node_layer2, 1)), name = "w_final_tr")
    outputs_tr =  tf.reshape(tf.matmul(out_h2_tr, w_final_tr), shape=[batch_size])
    ytr = tf.reshape(ytr, [batch_size])

    # Layers test: same computation on the test iterator, reusing the training variables
    mat_h1_te = tf.matmul(xte, w_h1_tr)
    out_h1_te = tf.nn.relu(tf.add(mat_h1_te, b_h1_tr))

    mat_h2_te = tf.matmul(out_h1_te, w_h2_tr)
    out_h2_te = tf.nn.relu(tf.add(mat_h2_te, b_h2_tr))

    outputs_te =  tf.reshape(tf.matmul(out_h2_te, w_final_tr), shape=[batch_size])
    yte = tf.reshape(yte, [batch_size])

    # Cost => MAPE (one value per batch)
    train_cost = tf.keras.losses.MAPE(ytr, outputs_tr)
    test_cost = tf.keras.losses.MAPE(yte, outputs_te)

    # Convergence function => AdamOptimizer
    train_step = tf.train.AdamOptimizer(learning_rate=1).minimize(train_cost)

    # train step with a lower learning rate => gain few % at the end
    tiny_train_step = tf.train.AdamOptimizer(learning_rate=0.1).minimize(train_cost)

    # op that initializes every variable created above (weights, biases, optimizer state)
    init = tf.global_variables_initializer()

# Soft placement lets TensorFlow fall back to another device when "/gpu:0" is not
# available, instead of failing at the first sess.run.
session_config = tf.ConfigProto(allow_soft_placement=True)

with tf.Session(config=session_config) as sess:
    sess.run(init)
    for j in range(nb_epochs):
        # large learning rate first, small one for the remaining epochs
        step = train_step if j < nb_epochs_high_lr else tiny_train_step
        # rewind the training iterator at the start of every epoch
        sess.run(iterator_train.initializer)
        for i in range(nb_batch_train):
            sess.run(step)
        # cost of the next (held-back) batch only, not an average over the epoch
        print("Cout entrainement epoch n°", j+1, ":",  sess.run(train_cost))

    # Final training cost: mean of the per-batch MAPE over nb_batch_train batches.
    sess.run(iterator_train.initializer)
    mape = 0
    for i in range(nb_batch_train):
        mape+=sess.run(train_cost)
    print("Cout entrainement final =", mape/nb_batch_train)

    # Final test cost: mean of the per-batch MAPE over nb_batch_test batches.
    sess.run(iterator_test.initializer)
    mape = 0
    for i in range(nb_batch_test):
        mape+=sess.run(test_cost)
    print("Cout test final =", mape/nb_batch_test)

Cout entrainement epoch n° 1 : 25.00118
Cout entrainement epoch n° 2 : 19.082533
Cout entrainement epoch n° 3 : 18.239597
Cout entrainement epoch n° 4 : 18.586014
Cout entrainement epoch n° 5 : 17.332645
Cout entrainement epoch n° 6 : 18.850737
Cout entrainement epoch n° 7 : 18.131718
Cout entrainement epoch n° 8 : 20.164968
Cout entrainement epoch n° 9 : 17.934628
Cout entrainement epoch n° 10 : 16.293335
Cout entrainement epoch n° 11 : 16.661058
Cout entrainement epoch n° 12 : 16.478672
Cout entrainement epoch n° 13 : 15.913879
Cout entrainement epoch n° 14 : 16.414085
Cout entrainement epoch n° 15 : 17.398159
Cout entrainement epoch n° 16 : 16.19433
Cout entrainement epoch n° 17 : 16.623188
Cout entrainement epoch n° 18 : 16.504028
Cout entrainement epoch n° 19 : 15.260929
Cout entrainement epoch n° 20 : 15.97814
Cout entrainement epoch n° 21 : 16.613354
Cout entrainement epoch n° 22 : 16.531647
Cout entrainement epoch n° 23 : 16.656668
Cout entrainement epoch n° 24 : 16.532677
Cout