# Performance prediction with x264

First, import the dataset.

In [2]:
import pandas as pd

# Read the x264 measurements from the CSV file next to the notebook.
dataset_path = "dataset_x264.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,H264,no_8x8dct,no_asm,no_cabac,no_deblock,no_fast_pskip,no_mbtree,no_mixed_refs,no_weightb,rc_lookahead,ref,time
0,True,True,False,False,True,True,False,True,True,20,9,3.444
1,True,True,False,True,False,True,False,False,True,40,9,4.744
2,True,True,False,False,True,False,True,True,False,40,1,2.427
3,True,True,False,True,False,True,True,True,False,40,9,3.447
4,True,False,False,False,True,False,False,True,False,60,5,2.957


The dataset contains a list of x264 configurations and, for each one, the measured time for encoding a video with the given configuration.

We split the dataset to have a training set and a test set.

In [4]:
# Features: every column except the measured time.
# The preview of `df` lists an "Unnamed: 0" column, which is the name pandas
# gives to an unnamed index column stored in a CSV. If it is present it is
# only a row number, so it is kept out of the features; errors="ignore"
# makes this a no-op when the column does not exist.
X = df.drop(columns=["time", "Unnamed: 0"], errors="ignore")
# Target: the measured encoding time.
y = df["time"]

from sklearn.model_selection import train_test_split

# Fraction of the rows held out for testing; the rest is used for training.
test_size=0.9
# A fixed seed makes the split (and every score computed from it) identical
# on each "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0
)

Here, instead of a classifier, we will use a regressor.

In [5]:
from sklearn import tree

# Regression tree whose depth is capped by max_depth.
TREE_MAX_DEPTH = 4
reg = tree.DecisionTreeRegressor(max_depth=TREE_MAX_DEPTH)

Train it.

In [6]:
# Fit the regressor on the training split. fit() returns the estimator
# itself, so its repr is what the cell displays.
reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

And test it.

Mean absolute error (MAE) gives a mean of the distance between the predicted value and the true value.

In [8]:
from sklearn.metrics import mean_absolute_error

# Predict the held-out rows, then average the absolute errors.
mae_predictions = reg.predict(X_test)
mean_absolute_error(y_test, mae_predictions)

0.13836838505970617

Mean squared error (MSE) gives the mean of the squared distance between the predicted value and the true value. Because the errors are squared, it penalizes predictions that are far from the truth more heavily than MAE does.

In [9]:
from sklearn.metrics import mean_squared_error

# Predict the held-out rows, then average the squared errors.
mse_predictions = reg.predict(X_test)
mean_squared_error(y_test, mse_predictions)

0.030733518553590182

Mean absolute percentage error (MAPE) is a normalized MAE.

In [10]:
import numpy as np
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Both inputs are converted to float NumPy arrays first, so plain lists
    are accepted and two pandas Series are compared position by position
    instead of being aligned on their index labels.

    Parameters
    ----------
    y_true : array-like
        Measured values. A zero entry makes its term infinite or NaN
        (division by zero), and NaN entries propagate to the result.
    y_pred : array-like
        Predicted values, with the same length as ``y_true``.

    Returns
    -------
    float
        The error as a percentage (``4.4`` means 4.4 %).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Predict the held-out rows, then report the error as a percentage.
mape_predictions = reg.predict(X_test)
mean_absolute_percentage_error(y_test, mape_predictions)

4.417077341278113

To interpret the model, we can print it.

In [11]:
import graphviz

def print_tree(clf, f_names, name):
    """Export a fitted tree to Graphviz DOT source and render it to a file.

    Parameters
    ----------
    clf : fitted scikit-learn decision tree estimator.
    f_names : feature labels shown in the nodes of the drawing.
    name : filename passed to graphviz for the rendered output.
    """
    # out_file=None makes export_graphviz return the DOT source as a string.
    export_options = dict(
        out_file=None,
        feature_names=f_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    dot_source = tree.export_graphviz(clf, **export_options)
    graphviz.Source(dot_source).render(name)
    
# Render the fitted regressor, labelling the nodes with the training columns.
print_tree(clf=reg, f_names=X_train.columns.values, name="tree")

We can also print the rules.

In [13]:
from sklearn.tree import _tree


def tree_to_rules(tree, feature_names):
    """Print one line per leaf of a fitted decision tree.

    Each line lists the split conditions met on the path from the root to
    the leaf, joined by " & ", followed by " ---> " and the value stored at
    that leaf.

    Parameters
    ----------
    tree : fitted scikit-learn decision tree (its ``tree_`` attribute is read).
    feature_names : sequence indexed by feature position, giving the label
        used for each feature in the printed conditions.
    """
    fitted = tree.tree_

    # Label of the feature tested at each node. Leaves have no feature and
    # get a placeholder that is never printed.
    node_labels = []
    for feature_idx in fitted.feature:
        if feature_idx == _tree.TREE_UNDEFINED:
            node_labels.append("undefined!")
        else:
            node_labels.append(feature_names[feature_idx])

    def walk(node_id, path):
        # Leaf: emit the accumulated conditions and the leaf's value.
        if fitted.feature[node_id] == _tree.TREE_UNDEFINED:
            print(" & ".join(path) + " ---> " + str(fitted.value[node_id][0]))
            return

        label = node_labels[node_id]
        cutoff = str(fitted.threshold[node_id])
        # Left branch is labelled "<= threshold", right branch "> threshold";
        # the left subtree is always printed first.
        walk(fitted.children_left[node_id], path + [label + " <= " + cutoff])
        walk(fitted.children_right[node_id], path + [label + " > " + cutoff])

    walk(0, [])
    
# Print the decision rules of the fitted regressor, one line per leaf.
tree_to_rules(tree=reg, feature_names=X_train.columns)

ref <= 7.0 & ref <= 3.0 & no_8x8dct <= 0.5 & no_mbtree <= 0.5 ---> [2.5554]
ref <= 7.0 & ref <= 3.0 & no_8x8dct <= 0.5 & no_mbtree > 0.5 ---> [2.78190909]
ref <= 7.0 & ref <= 3.0 & no_8x8dct > 0.5 & rc_lookahead <= 30.0 ---> [2.05833333]
ref <= 7.0 & ref <= 3.0 & no_8x8dct > 0.5 & rc_lookahead > 30.0 ---> [2.32676923]
ref <= 7.0 & ref > 3.0 & no_mixed_refs <= 0.5 & no_mbtree <= 0.5 ---> [3.6605]
ref <= 7.0 & ref > 3.0 & no_mixed_refs <= 0.5 & no_mbtree > 0.5 ---> [4.21611111]
ref <= 7.0 & ref > 3.0 & no_mixed_refs > 0.5 & no_mbtree <= 0.5 ---> [2.68191667]
ref <= 7.0 & ref > 3.0 & no_mixed_refs > 0.5 & no_mbtree > 0.5 ---> [3.1116]
ref > 7.0 & no_mixed_refs <= 0.5 & no_mbtree <= 0.5 & no_8x8dct <= 0.5 ---> [5.12588889]
ref > 7.0 & no_mixed_refs <= 0.5 & no_mbtree <= 0.5 & no_8x8dct > 0.5 ---> [4.76]
ref > 7.0 & no_mixed_refs <= 0.5 & no_mbtree > 0.5 & no_8x8dct <= 0.5 ---> [5.8686]
ref > 7.0 & no_mixed_refs <= 0.5 & no_mbtree > 0.5 & no_8x8dct > 0.5 ---> [5.3654]
ref > 7.0 & no_mixed_r

In [15]:

#Parameters
# Fraction of the rows held out for testing; the rest is used for training.
test_size=0.9

#Decision tree regressor parameters
#More details here : https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
# "criterion" and "presort" are deliberately not listed, so both keep their
# library defaults:
#   - the default criterion is the mean squared error, but its name changed
#     from "mse" to "squared_error" in scikit-learn 1.0 and the old spelling
#     was removed in 1.2;
#   - "presort" was deprecated in 0.22 and removed in 0.24.
# Omitting them gives the same model on old releases and keeps the cell
# runnable on current ones.
hyperparams =  {
    "splitter":"best",
    "max_depth":None,
    "min_samples_split":2,
    "min_samples_leaf":1,
    "min_weight_fraction_leaf":0.,
    "max_features":None,
    "random_state":None,
    "max_leaf_nodes":None,
    "min_impurity_decrease":1e-7,
}

# A fixed seed keeps the split identical between runs, so a change in the
# score comes from the hyperparameters and not from a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0
)
reg = tree.DecisionTreeRegressor(**hyperparams)
reg.fit(X_train, y_train)
# Score on the held-out rows, as a percentage error.
mean_absolute_percentage_error(y_test, reg.predict(X_test))

2.4268789769569086