In [1]:
import pandas as pd
import json

# Names of the option columns; each of them is read as int8 below.
with open("option_columns.json", "r") as f:
    option_columns = json.load(f)

# Load the dataset, then: keep rows with cid >= 30000, replace missing values
# with -1, and drop rows whose kernel_size is negative (which includes the
# ones that were just filled with -1).
df = (
    pd.read_csv(
        "dataset_encoded_size.csv",
        dtype=dict.fromkeys(option_columns, "int8"),
    )
    .query("cid >= 30000")
    .fillna(-1)
    .query("kernel_size >= 0")
)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import tree

def runML(hyperparams, with_tree=False):
    """Fit a decision-tree regressor on ``df`` and summarise its test error.

    The module-level ``df`` is split 90/10 into train and test sets. The
    ``cid`` and ``kernel_size`` columns are excluded from the features, and
    ``kernel_size`` is the regression target.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments forwarded unchanged to
        ``tree.DecisionTreeRegressor``. If it contains ``"random_state"``,
        that value also seeds the train/test split; if the key is absent the
        split is left unseeded instead of raising ``KeyError``.
    with_tree : bool, default False
        When true, the fitted regressor is returned alongside the summary.

    Returns
    -------
    pandas.Series or tuple
        ``describe()`` of the absolute percentage error on the test set, or
        ``(summary, fitted_regressor)`` when ``with_tree`` is true.
    """
    features = df.drop(columns=["cid", "kernel_size"])
    target = df["kernel_size"]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.1,
        # .get(): random_state is optional for the regressor, so it must be
        # optional here as well.
        random_state=hyperparams.get("random_state"),
    )
    reg = tree.DecisionTreeRegressor(**hyperparams)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    residual = y_pred - y_test
    dfErrors = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred,
        "error": residual.abs(),
        # NOTE: a target of exactly 0 yields an infinite percentage error.
        "% error": (residual / y_test).abs() * 100,
    })
    summary = dfErrors["% error"].describe()
    if with_tree:
        return summary, reg
    return summary

In [3]:

# Hyperparameters for the decision tree. `criterion` is deliberately left at
# its default (squared error): the old spelling "mse" was removed in
# scikit-learn 1.2 and the new spelling "squared_error" is unknown to older
# releases, whereas the default selects the same criterion on every version.
hyperparams = {
    "max_depth": 12,
    "min_samples_leaf": 5,
    "max_leaf_nodes": None,
    "random_state": 2,  # runML also uses this value to seed the train/test split
}
res, reg = runML(hyperparams, with_tree=True)

In [4]:
# Feature names: every column of df except the id and the target.
col = df.columns.drop(["cid", "kernel_size"])
# Pair each feature name with the importance the fitted tree assigned to it.
importanceSeries = pd.Series(data=reg.feature_importances_, index=col.values)
# Show only the features the tree actually used, most important first.
importanceSeries.loc[importanceSeries.gt(0)].sort_values(ascending=False)

DEBUG_INFO                  3.353856e-01
RANDOMIZE_BASE              1.302672e-01
DEBUG_INFO_REDUCED          1.130614e-01
DEBUG_INFO_SPLIT            8.640952e-02
STRICT_MODULE_RWX           5.291101e-02
BLK_MQ_PCI                  3.956938e-02
MODULES                     3.568542e-02
UBSAN_SANITIZE_ALL          3.452231e-02
X86_NEED_RELOCS             1.873495e-02
KASAN                       1.080636e-02
FB_SVGALIB                  1.022115e-02
MDIO_DEVICE                 8.304272e-03
UBSAN_ALIGNMENT             7.775686e-03
KASAN_OUTLINE               6.216344e-03
GCOV_PROFILE_ALL            5.991576e-03
ARCH_MIGHT_HAVE_ACPI_PDC    5.910603e-03
SWPHY                       4.789978e-03
INTEL_GTT                   3.788891e-03
DST_CACHE                   3.737057e-03
XFS_DEBUG                   2.934081e-03
MDIO                        2.770754e-03
DRM_TTM                     2.526473e-03
SCSI_ISCSI_ATTRS            2.247725e-03
TUN_VNET_CROSS_LE           2.022638e-03
GRO_CELLS       

In [5]:
# Persist the non-zero importances, largest first. `header` is passed
# explicitly because the default of Series.to_csv changed between pandas
# releases (False before 1.0, True from 1.0 on); relying on the default makes
# the layout of the written file depend on the installed version.
importanceSeries[importanceSeries > 0].sort_values(ascending=False).to_csv(
    "feature_importance.csv", header=False
)

  """Entry point for launching an IPython kernel.


In [6]:
from sklearn.model_selection import train_test_split
from sklearn import ensemble

def runML_RF(hyperparams, with_tree=False):
    """Fit a random-forest regressor on ``df`` and summarise its test error.

    The module-level ``df`` is split 90/10 into train and test sets. The
    ``cid`` and ``kernel_size`` columns are excluded from the features, and
    ``kernel_size`` is the regression target.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments forwarded unchanged to
        ``ensemble.RandomForestRegressor``. If it contains
        ``"random_state"``, that value also seeds the train/test split so the
        whole run is repeatable; if the key is absent the split stays
        unseeded, as before.
    with_tree : bool, default False
        When true, the fitted regressor is returned alongside the summary.

    Returns
    -------
    pandas.Series or tuple
        ``describe()`` of the absolute percentage error on the test set, or
        ``(summary, fitted_regressor)`` when ``with_tree`` is true.
    """
    features = df.drop(columns=["cid", "kernel_size"])
    target = df["kernel_size"]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.1,
        # Seed the split from the same key as the model; None (key missing)
        # keeps the previous unseeded behaviour.
        random_state=hyperparams.get("random_state"),
    )
    reg = ensemble.RandomForestRegressor(**hyperparams)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    residual = y_pred - y_test
    dfErrors = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred,
        "error": residual.abs(),
        # NOTE: a target of exactly 0 yields an infinite percentage error.
        "% error": (residual / y_test).abs() * 100,
    })
    summary = dfErrors["% error"].describe()
    if with_tree:
        return summary, reg
    return summary


In [7]:
# Hyperparameters for the random forest. random_state is forwarded to
# RandomForestRegressor through **hyperparams in runML_RF, so the forest's own
# randomness is seeded instead of changing on every run.
hyperparams = {
    "max_depth": 18,
    "n_estimators": 50,
    "random_state": 2,
}
resrf, regrf = runML_RF(hyperparams, with_tree=True)
resrf

count    9248.000000
mean       10.462155
std        10.240255
min         0.001271
25%         3.577800
50%         7.997394
75%        14.152988
max       160.135533
Name: % error, dtype: float64

In [8]:
# Pair each feature name (``col``, built in the decision-tree importance cell)
# with the importance the fitted forest assigned to it.
importanceRF = pd.Series(data=regrf.feature_importances_, index=col.values)
# Show only the features with a non-zero importance, most important first.
importanceRF.loc[importanceRF.gt(0)].sort_values(ascending=False)

DEBUG_INFO                  3.305341e-01
DEBUG_INFO_REDUCED          1.112772e-01
DEBUG_INFO_SPLIT            8.521497e-02
RANDOMIZE_BASE              7.445507e-02
X86_NEED_RELOCS             6.910525e-02
STRICT_MODULE_RWX           3.745045e-02
MODULES                     3.420450e-02
UBSAN_SANITIZE_ALL          3.346956e-02
BLK_MQ_PCI                  2.287366e-02
FB_SVGALIB                  1.149733e-02
KASAN_OUTLINE               7.961296e-03
DRM_NOUVEAU                 7.381284e-03
KASAN                       6.277975e-03
UBSAN_ALIGNMENT             5.919627e-03
MDIO                        5.367520e-03
GCOV_PROFILE_ALL            5.235003e-03
FB_DDC                      3.918975e-03
MDIO_THUNDER                3.812828e-03
DRM_TTM                     3.518324e-03
XFS_DEBUG                   2.854037e-03
DRM_AMDGPU                  2.677830e-03
SCSI_ISCSI_ATTRS            2.294381e-03
DRM_RADEON                  2.193272e-03
MII                         1.575236e-03
GRO_CELLS       