In [1]:
import tuxml 
import pandas as pd

df = tuxml.load_dataset_415()

# Columns holding size measurements; they are excluded from the
# "non tri-state" list built below.
size_methods = ["vmlinux", "GZIP-bzImage", "GZIP-vmlinux", "GZIP", "BZIP2-bzImage", 
                  "BZIP2-vmlinux", "BZIP2", "LZMA-bzImage", "LZMA-vmlinux", "LZMA", "XZ-bzImage", "XZ-vmlinux", "XZ", 
                  "LZO-bzImage", "LZO-vmlinux", "LZO", "LZ4-bzImage", "LZ4-vmlinux", "LZ4"]

# Aggregate count columns; also excluded from the "non tri-state" list.
derived_features = ['nbyes', 'nbyesmodule', 'nbno', 'nbmodule']
# Values a column may contain to be treated as tri-state
# (presumably an encoding of n / y / m -- TODO confirm against tuxml).
tri_state_values = [0, 1, 2]

ftuniques = []             # tri-state columns that take a single value in the whole dataset
freq_ymn_features = []     # not populated in this cell
non_tristate_options = []  # columns with values outside tri_state_values (sizes/derived excluded)

for col in df:
    if col == "cid":
        continue
    ft = df[col]
    # Compute the distinct values once per column; every test below reuses them.
    distinct_values = ft.unique()
    if all(x in tri_state_values for x in distinct_values):
        # eg always "y": a constant column carries no information
        if len(distinct_values) == 1:
            ftuniques.append(col)
    elif col not in size_methods and col not in derived_features:
        non_tristate_options.append(col)


In [2]:
# Number of columns flagged as non tri-state, and number of tri-state columns with a single value.
len(non_tristate_options), len(ftuniques)

(169, 3329)

In [3]:
# Remove the non tri-state options and the constant tri-state columns.
# Rebinding `df` (instead of inplace=True) leaves the originally loaded frame
# untouched, and errors="ignore" makes the cell safe to run more than once:
# columns that are already gone are simply skipped instead of raising KeyError.
df = df.drop(columns=non_tristate_options + ftuniques, errors="ignore")

In [4]:
# Current (rows, columns) of df.
df.shape

(39370, 9524)

In [5]:
# Preview of the first ten rows.
df.head(10)

Unnamed: 0,NETFILTER_XT_MATCH_CONNMARK,NET_EMATCH,TOUCHSCREEN_AD7877,REGULATOR_88PM8607,DVB_USB_CXUSB,CRYPTO_SHA512_MB,NETFILTER_XT_MATCH_CONNTRACK,LAPBETHER,TOUCHSCREEN_AD7879,REGULATOR_ACT8865,...,LZO-vmlinux,LZO,LZ4-bzImage,LZ4-vmlinux,LZ4,cid,nbyes,nbno,nbmodule,nbyesmodule
1,0,0,0,1,0,1,0,0,0,1,...,14874088,12650651,13766832,15853424,13618427,29787,1368,11534,3,1371
2,0,0,0,0,0,0,0,0,0,0,...,14332720,12029959,13612304,15701184,13389490,29788,2490,10403,3,2493
3,0,0,1,0,0,1,0,0,1,1,...,23888936,21666603,23593552,25679288,23445627,29789,1991,10899,2,1993
4,0,0,0,0,0,0,0,0,0,0,...,11638024,9415719,10482736,12568216,10336659,29790,1614,10222,1061,2675
5,0,0,0,0,0,0,0,2,0,0,...,14021896,11802122,12969008,15054488,12822792,29791,1289,10769,834,2123
6,0,0,0,0,0,1,0,0,0,1,...,16069672,13847364,15210064,17294776,15061852,29792,2528,10355,2,2530
7,1,1,0,0,0,1,1,0,0,0,...,18762424,16482568,18384560,20470856,18181298,29793,2957,9923,3,2960
8,0,0,0,0,0,1,0,0,0,1,...,17675656,15451988,16994896,19080976,16843409,29794,2754,10132,1,2755
9,0,0,0,0,0,0,0,0,0,1,...,18957792,16733667,18452656,20539248,18301538,29795,2523,10376,3,2526
10,0,0,0,0,0,0,0,0,0,0,...,6669696,4446491,5029968,7116560,4879237,29798,842,11433,625,1467


In [6]:
# Ten rows with the smallest "vmlinux" value, shown next to the option-count columns.
df.sort_values(by='vmlinux')[['vmlinux', 'nbno', 'nbyes', 'nbmodule', 'nbyesmodule']].head(10)

Unnamed: 0,vmlinux,nbno,nbyes,nbmodule,nbyesmodule
16341,11542064,12665,251,1,252
33523,12021584,12664,252,1,253
29706,16306416,11179,999,727,1726
38048,16417304,11471,841,599,1440
9051,16513912,11558,805,545,1350
4017,16613240,11823,1084,2,1086
23624,16650856,11217,892,801,1693
13820,16680656,11281,1031,596,1627
20925,16819208,11031,1098,770,1868
3703,16824376,11222,1120,558,1678


In [7]:
# df.to_pickle("all_size_withyes.pkl")

In [8]:
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np


def run_regressorML(reg, test_size, size_target="vmlinux"):
    """Fit `reg` to predict one size column and summarise its test error.

    The feature matrix is the global `df` without "cid" and without every
    column listed in `size_methods`; the target is `df[size_target]`.
    The split uses a fixed random_state of 42.

    Parameters
    ----------
    reg : estimator exposing `fit(X, y)` and `predict(X)`.
    test_size : forwarded to `train_test_split`.
    size_target : name of the column to predict; must be in `size_methods`.

    Returns
    -------
    pandas.Series
        `describe()` of the absolute percentage error on the test split.
    """
    assert size_target in size_methods
    features = df.drop(columns=["cid"]).drop(columns=size_methods)
    target = df[size_target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=42
    )
    reg.fit(X_train, y_train)
    # prediction minus truth, aligned on the index of y_test
    residuals = reg.predict(X_test) - y_test
    dfErrors = pd.DataFrame({
        "error": residuals.abs(),
        "% error": (residuals / y_test).abs() * 100,
    })
    return dfErrors["% error"].describe()


In [13]:

   
# Settings for the (currently disabled) decision-tree run below.
hyperparams_DT = {
    "criterion": "mse",
    "max_depth": 12,
    "min_samples_leaf": 5,
    "max_leaf_nodes": None,
}

#reg_dt = tree.DecisionTreeRegressor(**hyperparams_DT)
#res_dt = run_regressorML(reg_dt, test_size=0.9)
#res_dt

# Settings for the random forest.
hyperparams_RF = {
#    "max_depth": 18,
    "n_estimators": 50,
    "n_jobs" : -1,
    # Seed the forest as well: the train/test split is already seeded inside
    # run_regressorML, but without this the error statistics and the feature
    # importances derived from reg_rf change on every run.
    "random_state": 42,
}

# Train on 30% of the rows, evaluate on the remaining 70%.
reg_rf = ensemble.RandomForestRegressor(**hyperparams_RF)
res_rf = run_regressorML(reg_rf, test_size=0.7, size_target="vmlinux")
res_rf

#hyperparams_GB = {
#    'learning_rate': 0.1, 
   # 'max_depth': 5, 
#    'n_estimators': 300    
#}

#reg_gb = GradientBoostingRegressor()
#res_gb = run_regressorML(reg_gb, test_size=0.9, size_target="vmlinux")
#res_gb


# run_regressorML(ensemble.AdaBoostRegressor(), test_size=0.98, size_target="vmlinux")

count    27559.000000
mean        13.058028
std         14.407322
min          0.002017
25%          4.355138
50%          9.372619
75%         16.915725
max        319.576438
Name: % error, dtype: float64

In [14]:
def mk_ftimportances(reg, feature_names=None):
    """Return the non-zero feature importances of a fitted model, largest first.

    Parameters
    ----------
    reg : fitted estimator exposing `feature_importances_`.
    feature_names : optional sequence of labels, one per importance value and
        in the same order. When omitted, the labels are the columns of the
        global `df` without "cid" and without the `size_methods` columns,
        i.e. the same feature set `run_regressorML` trains on.

    Returns
    -------
    pandas.Series
        Importances strictly greater than zero, indexed by feature name and
        sorted in descending order.
    """
    if feature_names is None:
        feature_names = df.drop(columns=["cid"]).drop(columns=size_methods).columns.values
    ftimportance = pd.Series(reg.feature_importances_, index=list(feature_names))
    return ftimportance[ftimportance > 0].sort_values(ascending=False)

#ft_dt = mk_ftimportances(reg_dt)
#ft_dt.to_csv("feature_importanceDT.csv", header=True)
#ft_dt

# Non-zero importances of the fitted random forest, largest first.
ft_rf = mk_ftimportances(reg_rf)
# Persist the ranking next to the notebook (path is relative to the working directory).
ft_rf.to_csv("feature_importanceRF-415.csv", header=True)
ft_rf

#ft_gb = mk_ftimportances(reg_gb)
#ft_gb.to_csv("feature_importanceGB.csv", header=True)
#ft_gb

DEBUG_INFO                     2.516341e-01
nbyes                          1.910530e-01
DEBUG_INFO_REDUCED             1.260444e-01
DEBUG_INFO_SPLIT               1.092887e-01
RANDOMIZE_BASE                 7.849575e-02
X86_NEED_RELOCS                7.285283e-02
UBSAN_SANITIZE_ALL             3.536812e-02
KASAN                          7.826984e-03
KASAN_OUTLINE                  7.771823e-03
GCOV_PROFILE_ALL               6.372792e-03
nbno                           4.625143e-03
UBSAN_ALIGNMENT                4.092593e-03
EDAC_AMD64                     2.385859e-03
XFS_DEBUG                      1.954520e-03
nbyesmodule                    1.828178e-03
NFC_MEI_PHY                    1.314387e-03
DRM_NOUVEAU                    1.210749e-03
BLK_MQ_PCI                     8.486326e-04
NFC_HCI                        7.253134e-04
DRM_VBOXVIDEO                  6.929199e-04
XFS_FS                         6.740350e-04
FB_TFT_RA8875                  5.000435e-04
DRM_PANEL_SITRONIX_ST7789V     4

In [None]:
####### column indices of feature importances
# TODO: enhance!
# Kept disabled: the plotting call below needs an `X_train` that is only a
# local variable of run_regressorML, so it cannot run as-is.
if (False):
    col = df.drop(columns=["cid"]).drop(columns=size_methods).columns
    # Map every feature name to its column position in a single pass, then
    # translate the ranked feature names (ft_rf is ordered by importance).
    col_position = {name: pos for pos, name in enumerate(col.values)}
    ftIndiceImportanceGB = [col_position[ft_name] for ft_name in ft_rf.index if ft_name in col_position]
    ftIndiceImportanceGB

    # TODO: instead of plotting, retrieving the coefficients
    import matplotlib.pyplot as plt
    # TODO: pair-wise feature interactions
    ftsDep = ftIndiceImportanceGB[:20] # positions of the 20 most important features
    plt.figure(figsize=(20, 20))
    # TODO: X_train is an input! (it works only if X_train is accessible/visible)
    fig, ax = plot_partial_dependence(reg_rf, X_train, ftsDep, feature_names=col.values, grid_resolution=100)
    fig.set_figwidth(8)
    fig.set_figheight(15)
    fig.tight_layout()
    plt.show()