In [1]:
import os

# Move the working directory up one level so relative paths resolve from the
# parent directory. `os.chdir("..")` is relative to the *current* directory,
# so an unguarded call climbs one more level every time this cell is re-run.
# The guard records the starting directory once per kernel session, which
# makes the cell safe to execute repeatedly.
if "_INITIAL_CWD" not in globals():
    _INITIAL_CWD = os.getcwd()
    os.chdir("..")

from pprint import pprint
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from tuxai.dataset import Dataset, Columns
from tuxai.misc import config_logger
from tuxai.report import model_metrics, Report, FeatureImportanceReport

# Project logging helper (imported from tuxai.misc); called once up front.
config_logger()

# Raise pandas' display limits: up to 300 rows per frame and up to
# 100 characters per cell before truncation.
pd.set_option("display.max_rows", 300)
pd.set_option("display.max_colwidth", 100)


In [2]:
from tuxai.misc import cache

# Reuse the FeatureImportanceReport stored in the cache under `key` when it
# exists; otherwise build a new one and store it for later runs.
c = cache()
key = "fir_2022_12_21"
# To discard the cached report and force a rebuild, uncomment the next line
# and re-run this cell:
# del c[key]
if key not in c:
    fir = FeatureImportanceReport()
    c[key] = fir
else:
    fir = c[key]


## Which options are important, regardless of version? (rank always < 30)

## 1 - no compression, with collinearity

In [3]:
# Target: "vmlinux" (the "no compression" case), rank threshold 30,
# collinearity enabled.
fir.options_always_importants(
    target="vmlinux",
    rank=30,
    collinearity=True,
)

  0%|          | 0/7242 [00:00<?, ?it/s]

Unnamed: 0,options,4.13,4.15,4.20,5.00,5.04,5.07,5.08
27,nb_yes,0.0,1.0,1.0,0.0,0.0,0.0,0.0
21,"RANDOMIZE_BASE, X86_NEED_RELOCS",,0.0,0.0,1.0,1.0,1.0,1.0
20,RANDOMIZE_BASE,1.0,,,,,,
3,DEBUG_INFO,2.0,2.0,3.0,2.0,3.0,3.0,2.0
1,"AMD_MEM_ENCRYPT, DMA_COHERENT_POOL, ARCH_HAS_FORCE_DMA_UNENCRYPTED, DYNAMIC_PHYSICAL_MASK",,,,,,,3.0
15,"KASAN_OUTLINE, KASAN",4.0,3.0,2.0,,,,
14,"KASAN_GENERIC, KASAN_OUTLINE, KASAN",,,,3.0,2.0,4.0,
0,"AMD_MEM_ENCRYPT, ARCH_HAS_FORCE_DMA_UNENCRYPTED, DYNAMIC_PHYSICAL_MASK",,,,,5.0,2.0,
24,UBSAN_SANITIZE_ALL,3.0,4.0,4.0,4.0,4.0,8.0,8.0
7,DEBUG_INFO_REDUCED,5.0,5.0,5.0,5.0,6.0,5.0,5.0


## 2 - with compression, with collinearity

In [4]:

# Target: "BZIP2-vmlinux" (the "with compression" case), rank threshold 30,
# collinearity enabled.
fir.options_always_importants(
    target="BZIP2-vmlinux",
    rank=30,
    collinearity=True,
)


  0%|          | 0/7257 [00:00<?, ?it/s]

Unnamed: 0,options,4.13,4.15,4.20,5.00,5.04,5.07,5.08
25,nb_yes,0.0,1.0,0.0,1.0,0.0,0.0,0.0
11,"KASAN_OUTLINE, KASAN",1.0,0.0,1.0,,,,
10,"KASAN_GENERIC, KASAN_OUTLINE, KASAN",,,,0.0,1.0,1.0,
22,UBSAN_SANITIZE_ALL,2.0,3.0,3.0,2.0,3.0,2.0,1.0
7,GCOV_PROFILE_ALL,3.0,2.0,2.0,3.0,2.0,5.0,3.0
12,KCOV_INSTRUMENT_ALL,4.0,5.0,5.0,4.0,6.0,3.0,2.0
8,IKHEADERS,,,,,5.0,4.0,5.0
16,"RANDOMIZE_BASE, X86_NEED_RELOCS",,4.0,4.0,5.0,4.0,7.0,6.0
19,UBSAN_ALIGNMENT,5.0,6.0,6.0,6.0,7.0,6.0,4.0
15,RANDOMIZE_BASE,6.0,,,,,,


## 3 - with compression, without collinearity: collinear options are lost

In [5]:
# Same query as the collinear BZIP2 run, but with collinearity disabled.
fir.options_always_importants(
    target="BZIP2-vmlinux",
    rank=30,
    collinearity=False,
)

  0%|          | 0/6811 [00:00<?, ?it/s]

Unnamed: 0,options,4.13,4.15,4.20,5.00,5.04,5.07,5.08
18,nb_yes,0.0,1.0,0.0,1.0,0.0,0.0,0.0
7,KASAN,1.0,0.0,1.0,0.0,1.0,1.0,
15,UBSAN_SANITIZE_ALL,2.0,3.0,3.0,2.0,3.0,3.0,1.0
5,GCOV_PROFILE_ALL,3.0,2.0,2.0,3.0,2.0,5.0,5.0
6,IKHEADERS,,,,,5.0,2.0,4.0
8,KCOV_INSTRUMENT_ALL,6.0,5.0,5.0,5.0,6.0,4.0,2.0
10,RANDOMIZE_BASE,5.0,4.0,4.0,4.0,4.0,6.0,7.0
12,UBSAN_ALIGNMENT,4.0,6.0,6.0,6.0,7.0,7.0,3.0
9,PRINTK,7.0,7.0,8.0,7.0,9.0,10.0,8.0
0,BINARY_PRINTF,11.0,10.0,7.0,8.0,8.0,9.0,6.0


## Which options are NEVER important, regardless of version? (rank always > 300)

## 1 - no compression

In [6]:
# Target: "vmlinux", rank threshold 300, collinearity enabled.
# The result is kept in `df` because the next cell filters it.
df = fir.options_never_importants(
    target="vmlinux",
    rank=300,
    collinearity=True,
)
df

  0%|          | 0/7242 [00:00<?, ?it/s]

Unnamed: 0,options,4.13,4.15,4.20,5.00,5.04,5.07,5.08
1887,I2C_ROBOTFUZZ_OSIF,,,,,,2191,
1902,I2C_XLR,,,,,,2188,
1927,IDLE_INJECT,,,,,,2181,
1932,IEEE802154_AT86RF230_DEBUGFS,,,,2166,,2179,
2379,LCD_OTM3225A,,,,2171,,,
...,...,...,...,...,...,...,...,...
4634,SPEAKUP_SYNTH_DUMMY,,,,,,,304
4709,STRICT_DEVMEM,,,,,,303,
868,CRYPTO_AEGIS128L,,,302,,,,
930,CRYPTO_LIB_CHACHA,,,,,,,301


In [7]:
# Keep only the rows that have a value in every column, i.e. drop any row
# containing at least one NaN (these are dropna's defaults, spelled out).
df.dropna(axis=0, how="any")

Unnamed: 0,options,4.13,4.15,4.20,5.00,5.04,5.07,5.08
365,BEFS_FS,1096,433,1526,1567,468,486,1569


## 2 - with compression

In [8]:
# Target: "BZIP2-vmlinux", rank threshold 300, collinearity enabled.
# Note: this rebinds `df`, replacing the "vmlinux" result from the earlier cell.
df = fir.options_never_importants(
    target="BZIP2-vmlinux",
    rank=300,
    collinearity=True,
)
df

  0%|          | 0/7257 [00:00<?, ?it/s]

Unnamed: 0,options,4.13,4.15,4.20,5.00,5.04,5.07,5.08
2044,I2C_MLXCPLD,,,,,,2124,
2048,I2C_MUX_GPIO,,,,,,2123,
2055,I2C_MV64XXX,,,,,,2122,
2062,I2C_PNX,,,,,,2118,
2063,I2C_QCOM_GENI,,,,,,2117,
...,...,...,...,...,...,...,...,...
3694,PM_SLEEP_DEBUG,,,,,304,,
3763,PWM_FSL_FTM,,,,,303,,
1621,FB_TFT_S6D1121,,,303,,,,
1593,FB_RIVA_BACKLIGHT,,301,,,,,


In [9]:
# Keep only the rows that have a value in every column, i.e. drop any row
# containing at least one NaN (these are dropna's defaults, spelled out).
df.dropna(axis=0, how="any")

Unnamed: 0,options,4.13,4.15,4.20,5.00,5.04,5.07,5.08
2715,MAX11100,1383,364,438,431,1618,1696,1988


# Options that are not always important (they stop or start being important)

In [11]:
# Target: "vmlinux", collinearity enabled, with two rank thresholds:
# best_rank=30 and worst_rank=300.
fir.options_not_always_importants(
    target="vmlinux",
    best_rank=30,
    worst_rank=300,
    collinearity=True,
)

  0%|          | 0/7242 [00:00<?, ?it/s]

Unnamed: 0,options,4.13,4.15,4.20,5.00,5.04,5.07,5.08
3,"AMD_MEM_ENCRYPT, DMA_COHERENT_POOL, ARCH_HAS_FORCE_DMA_UNENCRYPTED, DYNAMIC_PHYSICAL_MASK",,,,,,,3.0
0,"AMD_MEM_ENCRYPT, ARCH_HAS_FORCE_DMA_UNENCRYPTED, DYNAMIC_PHYSICAL_MASK",,,,,5.0,2.0,
19,"SND_SOC_ALL_CODECS, SND_SOC_CS47L85, SND_SOC_CS47L92, SND_SOC_CS47L15, SND_SOC_CS47L90, SND_SOC_...",,,,,,,20.0
5,"BRANCH_PROFILE_NONE, FTRACE",,27.0,,,,,
10,EROFS_FS_ZIP,,,,,34.0,29.0,26.0
20,"SND_SOC_ALL_CODECS, SND_SOC_CS47L85, SND_SOC_CS47L92, SND_SOC_CS47L15, SND_SOC_CS47L90, SND_SOC_...",,,,,,31.0,
4,BRANCH_PROFILE_NONE,31.0,,,,,,
15,"SND_SOC_AD1980, SND_SOC_ALL_CODECS, SND_SOC_WM8782, SND_SOC_ADS117X, SND_SOC_L3, SND_SOC_ICS4343...",56.0,,,,,,
6,CONSTRUCTORS,90.0,76.0,61.0,30.0,118.0,45.0,
12,FTRACE,188.0,,,,,,
