In [1]:
import h5py
import pandas as pd

In [2]:
cwes = set(open('datasets/combined-cwes.txt', 'r').read().strip().split('\n'))

In [None]:
cwes

In [4]:
vdisc_train_file = h5py.File('datasets/vdisc_train.hdf5')
vdisc_test_file = h5py.File('datasets/vdisc_test.hdf5')
vdisc_validate_file = h5py.File('datasets/vdisc_validate.hdf5')

In [5]:
def load_vdisc_to_pandas(vdisc_h45py):
    
    # Manually construct the data frame as a dict with key=<column name> and value=<list of
    # values for that column>.

    length = vdisc_h45py['functionSource'].size

    raw_dataframe = {
        'testcase_ID': ["vdisc_testcase_{}".format(index) for index in range(length)],
        'flaw_loc': [None]*length,
        'filename': [None]*length,
        'code': vdisc_h45py['functionSource'],
        'CWE-119': vdisc_h45py['CWE-119'],
        'CWE-120': vdisc_h45py['CWE-120'],
        'CWE-469': vdisc_h45py['CWE-469'],
        'CWE-476': vdisc_h45py['CWE-476'],
        'CWE-OTHERS': vdisc_h45py['CWE-other'],
    }

    empty_cwes = [
        cwe for cwe in cwes if cwe not in {
            'CWE-119',
            'CWE-120',
            'CWE-469',
            'CWE-476',
            'CWE-OTHERS',
        }
    ]

    for cwe in empty_cwes:
        raw_dataframe[cwe] = [False]*length


    return pd.DataFrame(data=raw_dataframe)

vdisc_train = load_vdisc_to_pandas(vdisc_train_file)
vdisc_test = load_vdisc_to_pandas(vdisc_test_file)
vdisc_validate = load_vdisc_to_pandas(vdisc_validate_file)

In [6]:
vdisc_train

Unnamed: 0,testcase_ID,flaw_loc,filename,code,CWE-119,CWE-120,CWE-469,CWE-476,CWE-OTHERS,CWE-259,...,CWE-843,CWE-534,CWE-114,CWE-535,CWE-226,CWE-247,CWE-366,CWE-482,CWE-832,CWE-675
0,vdisc_testcase_0,,,"b'clear_area(int startx, int starty, int xsize...",False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,vdisc_testcase_1,,,b'ReconstructDuList(Statement* head)\n{\n S...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,vdisc_testcase_2,,,b'free_speaker(void)\n{\n if(Lengths)\n ...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,vdisc_testcase_3,,,b'mlx4_register_device(struct mlx4_dev *dev)\n...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,vdisc_testcase_4,,,"b'Parse_Env_Var(void)\n{\n char *p = getenv(""...",True,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019466,vdisc_testcase_1019466,,,b'visitICmpInst(ICmpInst &I) {\n // We must c...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1019467,vdisc_testcase_1019467,,,b'path_node_delete_chain(path_node * head)\n{\...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1019468,vdisc_testcase_1019468,,,b'addrconf6_start (NMDevice *self)\n{\n\tNMDev...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1019469,vdisc_testcase_1019469,,,"b'Next(const SQObjectPtr &refpos, SQObjectPtr ...",False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
vdisc_train["CWE-119"].unique()

array([False,  True])

In [12]:
vdisc_train.to_csv('datasets/vdisc_train.csv')
vdisc_test.to_csv('datasets/vdisc_test.csv')
vdisc_validate.to_csv('datasets/vdisc_validate.csv')

In [14]:
df=pd.read_csv('datasets/vdisc_train.csv')
df

Unnamed: 0.1,Unnamed: 0,testcase_ID,flaw_loc,filename,code,CWE-119,CWE-120,CWE-469,CWE-476,CWE-OTHERS,...,CWE-843,CWE-534,CWE-114,CWE-535,CWE-226,CWE-247,CWE-366,CWE-482,CWE-832,CWE-675
0,0,vdisc_testcase_0,,,"b'clear_area(int startx, int starty, int xsize...",False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,vdisc_testcase_1,,,b'ReconstructDuList(Statement* head)\n{\n S...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,vdisc_testcase_2,,,b'free_speaker(void)\n{\n if(Lengths)\n ...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,vdisc_testcase_3,,,b'mlx4_register_device(struct mlx4_dev *dev)\n...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,vdisc_testcase_4,,,"b'Parse_Env_Var(void)\n{\n char *p = getenv(""...",True,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019466,1019466,vdisc_testcase_1019466,,,b'visitICmpInst(ICmpInst &I) {\n // We must c...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1019467,1019467,vdisc_testcase_1019467,,,b'path_node_delete_chain(path_node * head)\n{\...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1019468,1019468,vdisc_testcase_1019468,,,b'addrconf6_start (NMDevice *self)\n{\n\tNMDev...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1019469,1019469,vdisc_testcase_1019469,,,"b'Next(const SQObjectPtr &refpos, SQObjectPtr ...",False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
df["flaw_loc"].unique()

array([nan])