In [1]:
import warnings
warnings.filterwarnings('ignore')

import h5py

train_file_name = '../data/draper/raw/VDISC_train.hdf5'
train_file = h5py.File(train_file_name)

import clang.cindex
import clang.enumerations
import csv
import numpy as np
import os
# set the config
clang.cindex.Config.set_library_path("/usr/lib/x86_64-linux-gnu")
clang.cindex.Config.set_library_file('/usr/lib/x86_64-linux-gnu/libclang-6.0.so.1')



In [2]:
import re 

class Tokenizer:
    # creates the object, does the inital parse
    def __init__(self, path, tokenizer_type='original'):
        self.index = clang.cindex.Index.create()
        self.tu = self.index.parse(path)
        self.path = self.extract_path(path)
        self.symbol_table = {}
        self.symbol_count = 1
        self.tokenizer_type = tokenizer_type

    # To output for split_functions, must have same path up to last two folders
    def extract_path(self, path):
        return "".join(path.split("/")[:-2])

    
    def full_tokenize_cursor(self, cursor):
        tokens = cursor.get_tokens()
        result = []
        for token in tokens:
            if token.kind.name == "COMMENT":
                continue
            if token.kind.name == "LITERAL":
                result += self.process_literal(token)
                continue
            if token.kind.name == "IDENTIFIER":
                result += ["ID"]
                continue
            result += [token.spelling]
        return result

    def full_tokenize(self):
        cursor = self.tu.cursor
        return self.full_tokenize_cursor(cursor)

    def process_literal(self, literal):
        cursor_kind = clang.cindex.CursorKind
        kind = literal.cursor.kind
        if kind == cursor_kind.INTEGER_LITERAL:
            return literal.spelling
        if kind == cursor_kind.FLOATING_LITERAL:
            return literal.spelling
        if kind == cursor_kind.IMAGINARY_LITERAL:
            return ["NUM"]       
        if kind == cursor_kind.STRING_LITERAL:
            return ["STRING"]
        sp = literal.spelling
        if re.match('[0-9]+', sp) is not None:
            return sp
        return ["LITERAL"]

    def split_functions(self, method_only):
        results = []
        cursor_kind = clang.cindex.CursorKind
        cursor = self.tu.cursor
        for c in cursor.get_children():
            filename = c.location.file.name if c.location.file != None else "NONE"
            extracted_path = self.extract_path(filename)

            if (c.kind == cursor_kind.CXX_METHOD or (method_only == False and c.kind == cursor_kind.FUNCTION_DECL)) and extracted_path == self.path:
                name = c.spelling
                tokens = self.full_tokenize_cursor(c)
                filename = filename.split("/")[-1]
                results += [tokens]

        return results
    

def tokenize(file_text):
    try:
        c_file = open('/tmp/test1.c', 'w')
        c_file.write(file_text)
        c_file.close()
        tok = Tokenizer('/tmp/test1.c')
        results = tok.split_functions(False)
        return ' '.join(results[0])
    except:
        return None

In [3]:
list(train_file)

['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']

In [4]:
num_vul = 0
num_non_vul = 0
vul_indices = []

for idx, (a, b, c, d, e) in  enumerate(zip(
    train_file['CWE-119'], train_file['CWE-120'], train_file['CWE-469'], 
    train_file['CWE-476'], train_file['CWE-other']
)):
    if a or b or c or d or e:
        num_vul += 1
        vul_indices.append(idx)
    else:
        num_non_vul += 1

print(num_vul, num_non_vul, len(vul_indices))

65904 953567 65904


In [5]:
print(tokenize("int main(){\n\tint *a = new int[10];\n\treturn 50;\n}\n"))
ratio = 65907 / float(953567)
print(ratio)

int ID ( ) { int * ID = ID int [ 1 0 ] ; return 5 0 ; }
0.06911627604562658


In [6]:
sources = []
v, nv = 0, 0
import numpy as np

for idx, func in enumerate(train_file['functionSource']):
    if idx % 10000 == 0:
        print(idx, v, nv)
    if idx in vul_indices:
        tokenized = tokenize(func.strip())
        if tokenize is None:
            continue
        sources.append({'code': func.strip(), 'label': 1, 'tokenized': tokenized})
        v += 1
    else:
        r = np.random.uniform()
        if r <= 1.00:
            tokenized = tokenize(func.strip())
            if tokenize is None:
                continue
            sources.append({'code': func.strip(), 'label': 0, 'tokenized': tokenized})
            nv += 1



0 0 0
10000 630 9370
20000 1310 18690
30000 1955 28045
40000 2641 37359
50000 3245 46755
60000 3890 56110
70000 4531 65469
80000 5142 74858
90000 5765 84235
100000 6422 93578
110000 7074 102926
120000 7718 112282
130000 8403 121597
140000 9048 130952
150000 9700 140300
160000 10345 149655
170000 11001 158999
180000 11627 168373
190000 12237 177763
200000 12866 187134
210000 13510 196490
220000 14180 205820
230000 14836 215164
240000 15515 224485
250000 16124 233876
260000 16785 243215
270000 17454 252546
280000 18137 261863
290000 18781 271219
300000 19458 280542
310000 20087 289913
320000 20733 299267
330000 21409 308591
340000 22067 317933
350000 22749 327251
360000 23398 336602
370000 24070 345930
380000 24698 355302
390000 25344 364656
400000 25966 374034
410000 26581 383419
420000 27205 392795
430000 27885 402115
440000 28529 411471
450000 29179 420821
460000 29819 430181
470000 30467 439533
480000 31071 448929
490000 31669 458331
500000 32327 467673
510000 32927 477073
520000 335

In [7]:
len(sources)
import json


In [8]:
train_file_name = open('../data/draper/train_full.json', 'w')
json.dump(sources, train_file_name)
train_file_name.close()
print(sources[0])

{'code': 'clear_area(int startx, int starty, int xsize, int ysize)\n{\n  int x;\n\n  TRACE_LOG("Clearing area %d,%d / %d,%d\\n", startx, starty, xsize, ysize);\n\n  while (ysize > 0)\n  {\n    x = xsize;\n    while (x > 0)\n    {\n      mvaddch(starty + ysize - 2, startx + x - 2, \' \');\n      x--;\n    }\n    ysize--;\n  }\n}', 'label': 0, 'tokenized': 'ID ( int ID , int ID , int ID , int ID ) { int ID ; ID ( STRING , ID , ID , ID , ID ) ; while ( ID > 0 ) { ID = ID ; while ( ID > 0 ) { ID ( ID + ID - 2 , ID + ID - 2 , LITERAL ) ; ID -- ; } ID -- ; } }'}


In [16]:
def get_all(file_path):
    _file = h5py.File(file_path)
    v = 0
    nv = 0
    sources = []
    for idx, (a, b, c, d, e, f) in  enumerate(zip(
        _file['CWE-119'], _file['CWE-120'], _file['CWE-469'], 
        _file['CWE-476'], _file['CWE-other'], _file['functionSource']
    )):
        if idx % 10000 == 0:
            print(idx)
        tokenized = tokenize(f)
        if tokenized == None:
            continue
        if a or b or c or d or e:
            sources.append({
                'code': f.strip(),
                'label': 1,
                'tokenized': tokenized
            })
            v += 1
        else:
            sources.append({
                'code': f.strip(),
                'label': 0,
                'tokenized': tokenized
            })
            nv += 1
    return sources, v, nv


In [20]:
valid_file_name = '../data/draper/VDISC_validate.hdf5'
valid_data, v, nv = get_all(valid_file_name)
print(v, nv, len(valid_data), valid_data[0])

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
8109 116583 124692 {'code': 'gwy_resource_class_mkdir(GwyResourceClass *klass)\n{\n    gchar *path;\n    gint ok;\n\n    g_return_val_if_fail(GWY_IS_RESOURCE_CLASS(klass), FALSE);\n\n    path = g_build_filename(gwy_get_user_dir(), klass->name, NULL);\n    if (g_file_test(path, G_FILE_TEST_IS_DIR)) {\n        g_free(path);\n        return TRUE;\n    }\n\n    ok = !g_mkdir(path, 0700);\n    g_free(path);\n\n    return ok;\n}', 'label': 0, 'tokenized': 'ID ( ID * ID ) { ID * ID ; ID ID ; ID ( ID ( ID ) , ID ) ; ID = ID ( ID ( ) , ID -> ID , ID ) ; if ( ID ( ID , ID ) ) { ID ( ID ) ; return ID ; } ID = ! ID ( ID , 0 7 0 0 ) ; ID ( ID ) ; return ID ; }'}


In [21]:
json_file_name = open('../data/draper/valid.json', 'w')

json.dump(valid_data, json_file_name)
json_file_name.close()

In [22]:
test_file_name = '../data/draper/VDISC_test.hdf5'
test_data, v, nv = get_all(test_file_name)
print(v, nv, len(test_data))
json_file_name = open('../data/draper/test.json', 'w')

json.dump(test_data, json_file_name)
json_file_name.close()

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
8127 116497 124624
