In [3]:
import os
import clang
from clang.cindex import *
from copy import deepcopy
import time

In [4]:
Config.set_library_file("/home/dipu/anaconda3/lib/python3.9/site-packages/clang/native/libclang.so")

In [1]:
# create an index and parse the source code
index = Index.create()
root_cursor = index.parse("main.c").cursor

In [2]:
def print_ast(node, indent):
    try:
        print(" "*indent + node.spelling + " " + str(node.kind) + " " + str(node.type.spelling))
        for c in node.get_children():
            print_ast(c, indent+2)
    except ValueError:
        pass

print_ast(root_cursor, 0)

main.c CursorKind.TRANSLATION_UNIT 
  __u_char CursorKind.TYPEDEF_DECL __u_char
  __u_short CursorKind.TYPEDEF_DECL __u_short
  __u_int CursorKind.TYPEDEF_DECL __u_int
  __u_long CursorKind.TYPEDEF_DECL __u_long
  __int8_t CursorKind.TYPEDEF_DECL __int8_t
  __uint8_t CursorKind.TYPEDEF_DECL __uint8_t
  __int16_t CursorKind.TYPEDEF_DECL __int16_t
  __uint16_t CursorKind.TYPEDEF_DECL __uint16_t
  __int32_t CursorKind.TYPEDEF_DECL __int32_t
  __uint32_t CursorKind.TYPEDEF_DECL __uint32_t
  __int64_t CursorKind.TYPEDEF_DECL __int64_t
  __uint64_t CursorKind.TYPEDEF_DECL __uint64_t
  __int_least8_t CursorKind.TYPEDEF_DECL __int_least8_t
    __int8_t CursorKind.TYPE_REF __int8_t
  __uint_least8_t CursorKind.TYPEDEF_DECL __uint_least8_t
    __uint8_t CursorKind.TYPE_REF __uint8_t
  __int_least16_t CursorKind.TYPEDEF_DECL __int_least16_t
    __int16_t CursorKind.TYPE_REF __int16_t
  __uint_least16_t CursorKind.TYPEDEF_DECL __uint_least16_t
    __uint16_t CursorKind.TYPE_REF __uint16_t
  __int_

In [16]:
def get_function_params(root, function_name, result):
    
    for node in root.walk_preorder():
        try:
            """
            checking if AST node belongs to function declaration
            and its name is same as passed function_name
            """
            if node.kind == CursorKind.FUNCTION_DECL \
            and node.spelling == function_name:
                # loop through its children and only append details of parameter node
                for c in node.get_children():
                    if c.kind == CursorKind.PARM_DECL:
                        result.append({"name": c.spelling, 
                                       "data_type": c.type.spelling})
                return
        except ValueError as e:
            # print("Error:", e)
            pass

In [17]:
param_list = []
get_function_params(root_cursor, "getSum", param_list)
param_list

[{'name': 'n1', 'data_type': 'int'}, {'name': 'n2', 'data_type': 'int'}]

In [18]:
def get_called_functions(root, result):
    for node in root.walk_preorder():
        try:
            if node.kind == CursorKind.CALL_EXPR:
                # "location": node.extent
                current_function = {"name": node.spelling, "return_type": node.type.spelling, "args": []}

                for c in node.get_arguments():
                    current_arg = "".join([x.spelling for x in list(c.get_tokens())]) if len(list(c.get_tokens())) > 0 else c.spelling

                    if len(current_arg) >= 3 and current_arg.startswith('\"') and current_arg.endswith('\"'):
                        current_arg = '\"' + current_arg[1:-1].replace('\"', "\"\"") + '\"'

                    current_function["args"].append({"name": current_arg, "data_type": c.type.spelling, "cursor_kind": c.kind})
                    # current_function["args"].append({"name": c.spelling, "data_type": c.type.spelling, "cursor_kind": c.kind})
                    # print(node.location)

                current_param_list = []
                if len(current_function["args"]) == 2 and \
                    (current_function["args"][0]["data_type"] == current_function["args"][1]["data_type"]):
                    get_function_params(root, node.spelling, current_param_list)
                current_function["params"] = current_param_list

                result.append(current_function)

        except ValueError:
            pass

In [19]:
function_list = []
get_called_functions(root_cursor, function_list)
function_list

[{'name': 'getSum',
  'return_type': 'void',
  'args': [{'name': 'a',
    'data_type': 'int',
    'cursor_kind': CursorKind.UNEXPOSED_EXPR},
   {'name': '7',
    'data_type': 'int',
    'cursor_kind': CursorKind.INTEGER_LITERAL}],
  'params': [{'name': 'n1', 'data_type': 'int'},
   {'name': 'n2', 'data_type': 'int'}]},
 {'name': 'justDoThis',
  'return_type': 'int',
  'args': [{'name': 'a',
    'data_type': 'int',
    'cursor_kind': CursorKind.UNEXPOSED_EXPR},
   {'name': 'b',
    'data_type': 'int',
    'cursor_kind': CursorKind.UNEXPOSED_EXPR}],
  'params': []},
 {'name': 'okayGood',
  'return_type': 'int',
  'args': [{'name': 'x',
    'data_type': 'int',
    'cursor_kind': CursorKind.UNEXPOSED_EXPR},
   {'name': '500',
    'data_type': 'int',
    'cursor_kind': CursorKind.INTEGER_LITERAL}],
  'params': []},
 {'name': 'printf',
  'return_type': 'int',
  'args': [{'name': '"Sum is: %d"',
    'data_type': 'const char *',
    'cursor_kind': CursorKind.UNEXPOSED_EXPR},
   {'name': 'resul

In [11]:
for function in function_list:
        
        if len(function["args"]) == 2 and \
            (function["args"][0]["data_type"] == function["args"][1]["data_type"]) and \
            (function["args"][0]["name"] != function["args"][1]["name"]):
            
            positive_sample = [function["name"], function["args"][0]["name"], function["args"][1]["name"],
                              function["args"][0]["data_type"]]
            
            if(len(function["params"]) == 2):
                positive_sample.append(function["params"][0]["name"])
                positive_sample.append(function["params"][1]["name"])
            else:
                positive_sample.append("")
                positive_sample.append("")
            
            negative_sample = deepcopy(positive_sample)
            # swap
            negative_sample[1], negative_sample[2] = negative_sample[2], negative_sample[1]
            
            positive_sample.append(str(0))
            negative_sample.append(str(1))
            
            positive_sample = "\t".join(positive_sample)
            negative_sample = "\t".join(negative_sample)
            
            print(positive_sample)
            print(negative_sample)
            

getSum	a	7	int	n1	n2	0
getSum	7	a	int	n1	n2	1
justDoThis	a	b	int			0
justDoThis	b	a	int			1
okayGood	x	500	int			0
okayGood	500	x	int			1


---------
## Negative sample generation from all code snippets - Function args swap only
---------

In [12]:
def generate_function_args_swap_dataset(root_dir):
    total_files, total_samples = 0, 0
    global current_file
    
    with open("function_binary_args_swap_dataset.csv", 'a') as function_swap_samples:
        function_swap_samples.write("file\tfunction_name\targ1\targ2\targ_type\tparam1\tparam2\tlabels")
        
        for root, dirs, files in os.walk(root_dir):
                for file in files:
                    if file.endswith(".c"):
                        total_files += 1

                        file_path = os.path.join(root, file)
                        current_file = file_path
                            
                            with open(file_path, 'rb') as f:
                                
                                content = str(f.read())

                                if content.count("\\n") <= 10_000:
                                    
                                    try:
                                        start_cursor = index.parse(file_path).cursor

                                        function_list = []
                                        get_called_functions(start_cursor, function_list)

                                        for function in function_list:

                                            if len(function["args"]) == 2 and \
                                                (function["args"][0]["data_type"] == function["args"][1]["data_type"]) and \
                                                (function["args"][0]["name"] != function["args"][1]["name"]) and \
                                                (len(function["args"][0]["name"]) <= 100) and \
                                                (len(function["args"][1]["name"]) <= 100):

                                                positive_sample = [f.name.split("/AI/MinorProject/c-corpus/")[1], function["name"], function["args"][0]["name"], function["args"][1]["name"],
                                                                  function["args"][0]["data_type"]]

                                                if(len(function["params"]) == 2):
                                                    positive_sample.append(function["params"][0]["name"])
                                                    positive_sample.append(function["params"][1]["name"])
                                                else:
                                                    positive_sample.append("")
                                                    positive_sample.append("")

                                                negative_sample = deepcopy(positive_sample)
                                                # swap
                                                negative_sample[2], negative_sample[3] = negative_sample[3], negative_sample[2]

                                                positive_sample.append(str(0))
                                                negative_sample.append(str(1))

                                                positive_sample = "\t".join(positive_sample)
                                                negative_sample = "\t".join(negative_sample)

                                                function_swap_samples.write("\n" + positive_sample)
                                                function_swap_samples.write("\n" + negative_sample)
                                                
                                                total_samples += 1
                                                
                                    except:
                                        print("---Error occurred---")

                        current_file = file_path

                        if total_files % 1000 == 0:
                            print("Total files:", total_files, ",", "Total samples:", total_samples)

In [None]:
# def generate_function_args_swap_dataset(root_dir):
#     total_files, total_samples = 0, 0
#     global current_file
    
#     with open("function_binary_args_swap_dataset.csv", 'a') as function_swap_samples:
#         function_swap_samples.write("file\tfunction_name\targ1\targ2\targ_type\tparam1\tparam2\tlabels")
        
#         for root, dirs, files in os.walk(root_dir):
#                 for file in files:
#                     if file.endswith(".c"):
#                         total_files += 1

#                         file_path = os.path.join(root, file)
#                         current_file = file_path

#                         if total_files > 550_000:
                            
#                             with open(file_path, 'rb') as f:
#                                 # print(f.name)
#                                 # time.sleep(0.1)
                                
#                                 content = str(f.read())

#                                 if content.count("\\n") <= 10_000:
                                    
#                                     try:
#                                         start_cursor = index.parse(file_path).cursor

#                                         function_list = []
#                                         get_called_functions(start_cursor, function_list)

#                                         for function in function_list:

#                                             if len(function["args"]) == 2 and \
#                                                 (function["args"][0]["data_type"] == function["args"][1]["data_type"]) and \
#                                                 (function["args"][0]["name"] != function["args"][1]["name"]) and \
#                                                 (len(function["args"][0]["name"]) <= 100) and \
#                                                 (len(function["args"][1]["name"]) <= 100):

#                                                 positive_sample = [f.name.split("/AI/MinorProject/c-corpus/")[1], function["name"], function["args"][0]["name"], function["args"][1]["name"],
#                                                                   function["args"][0]["data_type"]]

#                                                 if(len(function["params"]) == 2):
#                                                     positive_sample.append(function["params"][0]["name"])
#                                                     positive_sample.append(function["params"][1]["name"])
#                                                 else:
#                                                     positive_sample.append("")
#                                                     positive_sample.append("")

#                                                 negative_sample = deepcopy(positive_sample)
#                                                 # swap
#                                                 negative_sample[2], negative_sample[3] = negative_sample[3], negative_sample[2]

#                                                 positive_sample.append(str(0))
#                                                 negative_sample.append(str(1))

#                                                 positive_sample = "\t".join(positive_sample)
#                                                 negative_sample = "\t".join(negative_sample)

#                                                 function_swap_samples.write("\n" + positive_sample)
#                                                 function_swap_samples.write("\n" + negative_sample)
                                                
#                                                 total_samples += 1
                                                
#                                     except:
#                                         print("---Error occurred---")

#                         current_file = file_path

#                         if total_files % 1000 == 0:
#                             print("Total files:", total_files, ",", "Total samples:", total_samples)

In [13]:
root_dir = '/home/dipu/Documents/AI/MinorProject/c-corpus/'

generate_function_args_swap_dataset(root_dir)

Total files: 1000 , Total samples: 0
Total files: 2000 , Total samples: 0
Total files: 3000 , Total samples: 0
Total files: 4000 , Total samples: 0
Total files: 5000 , Total samples: 0
Total files: 6000 , Total samples: 0
Total files: 7000 , Total samples: 0
Total files: 8000 , Total samples: 0
Total files: 9000 , Total samples: 0
Total files: 10000 , Total samples: 0
Total files: 11000 , Total samples: 0
Total files: 12000 , Total samples: 0
Total files: 13000 , Total samples: 0
Total files: 14000 , Total samples: 0
Total files: 15000 , Total samples: 0
Total files: 16000 , Total samples: 0
Total files: 17000 , Total samples: 0
Total files: 18000 , Total samples: 0
Total files: 19000 , Total samples: 0
Total files: 20000 , Total samples: 0
Total files: 21000 , Total samples: 0
Total files: 22000 , Total samples: 0
Total files: 23000 , Total samples: 0
Total files: 24000 , Total samples: 0
Total files: 25000 , Total samples: 0
Total files: 26000 , Total samples: 0
Total files: 27000 , 

Total files: 234000 , Total samples: 0
Total files: 235000 , Total samples: 0
Total files: 236000 , Total samples: 0
Total files: 237000 , Total samples: 0
Total files: 238000 , Total samples: 0
Total files: 239000 , Total samples: 0
Total files: 240000 , Total samples: 0
Total files: 241000 , Total samples: 0
Total files: 242000 , Total samples: 0
Total files: 243000 , Total samples: 0
Total files: 244000 , Total samples: 0
Total files: 245000 , Total samples: 0
Total files: 246000 , Total samples: 0
Total files: 247000 , Total samples: 0
Total files: 248000 , Total samples: 0
Total files: 249000 , Total samples: 0
Total files: 250000 , Total samples: 0
Total files: 251000 , Total samples: 0
Total files: 252000 , Total samples: 0
Total files: 253000 , Total samples: 0
Total files: 254000 , Total samples: 0
Total files: 255000 , Total samples: 0
Total files: 256000 , Total samples: 0
Total files: 257000 , Total samples: 0
Total files: 258000 , Total samples: 0
Total files: 259000 , Tot

Total files: 456000 , Total samples: 0
Total files: 457000 , Total samples: 0
Total files: 458000 , Total samples: 0
Total files: 459000 , Total samples: 0
Total files: 460000 , Total samples: 0
Total files: 461000 , Total samples: 0
Total files: 462000 , Total samples: 0
Total files: 463000 , Total samples: 0
Total files: 464000 , Total samples: 0
Total files: 465000 , Total samples: 0
Total files: 466000 , Total samples: 0
Total files: 467000 , Total samples: 0
Total files: 468000 , Total samples: 0
Total files: 469000 , Total samples: 0
Total files: 470000 , Total samples: 0
Total files: 471000 , Total samples: 0
Total files: 472000 , Total samples: 0
Total files: 473000 , Total samples: 0
Total files: 474000 , Total samples: 0
Total files: 475000 , Total samples: 0
Total files: 476000 , Total samples: 0
Total files: 477000 , Total samples: 0
Total files: 478000 , Total samples: 0
Total files: 479000 , Total samples: 0
Total files: 480000 , Total samples: 0
Total files: 481000 , Tot

terminate called after throwing an instance of 'std::bad_alloc'
  what():  std::bad_alloc
libclang: crash detected during parsing: {
  'source_filename' : '/home/dipu/Documents/AI/MinorProject/c-corpus/cleaned/gcc/gcc/testsuite/gcc.dg/large-size-array-4.c'
  'command_line_args' : ['clang'],
  'unsaved_files' : [],
  'options' : 0,
}


---Error occurred---


terminate called recursively
libclang: crash detected during parsing: {
  'source_filename' : '/home/dipu/Documents/AI/MinorProject/c-corpus/cleaned/gcc/gcc/testsuite/gcc.dg/large-size-array-2.c'
  'command_line_args' : ['clang'],
  'unsaved_files' : [],
  'options' : 0,
}


---Error occurred---
Total files: 552000 , Total samples: 689
Total files: 553000 , Total samples: 966
Total files: 554000 , Total samples: 1078
Total files: 555000 , Total samples: 1445
Total files: 556000 , Total samples: 2254
Total files: 557000 , Total samples: 3028
Total files: 558000 , Total samples: 6557
Total files: 559000 , Total samples: 8721
Total files: 560000 , Total samples: 10022
Total files: 561000 , Total samples: 11339
Total files: 562000 , Total samples: 12350
Total files: 563000 , Total samples: 12802
Total files: 564000 , Total samples: 14230
Total files: 565000 , Total samples: 14658
Total files: 566000 , Total samples: 15078
Total files: 567000 , Total samples: 15402
Total files: 568000 , Total samples: 17429
Total files: 569000 , Total samples: 19212
Total files: 570000 , Total samples: 20722
Total files: 571000 , Total samples: 21503
Total files: 572000 , Total samples: 22755
Total files: 573000 , Total samples: 23648
Total files: 574000 , Total samples: 23725


In [None]:
# function_args_swap_new.csv
# 385222 samples including positive and negative
# 156000 total files evaluated

# function_args_swap_full.csv
# Total files: 147000 , Total samples: 205501

# function_binary_args_swap_dataset.csv
# Total files: 550000 , Total samples: 1369283

In [3]:
# current_file
323324 * 2

646648

In [None]:
"abc\"de\"fg"[1:-1].replace('\"', "\"\"")

In [None]:
"abc".endswith("c")

In [6]:
"/home/dipu/Documents/AI/MinorProject/c-corpus/cleaned/meridian59/clientd3d/makepal.c".split("/AI/MinorProject/c-corpus/")[1]

'cleaned/meridian59/clientd3d/makepal.c'

In [4]:
len('function_swap_samples.write("file\tfunction_name\targ1\targ2\targ_type\tparam1\tparam2\tlabels")')

89

-----
## Data generation
-----

In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("function_binary_args_swap_dataset.csv", sep="\t")

ParserError: Error tokenizing data. C error: Expected 8 fields in line 765582, saw 10


In [11]:
with open("function_binary_args_swap_dataset.csv", "r") as f:
    content = f.readlines()
    s = {}
    
    for line in content:
        total = len(line.split("\t"))
        
        if total in s.keys():
            s[total] += 1
        else:
            s[total] = 1
        
        if(total == 16):
            print(line.split("\t"))
    print(s)

['cleaned/cutter/test/cutter/test-cut-readable-differ.c', 'cut_diff_readable', '"', '', '  GNU LESSER GENERAL PUBLIC LICENSE\\n""""', '', '       Version 2.1, February 1999"', '"', '', '    GNU GENERAL PUBLIC LICENSE\\n""""', '', '       Version 2, June 1991"', 'char *', '', '', '0\n']
['cleaned/cutter/test/cutter/test-cut-readable-differ.c', 'cut_diff_readable', '"', '', '    GNU GENERAL PUBLIC LICENSE\\n""""', '', '       Version 2, June 1991"', '"', '', '  GNU LESSER GENERAL PUBLIC LICENSE\\n""""', '', '       Version 2.1, February 1999"', 'char *', '', '', '1\n']
{8: 1750153, 9: 64, 3: 139, 2: 88, 5: 137, 6: 49, 4: 51, 1: 14, 10: 16, 12: 4, 16: 2}


In [27]:
with open("func_args_dataset_filtered.csv", "w") as write_file:
    write_file.write("file\tfunction_name\targ1\targ2\targ_type\tparam1\tparam2\tlabels\n")
    
    with open("function_binary_args_swap_dataset.csv", "r") as f:
        total, count = 0, 0
        content = f.readlines()

        for line in content:
            tokens = line.split("\t")

            if(len(tokens) == 8 and tokens[-1] == "0\n"):
                write_file.write(line)
                count += 1
                
            total += 1
                
        print(total, count)

1750717 875075


In [None]:
# total: 1750716, filtered: 875075*2

In [1]:
import numpy as np
import pandas as pd

In [175]:
df = pd.read_csv("func_args_dataset_filtered.csv", sep="\t")
df

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/meridian59/clientd3d/makepal.c,fopen,file,r,const char *,__filename,__modes,0
1,cleaned/meridian59/clientd3d/makepal.c,fopen,tempbuf,wt,const char *,__filename,__modes,0
2,cleaned/meridian59/clientd3d/cursor.c,MouseToRoom,&x,&y,int *,,,0
3,cleaned/meridian59/clientd3d/loadrsc.c,stricmp,(char*)f1,(char*)f2,char *,,,0
4,cleaned/meridian59/clientd3d/util.c,strcmp,ext+1,extension,const char *,,,0
...,...,...,...,...,...,...,...,...
875070,cleaned/notepadthing/build/psycopg2/psycopg/ty...,strcmp,str,-infinity,const char *,,,0
875071,cleaned/notepadthing/build/psycopg2/psycopg/co...,strcmp,off,scs,const char *,__s1,__s2,0
875072,cleaned/notepadthing/build/psycopg2/psycopg/co...,Dprintf,conn_connect: server requires E'' quotes: %s,"equote?""YES"":""NO""",char *,,,0
875073,cleaned/notepadthing/build/psycopg2/psycopg/pq...,Dprintf,pq_resolve_critical: error = %s,msg,char *,,,0


In [176]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/util-linux/term-utils/setterm.c,strcmp,argv[argc-1],yellow,const char *,__s1,__s2,0
1,cleaned/b2g-python/host/Python-2.7.3/Modules/c...,WRITE2,'~','}',int,,,0
2,cleaned/ngp/ngp.c,display_entries,index,cursor,int *,index,cursor,0
3,cleaned/msm7x30-3.4.x-naa/net/netfilter/xt_has...,hlist_add_head_rcu,&ent->node,"&ht->hash[hash_dst(ht,dst)]",<dependent type>,,,0
4,cleaned/lldpd/tests/check_marshal.c,ck_assert_str_eq,destination->s3,String 3,char *,,,0
...,...,...,...,...,...,...,...,...
875070,cleaned/fvwm/modules/FvwmScript/script.c,AddCom,24,2,int,Type,NbLevelArg,0
875071,cleaned/zfs-crypto/module/zfs/spa_misc.c,strpbrk,name,/@,const char *,,,0
875072,cleaned/mplayer-ww/libaf/af_resample.c,av_gcd,up,dn,int64_t,a,b,0
875073,cleaned/sslcaudit/openssl-1.0.1b/crypto/x509v3...,ASN1_INTEGER_cmp,a_min,a_max,const ASN1_INTEGER *,x,y,0


In [177]:
df.columns[1:-1]

Index(['function_name', 'arg1', 'arg2', 'arg_type', 'param1', 'param2'], dtype='object')

In [178]:
df.drop_duplicates(subset=df.columns[1:-1], ignore_index=True, inplace=True)
df

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/util-linux/term-utils/setterm.c,strcmp,argv[argc-1],yellow,const char *,__s1,__s2,0
1,cleaned/b2g-python/host/Python-2.7.3/Modules/c...,WRITE2,'~','}',int,,,0
2,cleaned/ngp/ngp.c,display_entries,index,cursor,int *,index,cursor,0
3,cleaned/msm7x30-3.4.x-naa/net/netfilter/xt_has...,hlist_add_head_rcu,&ent->node,"&ht->hash[hash_dst(ht,dst)]",<dependent type>,,,0
4,cleaned/lldpd/tests/check_marshal.c,ck_assert_str_eq,destination->s3,String 3,char *,,,0
...,...,...,...,...,...,...,...,...
270292,cleaned/opensplice/setup/wrappers/wincmd/ospl_...,addarg,-outputresource:%s,&arg[16],char *,pattern,val,0
270293,cleaned/fvwm/tests/hints/hints_test.c,strcasecmp,argv[i],modal,const char *,__s1,__s2,0
270294,cleaned/libmate/libmate/mate-config.c,config_concat_dir_and_key,mate/config,rel_file,const char *,dir,key,0
270295,cleaned/tinc/src/tincd.c,strcasecmp,optarg,HUP,const char *,__s1,__s2,0


In [179]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [180]:
df_train

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/reconnoiter/src/stratcon_iep.c,calloc,1,sizeof(*stmt),unsigned long,,,0
1,cleaned/android-external-qemu/target-i386/help...,strcmp,name,x86_defs[i].name,const char *,__s1,__s2,0
2,cleaned/open-watcom/bld/clib/string/c/strtest.c,strcmp,bufA,newBuf,const char *,__s1,__s2,0
3,cleaned/playbook-dev-tools/bootstrap/gcc/gcc/g...,punpckhwd_u,s.v,t.v,<dependent type>,,,0
4,cleaned/qemu-devel-1.4.0/work/qemu-1.4.0/roms/...,peekb,0x40,0x62,int,,,0
...,...,...,...,...,...,...,...,...
243262,cleaned/showtime/src/showtime.c,strcmp,argv[0],--ffmpeglog,const char *,__s1,__s2,0
243263,cleaned/socat/xioopts.c,Error1,"getgrnam(\""%s\""): no such group",token,char *,,,0
243264,cleaned/dd-wrt/ar5315_microredboot/microredboo...,error,invalid --param option: %s,arg,char *,,,0
243265,cleaned/dd-wrt/src/router/libutils/utils.c,strcmp,bridge,ifname,const char *,__s1,__s2,0


In [181]:
df_test

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/tvheadend/src/utils.c,printf,%s:,pfx,const char *,,,0
1,cleaned/asuswrt-merlin/release/src/router/samb...,strcmp,sidstr,sid_codes[i].sid,const char *,,,0
2,cleaned/linux/drivers/media/video/hdpvr/hdpvr-...,list_move_tail,&buf->buff_list,&dev->rec_buff_list,<dependent type>,,,0
3,cleaned/unvanquished/src/tools/owmap/path_init.c,Q_stricmp,arg,quake2,char *,,,0
4,cleaned/uwsgi/core/utils.c,strstr,src,what,const char *,,,0
...,...,...,...,...,...,...,...,...
27025,cleaned/vsr/externals/gl2ps/gl2ps.c,gl2psPrintf,[] 0 %s\n,str,const char *,,,0
27026,cleaned/bahamut/src/s_conf.c,DupString,MeLine->admin[1],new_MeLine->admin[1],<dependent type>,,,0
27027,cleaned/kmscon/src/uterm_input.c,shl_dlist_link,&input->devices,&dev->list,<dependent type>,,,0
27028,cleaned/opensc/src/libopensc/card-openpgp.c,calloc,1,sizeof*priv,unsigned long,__nmemb,__size,0


In [182]:
print("Total:", len(df_train.index))

neg_train_array = []
count = 0

for index, row in df_train.iterrows():
    row["arg1"], row["arg2"] = row["arg2"], row["arg1"]
    row["labels"] = 1
    
    neg_train_array.append(row.values)
    
    count += 1
    if count % 25000 == 0:
        print(count)

df_train_neg = pd.DataFrame(neg_train_array, columns=df.columns)
df_train = pd.concat([df_train, df_train_neg])
df_train

Total: 243267
25000
50000
75000
100000
125000
150000
175000
200000
225000


Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/reconnoiter/src/stratcon_iep.c,calloc,1,sizeof(*stmt),unsigned long,,,0
1,cleaned/android-external-qemu/target-i386/help...,strcmp,name,x86_defs[i].name,const char *,__s1,__s2,0
2,cleaned/open-watcom/bld/clib/string/c/strtest.c,strcmp,bufA,newBuf,const char *,__s1,__s2,0
3,cleaned/playbook-dev-tools/bootstrap/gcc/gcc/g...,punpckhwd_u,s.v,t.v,<dependent type>,,,0
4,cleaned/qemu-devel-1.4.0/work/qemu-1.4.0/roms/...,peekb,0x40,0x62,int,,,0
...,...,...,...,...,...,...,...,...
243262,cleaned/showtime/src/showtime.c,strcmp,--ffmpeglog,argv[0],const char *,__s1,__s2,1
243263,cleaned/socat/xioopts.c,Error1,token,"getgrnam(\""%s\""): no such group",char *,,,1
243264,cleaned/dd-wrt/ar5315_microredboot/microredboo...,error,arg,invalid --param option: %s,char *,,,1
243265,cleaned/dd-wrt/src/router/libutils/utils.c,strcmp,ifname,bridge,const char *,__s1,__s2,1


In [183]:
print("Total:", len(df_test.index))

neg_test_array = []
count = 0

for index, row in df_test.iterrows():
    row["arg1"], row["arg2"] = row["arg2"], row["arg1"]
    row["labels"] = 1
    
    neg_test_array.append(row.values)
    
    count += 1
    if count % 25000 == 0:
        print(count)

df_test_neg = pd.DataFrame(neg_test_array, columns=df.columns)
df_test = pd.concat([df_test, df_test_neg])
df_test

Total: 27030
25000


Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/tvheadend/src/utils.c,printf,%s:,pfx,const char *,,,0
1,cleaned/asuswrt-merlin/release/src/router/samb...,strcmp,sidstr,sid_codes[i].sid,const char *,,,0
2,cleaned/linux/drivers/media/video/hdpvr/hdpvr-...,list_move_tail,&buf->buff_list,&dev->rec_buff_list,<dependent type>,,,0
3,cleaned/unvanquished/src/tools/owmap/path_init.c,Q_stricmp,arg,quake2,char *,,,0
4,cleaned/uwsgi/core/utils.c,strstr,src,what,const char *,,,0
...,...,...,...,...,...,...,...,...
27025,cleaned/vsr/externals/gl2ps/gl2ps.c,gl2psPrintf,str,[] 0 %s\n,const char *,,,1
27026,cleaned/bahamut/src/s_conf.c,DupString,new_MeLine->admin[1],MeLine->admin[1],<dependent type>,,,1
27027,cleaned/kmscon/src/uterm_input.c,shl_dlist_link,&dev->list,&input->devices,<dependent type>,,,1
27028,cleaned/opensc/src/libopensc/card-openpgp.c,calloc,sizeof*priv,1,unsigned long,__nmemb,__size,1


In [184]:
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_train

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/collectd/src/amqp.c,INFO,conf->queue,"amqp plugin: Created queue \""%s\"".",char *,,,1
1,cleaned/bspwm/events.c,MAX,1,h,int,,,0
2,cleaned/newlib/newlib/libm/math/sf_signif.c,__ieee754_scalbf,(float)-ilogbf(x),x,double,,,1
3,cleaned/dd-wrt/src/router/proftpd/contrib/mod_...,crypt,ciphertext,plaintext,const char *,,,1
4,cleaned/tpm-tools/src/tpm_mgmt/tpm_clearable.c,logDebug,"_(""Requested to disable: %s ability.\n"")",_(flags[i].name),int,,,0
...,...,...,...,...,...,...,...,...
486529,cleaned/libgit2/tests-clar/core/path.c,check_dirname,/,/usr,const char *,A,B,1
486530,cleaned/asuswrt-merlin/release/src-rt/linux/li...,outw,ioaddr+IO_PORT,eaddrs[1],int,,,1
486531,cleaned/zif/libzif/zif-self-test.c,zif_compare_evr,0.1,0:0.1-1,char *,,,0
486532,cleaned/fvwm/fvwm/windowlist.c,StrEquals,tok,NoDeskSort,char *,,,0


In [185]:
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
df_test

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/powerswitch-os/vga.c,outb,mode13[2][i],0x3cf,int,,,1
1,cleaned/platform_external/icu4c/test/cintltst/...,strcmp,subBundleKey,availableFormats,const char *,,,0
2,cleaned/gnushogi/xshogi/xshogi.c,strcmp,name,Yes,const char *,__s1,__s2,0
3,cleaned/asuswrt-merlin/release/src/router/samb...,DEBUG,0,"(""%s: bad validation_level value %d.\n"",fn,(in...",int,,,0
4,cleaned/tos-sfsu-fall2012/kernel/shell.c,is_command,command,pong,char *,s1,s2,0
...,...,...,...,...,...,...,...,...
54055,cleaned/lucy/clownfish/compiler/src/CFCType.c,strcmp,specifier,int64_t,const char *,__s1,__s2,0
54056,cleaned/opensplice/src/services/ddsi2/code/nn_...,avl_delete,"onode_from_node(tree,n)",tree,void *,vtree,vnode,1
54057,cleaned/marss/qemu/hw/etraxfs_pic.c,qemu_set_irq,!!fs->regs[R_R_NMI],fs->parent_nmi,<dependent type>,,,1
54058,cleaned/sane-backends/backend/genesys_gl847.c,SETREG,0x9d,0x06,int,,,0


In [186]:
df_train.to_csv("func_args_dataset_filtered_train.csv", sep="\t", index=False)
df_test.to_csv("func_args_dataset_filtered_test.csv", sep="\t", index=False)

In [189]:
pd.read_csv("func_args_dataset_filtered_train.csv", sep="\t")

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/collectd/src/amqp.c,INFO,conf->queue,"amqp plugin: Created queue \""%s\"".",char *,,,1
1,cleaned/bspwm/events.c,MAX,1,h,int,,,0
2,cleaned/newlib/newlib/libm/math/sf_signif.c,__ieee754_scalbf,(float)-ilogbf(x),x,double,,,1
3,cleaned/dd-wrt/src/router/proftpd/contrib/mod_...,crypt,ciphertext,plaintext,const char *,,,1
4,cleaned/tpm-tools/src/tpm_mgmt/tpm_clearable.c,logDebug,"_(""Requested to disable: %s ability.\n"")",_(flags[i].name),int,,,0
...,...,...,...,...,...,...,...,...
486529,cleaned/libgit2/tests-clar/core/path.c,check_dirname,/,/usr,const char *,A,B,1
486530,cleaned/asuswrt-merlin/release/src-rt/linux/li...,outw,ioaddr+IO_PORT,eaddrs[1],int,,,1
486531,cleaned/zif/libzif/zif-self-test.c,zif_compare_evr,0.1,0:0.1-1,char *,,,0
486532,cleaned/fvwm/fvwm/windowlist.c,StrEquals,tok,NoDeskSort,char *,,,0


In [190]:
pd.read_csv("func_args_dataset_filtered_test.csv", sep="\t")

Unnamed: 0,file,function_name,arg1,arg2,arg_type,param1,param2,labels
0,cleaned/powerswitch-os/vga.c,outb,mode13[2][i],0x3cf,int,,,1
1,cleaned/platform_external/icu4c/test/cintltst/...,strcmp,subBundleKey,availableFormats,const char *,,,0
2,cleaned/gnushogi/xshogi/xshogi.c,strcmp,name,Yes,const char *,__s1,__s2,0
3,cleaned/asuswrt-merlin/release/src/router/samb...,DEBUG,0,"(""%s: bad validation_level value %d.\n"",fn,(in...",int,,,0
4,cleaned/tos-sfsu-fall2012/kernel/shell.c,is_command,command,pong,char *,s1,s2,0
...,...,...,...,...,...,...,...,...
54055,cleaned/lucy/clownfish/compiler/src/CFCType.c,strcmp,specifier,int64_t,const char *,__s1,__s2,0
54056,cleaned/opensplice/src/services/ddsi2/code/nn_...,avl_delete,"onode_from_node(tree,n)",tree,void *,vtree,vnode,1
54057,cleaned/marss/qemu/hw/etraxfs_pic.c,qemu_set_irq,!!fs->regs[R_R_NMI],fs->parent_nmi,<dependent type>,,,1
54058,cleaned/sane-backends/backend/genesys_gl847.c,SETREG,0x9d,0x06,int,,,0
