In [60]:
import os
import clang
from clang.cindex import *
from copy import deepcopy
import time

import numpy as np
import pandas as pd

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [2]:
# Location of the libclang shared library. Set the LIBCLANG_LIBRARY_FILE
# environment variable to override the default path on another machine.
LIBCLANG_LIBRARY_FILE = os.environ.get(
    "LIBCLANG_LIBRARY_FILE",
    "/home/dipu/anaconda3/lib/python3.9/site-packages/clang/native/libclang.so",
)

# set_library_file() raises once libclang has been loaded, so skip it when
# this cell is re-run in a kernel that already parsed something.
if not Config.loaded:
    Config.set_library_file(LIBCLANG_LIBRARY_FILE)

In [87]:
# Class ids used by both classifiers, and the reverse lookup derived from them.
id2label = {
    0: "CORRECT",
    1: "BUGGY",
}
label2id = {label: class_id for class_id, label in id2label.items()}

In [59]:
# One tokenizer is loaded here and handed to both classification pipelines.
# NOTE(review): it is read from the function-args-swap-bug directory but is
# also used with the wrong-binary-operator model - confirm both models share it.
tokenizer = AutoTokenizer.from_pretrained(
    "../app/backend/function-args-swap-bug/tokenizer"
)

# Two-class (CORRECT / BUGGY) model for swapped function arguments.
function_args_swap_bug_model = AutoModelForSequenceClassification.from_pretrained(
    "../app/backend/function-args-swap-bug/model",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Two-class (CORRECT / BUGGY) model for wrong binary operators.
wrong_binary_operator_bug_model = AutoModelForSequenceClassification.from_pretrained(
    "../app/backend/wrong-binary-operator-bug/model",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Text-classification pipeline wrapping the function-args-swap bug model.
fosb_classifier = pipeline(
    "text-classification",
    model=function_args_swap_bug_model,
    tokenizer=tokenizer,
)

# Text-classification pipeline wrapping the wrong-binary-operator bug model.
wbob_classifier = pipeline(
    "text-classification",
    model=wrong_binary_operator_bug_model,
    tokenizer=tokenizer,
)

In [125]:
# Sample C program that is written to disk and analysed below.
# A raw string is required: in a normal Python string the C escape "\0"
# would turn into a real NUL byte inside the generated evaluation.c.
code = r"""
#include <stdio.h>


void getSum(int n1, int n2)
{
	int sum = n1 + n2;
	return sum;
}

int main()
{
	int a = 5, b = 7, x = 6;
	int result = 555;
    getSum(a, 7);
    char result[] = "test\0";
    int test = justDoThis(a, b);
    okayGood(x, 500);
	printf("Sum is: %d", result);
	return 0;
}
"""

In [126]:
# Save the sample program to disk so that libclang can parse it by file name.
with open('evaluation.c', 'w') as source_file:
    source_file.write(code)

# Parse the file and keep the cursor at the root of its syntax tree.
index = clang.cindex.Index.create()
root_cursor = index.parse('evaluation.c').cursor

---
## Generate data for function args swap bug
---

In [149]:
def get_function_params(root, function_name, result):
    """
    Collect the parameters of a named function from the Abstract Syntax Tree

    Parameters
    ----------
    root : clang.cindex.Cursor
        cursor whose subtree is searched in pre-order
    function_name : str
        spelling of the function declaration to look up
    result : list
        output list; one {"name", "data_type"} dict is appended per parameter

    Return
    ------
    void
    """

    for cursor in root.walk_preorder():
        try:
            # skip every node that is not the declaration being looked for
            if cursor.kind != CursorKind.FUNCTION_DECL:
                continue
            if cursor.spelling != function_name:
                continue

            # only the first matching declaration is inspected; of its
            # children, only parameter declarations are recorded
            for child in cursor.get_children():
                if child.kind != CursorKind.PARM_DECL:
                    continue
                result.append({"name": child.spelling,
                               "data_type": child.type.spelling})
            return
        except ValueError:
            # a ValueError raised while inspecting this node is ignored
            # and the walk moves on to the next node
            continue

In [150]:
def get_called_functions(root, result):
    """
    A function to collect details of every function call in the Abstract Syntax Tree

    Parameters
    ----------
    root : clang.cindex.Cursor
        root of Abstract Syntax Tree
    result : list
        list to store the calls; one dict is appended per call expression with
        the keys "name", "return_type", "args", "location" and "params"

    Return
    ------
    void
    """
    for node in root.walk_preorder():
        try:
            if node.kind != CursorKind.CALL_EXPR:
                continue

            current_function = {"name": node.spelling,
                                "return_type": node.type.spelling,
                                "args": [],
                                "location": node.extent}

            for arg in node.get_arguments():
                # tokenize each argument once; fall back to the cursor
                # spelling when the argument yields no tokens
                token_spellings = [token.spelling for token in arg.get_tokens()]
                arg_text = "".join(token_spellings) if token_spellings else arg.spelling

                current_function["args"].append({"name": arg_text,
                                                 "data_type": arg.type.spelling,
                                                 "cursor_kind": arg.kind})

            # parameters are looked up only for calls with exactly two
            # arguments of the same type; every other call keeps an empty list
            args = current_function["args"]
            current_param_list = []
            if len(args) == 2 and args[0]["data_type"] == args[1]["data_type"]:
                get_function_params(root, node.spelling, current_param_list)
            current_function["params"] = current_param_list

            result.append(current_function)

        except ValueError:
            # a ValueError raised while inspecting this node is ignored
            # and the walk moves on to the next node
            pass

In [151]:
# Collect every call expression of the parsed file; get_called_functions
# appends one dict per call to the list passed in.
function_list = []
get_called_functions(root_cursor, function_list)
function_list

[{'name': 'getSum',
  'return_type': 'void',
  'args': [{'name': 'a',
    'data_type': 'int',
    'cursor_kind': CursorKind.UNEXPOSED_EXPR},
   {'name': '7',
    'data_type': 'int',
    'cursor_kind': CursorKind.INTEGER_LITERAL}],
  'location': <SourceRange start <SourceLocation file 'evaluation.c', line 15, column 5>, end <SourceLocation file 'evaluation.c', line 15, column 17>>,
  'params': [{'name': 'n1', 'data_type': 'int'},
   {'name': 'n2', 'data_type': 'int'}]},
 {'name': 'justDoThis',
  'return_type': 'int',
  'args': [{'name': 'a',
    'data_type': 'int',
    'cursor_kind': CursorKind.UNEXPOSED_EXPR},
   {'name': 'b',
    'data_type': 'int',
    'cursor_kind': CursorKind.UNEXPOSED_EXPR}],
  'location': <SourceRange start <SourceLocation file 'evaluation.c', line 17, column 16>, end <SourceLocation file 'evaluation.c', line 17, column 32>>,
  'params': []},
 {'name': 'okayGood',
  'return_type': 'int',
  'args': [{'name': 'x',
    'data_type': 'int',
    'cursor_kind': CursorKi

In [152]:
function_args_swap_bug_data = []

for function in function_list:
    call_args = function["args"]

    # only calls with exactly two arguments are candidates
    if len(call_args) != 2:
        continue

    first_arg = call_args[0]
    second_arg = call_args[1]

    # both arguments must share a type and must not be the same text
    if first_arg["data_type"] != second_arg["data_type"] or first_arg["name"] == second_arg["name"]:
        continue

    # parameter names are used when exactly two were found, otherwise left blank
    params = function["params"]
    if len(params) == 2:
        param_names = [params[0]["name"], params[1]["name"]]
    else:
        param_names = ["", ""]

    loc = function["location"]
    sample = [
        function["name"], first_arg["name"], second_arg["name"], first_arg["data_type"],
        param_names[0], param_names[1],
        str(loc.start.line), str(loc.start.column), str(loc.end.line), str(loc.end.column),
    ]

    function_args_swap_bug_data.append(sample)

In [153]:
function_args_swap_bug_data

[['getSum', 'a', '7', 'int', 'n1', 'n2', '15', '5', '15', '17'],
 ['justDoThis', 'a', 'b', 'int', '', '', '17', '16', '17', '32'],
 ['okayGood', 'x', '500', 'int', '', '', '18', '5', '18', '21']]

In [154]:
# Tabulate the call samples; the column order mirrors the order of each sample list.
df = pd.DataFrame(
    function_args_swap_bug_data,
    columns=[
        "function_name", "arg1", "arg2", "arg_type", "param1", "param2",
        "start_line", "start_column", "end_line", "end_column",
    ],
)
df

Unnamed: 0,function_name,arg1,arg2,arg_type,param1,param2,start_line,start_column,end_line,end_column
0,getSum,a,7,int,n1,n2,15,5,15,17
1,justDoThis,a,b,int,,,17,16,17,32
2,okayGood,x,500,int,,,18,5,18,21


In [155]:
# Model input: the six feature columns joined by the tokenizer's separator token.
df['full_text'] = (
    df['function_name']
    + tokenizer.sep_token + df['arg1']
    + tokenizer.sep_token + df['arg2']
    + tokenizer.sep_token + df['arg_type']
    + tokenizer.sep_token + df['param1']
    + tokenizer.sep_token + df['param2']
)
df

Unnamed: 0,function_name,arg1,arg2,arg_type,param1,param2,start_line,start_column,end_line,end_column,full_text
0,getSum,a,7,int,n1,n2,15,5,15,17,getSum</s>a</s>7</s>int</s>n1</s>n2
1,justDoThis,a,b,int,,,17,16,17,32,justDoThis</s>a</s>b</s>int</s></s>
2,okayGood,x,500,int,,,18,5,18,21,okayGood</s>x</s>500</s>int</s></s>


In [156]:
# Keep a dedicated name for this frame: `df` is rebound to the
# binary-operator frame further down.
function_args_swap_bug_df = df

---
## Generate data for wrong binary operator bug
---

In [157]:
def get_binary_expressions(node, parent, grandparent, result):
    """
    A function to collect simple binary operations from Abstract Syntax Tree

    The tree is walked recursively. A binary operator is recorded only when it
    has exactly two children and neither child is itself a binary operator or
    a parenthesised expression.

    Parameters
    ----------
    node : clang.cindex.Cursor
        node currently being visited
    parent : clang.cindex.Cursor or None
        parent of node (None for the node the walk starts from)
    grandparent : clang.cindex.Cursor or None
        parent of parent (None when there is none)
    result : list
        list to store the operations; one dict is appended per recorded
        operation with the keys "operator", "operands", "parent",
        "grandparent" and "location"

    Return
    ------
    void
    """
    try:
        if node.kind == CursorKind.BINARY_OPERATOR:

            children_list = list(node.get_children())

            if len(children_list) == 2:
                # the operator is the token that follows the tokens of the left operand
                left_offset = len(list(children_list[0].get_tokens()))
                operator_name = list(node.get_tokens())[left_offset].spelling

                parent_kind = parent.kind.name if parent is not None else ""
                grandparent_kind = grandparent.kind.name if grandparent is not None else ""

                operands = []
                is_simple = True

                for c in children_list:
                    # only allow binary operations between single operands on left and right
                    if c.kind == CursorKind.BINARY_OPERATOR or c.kind == CursorKind.PAREN_EXPR:
                        is_simple = False
                        break

                    # tokenize each operand once; fall back to the cursor
                    # spelling when the operand yields no tokens
                    token_spellings = [token.spelling for token in c.get_tokens()]
                    operand = "".join(token_spellings) if token_spellings else c.spelling

                    # string literal: double any inner quotes (presumably CSV-style
                    # escaping). This previously used the undefined name
                    # `current_arg` and raised NameError.
                    if len(operand) >= 3 and operand.startswith('"') and operand.endswith('"'):
                        operand = '"' + operand[1:-1].replace('"', '""') + '"'

                    operands.append({"name": operand,
                                     "data_type": c.type.spelling,
                                     "cursor_kind": c.kind.name})

                if is_simple:
                    result.append({"operator": operator_name,
                                   "operands": operands,
                                   "parent": parent_kind,
                                   "grandparent": grandparent_kind,
                                   "location": node.extent})

        # descend; the current node becomes the parent and the current parent the grandparent
        for c in node.get_children():
            get_binary_expressions(c, node, parent, result)

    except ValueError:
        # a ValueError raised while inspecting this node ends the walk of its subtree
        pass

In [158]:
# Collect the binary operations of the parsed file; get_binary_expressions
# appends one dict per recorded operation to the list passed in.
binary_operation_list = []

# The root cursor has no parent or grandparent, hence the two None arguments.
get_binary_expressions(root_cursor, None, None, binary_operation_list)

binary_operation_list

[{'operator': '+',
  'operands': [{'name': 'n1',
    'data_type': 'int',
    'cursor_kind': 'UNEXPOSED_EXPR'},
   {'name': 'n2', 'data_type': 'int', 'cursor_kind': 'UNEXPOSED_EXPR'}],
  'parent': 'VAR_DECL',
  'grandparent': 'DECL_STMT',
  'location': <SourceRange start <SourceLocation file 'evaluation.c', line 7, column 12>, end <SourceLocation file 'evaluation.c', line 7, column 19>>}]

In [159]:
wrong_binary_operator_bug_data = []

for operation in binary_operation_list:
    loc = operation["location"]
    left_operand = operation["operands"][0]
    right_operand = operation["operands"][1]

    # one row per operation: operand/operator text, operand types,
    # surrounding node kinds, then the source range as strings
    sample = [
        left_operand["name"], operation["operator"], right_operand["name"],
        left_operand["data_type"], right_operand["data_type"],
        operation["parent"], operation["grandparent"],
        str(loc.start.line), str(loc.start.column), str(loc.end.line), str(loc.end.column),
    ]

    wrong_binary_operator_bug_data.append(sample)

In [160]:
wrong_binary_operator_bug_data

[['n1',
  '+',
  'n2',
  'int',
  'int',
  'VAR_DECL',
  'DECL_STMT',
  '7',
  '12',
  '7',
  '19']]

In [161]:
# Tabulate the operation samples; the column order mirrors the order of each sample list.
df = pd.DataFrame(
    wrong_binary_operator_bug_data,
    columns=[
        "left", "operator", "right", "type_left", "type_right",
        "parent", "grandparent",
        "start_line", "start_column", "end_line", "end_column",
    ],
)
df

Unnamed: 0,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column
0,n1,+,n2,int,int,VAR_DECL,DECL_STMT,7,12,7,19


In [162]:
# Model input: the seven feature columns joined by the tokenizer's separator token.
df['full_text'] = (
    df['left']
    + tokenizer.sep_token + df['operator']
    + tokenizer.sep_token + df['right']
    + tokenizer.sep_token + df['type_left']
    + tokenizer.sep_token + df['type_right']
    + tokenizer.sep_token + df['parent']
    + tokenizer.sep_token + df['grandparent']
)
df

Unnamed: 0,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column,full_text
0,n1,+,n2,int,int,VAR_DECL,DECL_STMT,7,12,7,19,n1</s>+</s>n2</s>int</s>int</s>VAR_DECL</s>DEC...


In [163]:
# Give the binary-operator frame its own name, matching function_args_swap_bug_df.
wrong_binary_operator_bug_df = df

---
## Model Inference
---

In [166]:
# Classify every function-call sample; one prediction dict per input text.
fosb_result = fosb_classifier(list(function_args_swap_bug_df["full_text"]))
fosb_result

[{'label': 'CORRECT', 'score': 0.9998936653137207},
 {'label': 'CORRECT', 'score': 0.5038129091262817},
 {'label': 'CORRECT', 'score': 0.9967055916786194}]

In [167]:
# Classify every binary-operation sample; one prediction dict per input text.
wbob_result = wbob_classifier(list(wrong_binary_operator_bug_df["full_text"]))
wbob_result

[{'label': 'CORRECT', 'score': 0.7860645651817322}]

In [168]:
# Remove the columns that were only needed to build the model input.
# The frames are reassigned instead of mutated in place, and errors="ignore"
# lets this cell be re-run after the columns are already gone.
function_args_swap_bug_df = function_args_swap_bug_df.drop(
    columns=["arg_type", "param1", "param2", "full_text"],
    errors="ignore",
)
wrong_binary_operator_bug_df = wrong_binary_operator_bug_df.drop(
    columns=["type_left", "type_right", "parent", "grandparent", "full_text"],
    errors="ignore",
)

In [169]:
# Attach the predictions: the numeric id of the predicted label and the
# score the classifier reported for that label.
function_args_swap_bug_df["label"] = [
    label2id[prediction["label"]] for prediction in fosb_result
]
function_args_swap_bug_df["probability"] = [
    prediction["score"] for prediction in fosb_result
]
function_args_swap_bug_df

Unnamed: 0,function_name,arg1,arg2,start_line,start_column,end_line,end_column,label,probability
0,getSum,a,7,15,5,15,17,0,0.999894
1,justDoThis,a,b,17,16,17,32,0,0.503813
2,okayGood,x,500,18,5,18,21,0,0.996706


In [170]:
# Attach the predictions: the numeric id of the predicted label and the
# score the classifier reported for that label.
wrong_binary_operator_bug_df["label"] = [
    label2id[prediction["label"]] for prediction in wbob_result
]
wrong_binary_operator_bug_df["probability"] = [
    prediction["score"] for prediction in wbob_result
]
wrong_binary_operator_bug_df

Unnamed: 0,left,operator,right,start_line,start_column,end_line,end_column,label,probability
0,n1,+,n2,7,12,7,19,0,0.786065


In [181]:
# Preview the function-call results as one plain dict per row.
function_args_swap_bug_df.to_dict(orient="records")

[{'function_name': 'getSum',
  'arg1': 'a',
  'arg2': '7',
  'start_line': '15',
  'start_column': '5',
  'end_line': '15',
  'end_column': '17',
  'label': 0,
  'probability': 0.9998936653137207},
 {'function_name': 'justDoThis',
  'arg1': 'a',
  'arg2': 'b',
  'start_line': '17',
  'start_column': '16',
  'end_line': '17',
  'end_column': '32',
  'label': 0,
  'probability': 0.5038129091262817},
 {'function_name': 'okayGood',
  'arg1': 'x',
  'arg2': '500',
  'start_line': '18',
  'start_column': '5',
  'end_line': '18',
  'end_column': '21',
  'label': 0,
  'probability': 0.9967055916786194}]

In [182]:
# Preview the binary-operation results as one plain dict per row.
wrong_binary_operator_bug_df.to_dict(orient="records")

[{'left': 'n1',
  'operator': '+',
  'right': 'n2',
  'start_line': '7',
  'start_column': '12',
  'end_line': '7',
  'end_column': '19',
  'label': 0,
  'probability': 0.7860645651817322}]

In [None]:
# Bundle both result tables, one list of row dicts per bug type.
output = {
    "analysis": {
        "function_args_swap_bug": function_args_swap_bug_df.to_dict(orient="records"),
        "wrong_binary_operator_bug": wrong_binary_operator_bug_df.to_dict(orient="records"),
    }
}