# Devign

In [1]:
import json
import torch
import glob

import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
import argparse
import gc
import shutil

## Configuration

In [2]:
class Config(object):
    def __init__(self, config, file_path="dataset/Devign/configs.json"):
        with open(file_path) as config_file:
            self._config = json.load(config_file)
            self._config = self._config.get(config)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def get_property(self, property_name):
        return self._config.get(property_name)

    def get_device(self):
        return self.device

    def all(self):
        return self._config


class Create(Config):
    def __init__(self):
        super().__init__('create')

    @property
    def filter_column_value(self):
        return self.get_property('filter_project')

    @property
    def slice_size(self):
        return self.get_property('slice_size')

    @property
    def joern_cli_dir(self):
        return self.get_property('joern_cli_dir')


class Data(Config):
    def __init__(self, config):
        super().__init__(config)

    @property
    def cpg(self):
        return self.get_property('cpg')

    @property
    def raw(self):
        return self.get_property('raw')

    @property
    def input(self):
        return self.get_property('input')

    @property
    def model(self):
        return self.get_property('model')

    @property
    def tokens(self):
        return self.get_property('tokens')

    @property
    def w2v(self):
        return self.get_property('w2v')


class Paths(Data):
    def __init__(self):
        super().__init__('paths')

    @property
    def joern(self):
        return self.get_property('joern')


class Files(Data):
    def __init__(self):
        super().__init__('files')

    @property
    def tokens(self):
        return self.get_property('tokens')

    @property
    def w2v(self):
        return self.get_property('w2v')


class Embed(Config):
    def __init__(self):
        super().__init__('embed')

    @property
    def nodes_dim(self):
        return self.get_property('nodes_dim')

    @property
    def w2v_args(self):
        return self.get_property('word2vec_args')

    @property
    def edge_type(self):
        return self.get_property('edge_type')


class Process(Config):
    def __init__(self):
        super().__init__('process')

    @property
    def epochs(self):
        return self.get_property('epochs')

    @property
    def patience(self):
        return self.get_property('patience')

    @property
    def batch_size(self):
        return self.get_property('batch_size')

    @property
    def dataset_ratio(self):
        return self.get_property('dataset_ratio')

    @property
    def shuffle(self):
        return self.get_property('shuffle')


class Devign_class(Config):
    def __init__(self):
        super().__init__('devign')

    @property
    def learning_rate(self):
        return self.get_property('learning_rate')

    @property
    def weight_decay(self):
        return self.get_property('weight_decay')

    @property
    def loss_lambda(self):
        return self.get_property('loss_lambda')

    @property
    def model(self):
        return self.get_property('model')

In [3]:
PATHS = Paths()
FILES = Files()
DEVICE = FILES.get_device()

In [4]:
import re
import codecs

# Clean Gadget
# Author https://github.com/johnb110/VDPython:
# For each gadget, replaces all user variables with "VAR#" and user functions with "FUN#"
# Removes content from string and character literals keywords up to C11 and C++17; immutable set
from typing import List

keywords = frozenset({'__asm', '__builtin', '__cdecl', '__declspec', '__except', '__export', '__far16', '__far32',
                    '__fastcall', '__finally', '__import', '__inline', '__int16', '__int32', '__int64', '__int8',
                    '__leave', '__optlink', '__packed', '__pascal', '__stdcall', '__system', '__thread', '__try',
                    '__unaligned', '_asm', '_Builtin', '_Cdecl', '_declspec', '_except', '_Export', '_Far16',
                    '_Far32', '_Fastcall', '_finally', '_Import', '_inline', '_int16', '_int32', '_int64',
                    '_int8', '_leave', '_Optlink', '_Packed', '_Pascal', '_stdcall', '_System', '_try', 'alignas',
                    'alignof', 'and', 'and_eq', 'asm', 'auto', 'bitand', 'bitor', 'bool', 'break', 'case',
                    'catch', 'char', 'char16_t', 'char32_t', 'class', 'compl', 'const', 'const_cast', 'constexpr',
                    'continue', 'decltype', 'default', 'delete', 'do', 'double', 'dynamic_cast', 'else', 'enum',
                    'explicit', 'export', 'extern', 'false', 'final', 'float', 'for', 'friend', 'goto', 'if',
                    'inline', 'int', 'long', 'mutable', 'namespace', 'new', 'noexcept', 'not', 'not_eq', 'nullptr',
                    'operator', 'or', 'or_eq', 'override', 'private', 'protected', 'public', 'register',
                    'reinterpret_cast', 'return', 'short', 'signed', 'sizeof', 'static', 'static_assert',
                    'static_cast', 'struct', 'switch', 'template', 'this', 'thread_local', 'throw', 'true', 'try',
                    'typedef', 'typeid', 'typename', 'union', 'unsigned', 'using', 'virtual', 'void', 'volatile',
                    'wchar_t', 'while', 'xor', 'xor_eq', 'NULL'})
# holds known non-user-defined functions; immutable set
main_set = frozenset({'main'})
# arguments in main function; immutable set
main_args = frozenset({'argc', 'argv'})

operators3 = {'<<=', '>>='}
operators2 = {
    '->', '++', '--', '**',
    '!~', '<<', '>>', '<=', '>=',
    '==', '!=', '&&', '||', '+=',
    '-=', '*=', '/=', '%=', '&=', '^=', '|='
}
operators1 = {
    '(', ')', '[', ']', '.',
    '+', '&',
    '%', '<', '>', '^', '|',
    '=', ',', '?', ':',
    '{', '}', '!', '~'
}

def to_regex(lst):
    return r'|'.join([f"({re.escape(el)})" for el in lst])


regex_split_operators = to_regex(operators3) + to_regex(operators2) + to_regex(operators1)


# input is a list of string lines
def clean_gadget(gadget):
    # dictionary; map function name to symbol name + number
    fun_symbols = {}
    # dictionary; map variable name to symbol name + number
    var_symbols = {}

    fun_count = 1
    var_count = 1

    # regular expression to find function name candidates
    rx_fun = re.compile(r'\b([_A-Za-z]\w*)\b(?=\s*\()')
    # regular expression to find variable name candidates
    # rx_var = re.compile(r'\b([_A-Za-z]\w*)\b(?!\s*\()')
    rx_var = re.compile(r'\b([_A-Za-z]\w*)\b((?!\s*\**\w+))(?!\s*\()')

    # final cleaned gadget output to return to interface
    cleaned_gadget = []

    for line in gadget:
        # replace any non-ASCII characters with empty string
        ascii_line = re.sub(r'[^\x00-\x7f]', r'', line)
        # remove all hexadecimal literals
        hex_line = re.sub(r'0[xX][0-9a-fA-F]+', "HEX", ascii_line)
        # return, in order, all regex matches at string list; preserves order for semantics
        user_fun = rx_fun.findall(hex_line)
        user_var = rx_var.findall(hex_line)

        # Could easily make a "clean gadget" type class to prevent duplicate functionality
        # of creating/comparing symbol names for functions and variables in much the same way.
        # The comparison frozenset, symbol dictionaries, and counters would be class scope.
        # So would only need to pass a string list and a string literal for symbol names to
        # another function.
        for fun_name in user_fun:
            if len({fun_name}.difference(main_set)) != 0 and len({fun_name}.difference(keywords)) != 0:
                # check to see if function name already in dictionary
                if fun_name not in fun_symbols.keys():
                    fun_symbols[fun_name] = 'FUN' + str(fun_count)
                    fun_count += 1
                # ensure that only function name gets replaced (no variable name with same
                # identifier); uses positive lookforward
                hex_line = re.sub(r'\b(' + fun_name + r')\b(?=\s*\()', fun_symbols[fun_name], hex_line)

        for var_name in user_var:
            # next line is the nuanced difference between fun_name and var_name
            if len({var_name[0]}.difference(keywords)) != 0 and len({var_name[0]}.difference(main_args)) != 0:
                # check to see if variable name already in dictionary
                if var_name[0] not in var_symbols.keys():
                    var_symbols[var_name[0]] = 'VAR' + str(var_count)
                    var_count += 1
                # ensure that only variable name gets replaced (no function name with same
                # identifier); uses negative lookforward
                # print(var_name, gadget, user_var)
                hex_line = re.sub(r'\b(' + var_name[0] + r')\b(?:(?=\s*\w+\()|(?!\s*\w+))(?!\s*\()',
                                  var_symbols[var_name[0]], hex_line)

        cleaned_gadget.append(hex_line)
    # return the list of cleaned lines
    return cleaned_gadget


# Cleaner & Tokenizer
# Author https://github.com/hazimhanif/svd-transformer/blob/master/transformer_svd.ipynb

def tokenizer(code, flag=False):
    gadget: List[str] = []
    tokenized: List[str] = []
    # remove all string literals
    no_str_lit_line = re.sub(r'["]([^"\\\n]|\\.|\\\n)*["]', '', code)
    # remove all character literals
    no_char_lit_line = re.sub(r"'.*?'", "", no_str_lit_line)
    code = no_char_lit_line

    if flag:
        code = codecs.getdecoder("unicode_escape")(no_char_lit_line)[0]

    for line in code.splitlines():
        if line == '':
            continue
        stripped = line.strip()
        # if "\\n\\n" in stripped: print(stripped)
        gadget.append(stripped)

    clean = clean_gadget(gadget)

    for cg in clean:
        if cg == '':
            continue

        # Remove code comments
        pat = re.compile(r'(/\*([^*]|(\*+[^*\/]))*\*+\/)|(\/\/.*)')
        cg = re.sub(pat, '', cg)

        # Remove newlines & tabs
        cg = re.sub('(\n)|(\\\\n)|(\\\\)|(\\t)|(\\r)', '', cg)
        # Mix split (characters and words)
        splitter = r' +|' + regex_split_operators + r'|(\/)|(\;)|(\-)|(\*)'
        cg = re.split(splitter, cg)

        # Remove None type
        cg = list(filter(None, cg))
        cg = list(filter(str.strip, cg))
        # code = " ".join(code)
        # Return list of tokens
        tokenized.extend(cg)

    return tokenized

In [6]:
pip install torch_geometric

Collecting torch_geometric
  Using cached torch_geometric-2.5.0-py3-none-any.whl.metadata (64 kB)
Collecting aiohttp (from torch_geometric)
  Using cached aiohttp-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->torch_geometric)
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->torch_geometric)
  Using cached frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->torch_geometric)
  Using cached multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->torch_geometric)
  Using cached yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting async-timeout<5.0,>=4.0 (from aiohttp->torch_geometric)
  Using cached async_t

In [14]:
from torch.utils.data import Dataset as TorchDataset
# from torch_geometric.data import DataLoader
from torch_geometric.loader import DataLoader

class InputDataset(TorchDataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        # return self.dataset.iloc[index].input
        data = self.dataset.iloc[index].input
        data.func = self.dataset.iloc[index].func  # 添加现有的func属性, 为 codebert 的输入准备
        return data

    def get_loader(self, batch_size, shuffle=True):
        return DataLoader(dataset=self, batch_size=batch_size, shuffle=shuffle)

In [15]:
def read(path, json_file):
  """
  :param path: str
  :param json_file: str
  :return DataFrame
  """
  return pd.read_json(path + json_file)


def get_ratio(dataset, ratio):
    approx_size = int(len(dataset) * ratio)
    return dataset[:approx_size]


def load(path, pickle_file, ratio=1):
    dataset = pd.read_pickle(path + pickle_file)
    dataset.info(memory_usage='deep')
    if ratio < 1:
        dataset = get_ratio(dataset, ratio)

    return dataset


def write(data_frame: pd.DataFrame, path, file_name):
    data_frame.to_pickle(path + file_name)


def apply_filter(data_frame: pd.DataFrame, filter_func):
    return filter_func(data_frame)


def rename(data_frame: pd.DataFrame, old, new):
    return data_frame.rename(columns={old: new})


def tokenize(data_frame: pd.DataFrame):
    data_frame["tokens"] = data_frame["func"].apply(tokenizer)
    # Change column name
    # data_frame.rename(columns={"func": "tokens"}, inplace=True)
    # Keep just the tokens
    return data_frame[["tokens", "func"]]

def to_files(data_frame: pd.DataFrame, out_path):
    # path = f"{self.out_path}/{self.dataset_name}/"
    if os.path.exists(out_path):
        return
    os.makedirs(out_path)

    for idx, row in data_frame.iterrows():
        file_name = f"{idx}.c"
        with open(out_path + file_name, 'w') as f:
            f.write(row.func)


def create_with_index(data, columns):
    data_frame = pd.DataFrame(data, columns=columns)
    data_frame.index = list(data_frame["Index"])

    return data_frame


def inner_join_by_index(df1, df2):
    return pd.merge(df1, df2, left_index=True, right_index=True)


def train_val_test_split(data_frame: pd.DataFrame, shuffle=True):
    print("Splitting Dataset")

    false = data_frame[data_frame.target == 0]
    true = data_frame[data_frame.target == 1]

    train_false, test_false = train_test_split(false, test_size=0.2, shuffle=shuffle)
    test_false, val_false = train_test_split(test_false, test_size=0.5, shuffle=shuffle)
    train_true, test_true = train_test_split(true, test_size=0.2, shuffle=shuffle)
    test_true, val_true = train_test_split(test_true, test_size=0.5, shuffle=shuffle)

    # train = train_false.append(train_true)
    train = pd.concat([train_false, train_true])

    # val = val_false.append(val_true)
    val = pd.concat([val_false, val_true])

    # test = test_false.append(test_true)
    test =pd.concat([test_false, test_true])

    train = train.reset_index(drop=True)
    val = val.reset_index(drop=True)
    test = test.reset_index(drop=True)

    return InputDataset(train), InputDataset(test), InputDataset(val)


def get_directory_files(directory):
    return [os.path.basename(file) for file in glob.glob(f"{directory}/*.pkl")]


def loads(data_sets_dir, ratio=1):
    data_sets_files = sorted([f for f in listdir(data_sets_dir) if isfile(join(data_sets_dir, f))])

    if ratio < 1:
        data_sets_files = get_ratio(data_sets_files, ratio)

    dataset = load(data_sets_dir, data_sets_files[0])
    data_sets_files.remove(data_sets_files[0])

    for ds_file in data_sets_files:
        #dataset = dataset.append(load(data_sets_dir, ds_file))
        # 使用 pd.concat 来连接两个 DataFrame
        dataset = pd.concat([dataset, load(data_sets_dir, ds_file)])


    return dataset


def clean(data_frame: pd.DataFrame):
    return data_frame.drop_duplicates(subset="func", keep=False)


def drop(data_frame: pd.DataFrame, keys):
    for key in keys:
        del data_frame[key]


def slice_frame(data_frame: pd.DataFrame, size: int):
    data_frame_size = len(data_frame)
    return data_frame.groupby(np.arange(data_frame_size) // size)


In [19]:
def graph_indexing(graph):
    idx = int(graph["file"].split(".c")[0].split("/")[-1])
    del graph["file"]
    return idx, {"functions": [graph]}

def joern_parse(joern_path, input_path, output_path, file_name):
    out_file = file_name + ".bin"

    # 使用 os.path.join 构建绝对路径
    joern_parse_cmd = [os.path.join(joern_path, "joern-parse"), input_path, "--out",
                       os.path.join(output_path, out_file)]

    try:
        # 使用 check=True 处理异常
        joern_parse_call = subprocess.run(joern_parse_cmd, stdout=subprocess.PIPE, text=True, check=True)
        print(str(joern_parse_call))
        return out_file
    except subprocess.CalledProcessError as e:
        print(f"Error while running joern-parse: {e}")
        # 可以根据需要执行其他错误处理操作
        return None


def joern_create(joern_path, in_path, out_path, cpg_files):
    joern_process = subprocess.Popen(["./" + joern_path + "joern"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    json_files = []
    for cpg_file in cpg_files:
        json_file_name = f"{cpg_file.split('.')[0]}.json"
        json_files.append(json_file_name)

        print(in_path+cpg_file)
        if os.path.exists(in_path+cpg_file):
            json_out = f"{os.path.abspath(out_path)}/{json_file_name}"
            import_cpg_cmd = f"importCpg(\"{os.path.abspath(in_path)}/{cpg_file}\")\r".encode()
            script_path = f"{os.path.dirname(os.path.abspath(joern_path))}/graph-for-funcs.sc"
            run_script_cmd = f"cpg.runScript(\"{script_path}\").toString() |> \"{json_out}\"\r".encode()
            joern_process.stdin.write(import_cpg_cmd)
            print(joern_process.stdout.readline().decode())
            joern_process.stdin.write(run_script_cmd)
            print(joern_process.stdout.readline().decode())
            joern_process.stdin.write("delete\r".encode())
            print(joern_process.stdout.readline().decode())
    try:
        outs, errs = joern_process.communicate(timeout=60)
    except subprocess.TimeoutExpired:
        joern_process.kill()
        outs, errs = joern_process.communicate()
    if outs is not None:
        print(f"Outs: {outs.decode()}")
    if errs is not None:
        print(f"Errs: {errs.decode()}")
    return json_files


def json_process(in_path, json_file):
    if os.path.exists(in_path+json_file):
        with open(in_path+json_file) as jf:
            cpg_string = jf.read()
            cpg_string = re.sub(r"io\.shiftleft\.codepropertygraph\.generated\.", '', cpg_string)
            cpg_json = json.loads(cpg_string)
            container = [graph_indexing(graph) for graph in cpg_json["functions"] if graph["file"] != "N/A"]
            return container
    return None


In [20]:
import logging.config

FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s'

logging.basicConfig(filename="logs.log",
                    filemode='a',
                    format=FORMAT,
                    datefmt='%d/%m/%Y %I:%M:%S')


def log_info(logger_name, msg):
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    logger.info(msg)

def log_warning(logger_name, msg):
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.WARNING)
    logger.warning(msg)

In [23]:
pip install gensim==3.8.1

Collecting gensim==3.8.1
  Using cached gensim-3.8.1-cp310-cp310-linux_x86_64.whl
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.2
    Uninstalling gensim-4.3.2:
      Successfully uninstalled gensim-4.3.2
Successfully installed gensim-3.8.1
Note: you may need to restart the kernel to use updated packages.


In [33]:
pip install --upgrade gensim

Collecting gensim
  Using cached gensim-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Using cached gensim-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.8.1
    Uninstalling gensim-3.8.1:
      Successfully uninstalled gensim-3.8.1
Successfully installed gensim-4.3.2
Note: you may need to restart the kernel to use updated packages.


In [34]:
from collections.abc import Iterable
from gensim.models.keyedvectors import Word2VecKeyedVectors
from torch_geometric.data import Data

class NodesEmbedding:
    def __init__(self, nodes_dim: int, w2v_keyed_vectors: Word2VecKeyedVectors):
        self.w2v_keyed_vectors = w2v_keyed_vectors
        self.kv_size = w2v_keyed_vectors.vector_size
        self.nodes_dim = nodes_dim

        assert self.nodes_dim >= 0

        # Buffer for embeddings with padding
        self.target = torch.zeros(self.nodes_dim, self.kv_size + 1).float()

    def __call__(self, nodes):
        embedded_nodes = self.embed_nodes(nodes)
        nodes_tensor = torch.from_numpy(embedded_nodes).float()

        self.target[:nodes_tensor.size(0), :] = nodes_tensor

        return self.target

    def embed_nodes(self, nodes):
        embeddings = []

        for n_id, node in nodes.items():
            # Get node's code
            node_code = node.get_code()
            # Tokenize the code
            tokenized_code = tokenizer(node_code, True)
            if not tokenized_code:
                # print(f"Dropped node {node}: tokenized code is empty.")
                msg = f"Empty TOKENIZED from node CODE {node_code}"
                log_warning('embeddings', msg)
                continue
            # Get each token's learned embedding vector
            vectorized_code = np.array(self.get_vectors(tokenized_code, node))
            # The node's source embedding is the average of it's embedded tokens
            source_embedding = np.mean(vectorized_code, 0)
            # The node representation is the concatenation of label and source embeddings
            embedding = np.concatenate((np.array([node.type]), source_embedding), axis=0)
            embeddings.append(embedding)
        # print(node.label, node.properties.properties.get("METHOD_FULL_NAME"))

        return np.array(embeddings)

    # fromTokenToVectors
    def get_vectors(self, tokenized_code, node):
        vectors = []

        for token in tokenized_code:
            if token in self.w2v_keyed_vectors.vocab:
                vectors.append(self.w2v_keyed_vectors[token])
            else:
                # print(node.label, token, node.get_code(), tokenized_code)
                vectors.append(np.zeros(self.kv_size))
                if node.label not in ["Identifier", "Literal", "MethodParameterIn", "MethodParameterOut"]:
                    msg = f"No vector for TOKEN {token} in {node.get_code()}."
                    log_warning('embeddings', msg)

        return vectors


class GraphsEmbedding:
    def __init__(self, edge_type):
        self.edge_type = edge_type

    def __call__(self, nodes):
        connections = self.nodes_connectivity(nodes)

        return torch.tensor(connections).long()

    # nodesToGraphConnectivity
    def nodes_connectivity(self, nodes):
        # nodes are ordered by line and column
        coo = [[], []]

        for node_idx, (node_id, node) in enumerate(nodes.items()):
            if node_idx != node.order:
                raise Exception("Something wrong with the order")

            for e_id, edge in node.edges.items():
                if edge.type != self.edge_type:
                    continue

                if edge.node_in in nodes and edge.node_in != node_id:
                    coo[0].append(nodes[edge.node_in].order)
                    coo[1].append(node_idx)

                if edge.node_out in nodes and edge.node_out != node_id:
                    coo[0].append(node_idx)
                    coo[1].append(nodes[edge.node_out].order)

        return coo

def nodes_to_input(nodes, target, nodes_dim, keyed_vectors, edge_type):
    nodes_embedding = NodesEmbedding(nodes_dim, keyed_vectors)
    graphs_embedding = GraphsEmbedding(edge_type)
    label = torch.tensor([target]).float()

    return Data(x=nodes_embedding(nodes), edge_index=graphs_embedding(nodes), y=label)


In [35]:
class Properties:
	def __init__(self, props, indentation):
		self.size = len(props)
		self.indentation = indentation + 1
		self.pairs = {prop["key"]: prop["value"] for prop in props}

	def __str__(self):
		indentation = self.indentation * "\t"
		string = ""

		for prop in self.pairs:
			string += f"\n{indentation}Property - {prop} : {self.pairs[prop]}"

		return f"{indentation}{string}\n"

	def code(self):
		if self.has_code():
			code = self.pairs["CODE"]
			if self.has_type() and self.get_type() != "ANY" and self.get_type() not in code:
				code = f"{self.get_type()} {code}"
			return code
		return None

	def get_type(self):
		return self.pairs.get("TYPE_FULL_NAME")

	def has_type(self):
		return "TYPE_FULL_NAME" in self.pairs

	def has_code(self):
		return "CODE" in self.pairs

	def line_number(self):
		return self.pairs["LINE_NUMBER"] if self.has_line_number() else None

	def has_line_number(self):
		return "LINE_NUMBER" in self.pairs

	def column_number(self):
		return self.pairs["COLUMN_NUMBER"] if self.has_column_number() else None

	def has_column_number(self):
		return "COLUMN_NUMBER" in self.pairs

	def get(self):
		return self.pairs

	def get_operator(self):
		value = self.pairs.get("METHOD_FULL_NAME")
		if value is None:
			return value
		if ("<operator>" in value) or ("<operators>" in value):
			return value.split(".")[-1]
		return None

class Edge:
	def __init__(self, edge, indentation):
		self.id = edge["id"].split(".")[-1]
		self.type = self.id.split("@")[0]
		self.node_in = edge["in"].split(".")[-1]
		self.node_out = edge["out"].split(".")[-1]
		self.indentation = indentation + 1

	def __str__(self):
		indentation = self.indentation * "\t"
		return f"\n{indentation}Edge id: {self.id}\n{indentation}Node in: {self.node_in}\n{indentation}Node out: {self.node_out}\n"


node_labels = ["Block", "Call", "Comment", "ControlStructure", "File", "Identifier", "FieldIdentifier", "Literal",
               "Local", "Member", "MetaData", "Method", "MethodInst", "MethodParameterIn", "MethodParameterOut",
               "MethodReturn", "Namespace", "NamespaceBlock", "Return", "Type", "TypeDecl", "Unknown"]

operators = ['addition', 'addressOf', 'and', 'arithmeticShiftRight', 'assignment',
             'assignmentAnd', 'assignmentArithmeticShiftRight', 'assignmentDivision',
             'assignmentMinus', 'assignmentMultiplication', 'assignmentOr', 'assignmentPlus',
             'assignmentShiftLeft', 'assignmentXor', 'cast', 'conditionalExpression',
             'division', 'equals', 'fieldAccess', 'greaterEqualsThan', 'greaterThan',
             'indirectFieldAccess', 'indirectIndexAccess', 'indirection', 'lessEqualsThan',
             'lessThan', 'logicalAnd', 'logicalNot', 'logicalOr', 'minus', 'modulo', 'multiplication',
             'not', 'notEquals', 'or', 'postDecrement', 'plus', 'postIncrement', 'preDecrement',
             'preIncrement', 'shiftLeft', 'sizeOf', 'subtraction']

node_labels += operators

node_labels = {label: i for i, label in enumerate(node_labels)}

PRINT_PROPS = True


class Node:
    def __init__(self, node, indentation):
        self.id = node["id"].split(".")[-1]
        self.label = self.id.split("@")[0]
        self.indentation = indentation + 1
        self.properties = Properties(node["properties"], self.indentation)
        self.edges = {edge["id"].split(".")[-1]: Edge(edge, self.indentation) for edge in node["edges"]}
        self.order = None
        operator = self.properties.get_operator()
        self.label = operator if operator is not None else self.label
        self._set_type()

    def __str__(self):
        indentation = self.indentation * "\t"
        properties = f"{indentation}Properties: {self.properties}\n"
        edges_str = ""

        for edge in self.edges:
            edges_str += f"{self.edges[edge]}"

        return f"\n{indentation}Node id: {self.id}\n{properties if PRINT_PROPS else ''}{indentation}Edges: {edges_str}"

    def connections(self, connections, e_type):
        for e_id, edge in self.edges.items():
            if edge.type != e_type: continue

            if edge.node_in in connections["in"] and edge.node_in != self.id:
                connections["in"][self.id] = edge.node_in

            if edge.node_out in connections["out"] and edge.node_out != self.id:
                connections["out"][self.id] = edge.node_out

        return connections

    def has_code(self):
        return self.properties.has_code()

    def has_line_number(self):
        return self.properties.has_line_number()

    def get_code(self):
        return self.properties.code()

    def get_line_number(self):
        return self.properties.line_number()

    def get_column_number(self):
        return self.properties.column_number()

    def _set_type(self):
        # label = self.label if self.operator is None else self.operator
        self.type = node_labels.get(self.label)  # Label embedding

        if self.type is None:
            log_warning("node", f"LABEL {self.label} not in labels!")
            self.type = len(node_labels) + 1

class AST:
    def __init__(self, nodes, indentation):
        self.size = len(nodes)
        self.indentation = indentation + 1
        self.nodes = {node["id"].split(".")[-1]: Node(node, self.indentation) for node in nodes}

    def __str__(self):
        indentation = self.indentation * "\t"
        nodes_str = ""

        for node in self.nodes:
            nodes_str += f"{indentation}{self.nodes[node]}"

        return f"\n{indentation}Size: {self.size}\n{indentation}Nodes:{nodes_str}"

    def get_nodes_type(self):
        return {n_id: node.type for n_id, node in self.nodes.items()}

class Function:
    def __init__(self, function):
        self.name = function["function"]
        self.id = function["id"].split(".")[-1]
        self.indentation = 1
        self.ast = AST(function["AST"], self.indentation)

    def __str__(self):
        indentation = self.indentation * "\t"
        return f"{indentation}Function Name: {self.name}\n{indentation}Id: {self.id}\n{indentation}AST:{self.ast}"

    def get_nodes(self):
        return self.ast.nodes

    def get_nodes_types(self):
        return self.ast.get_nodes_type()

In [36]:
from collections import OrderedDict


def order_nodes(nodes, max_nodes):
    # sorts nodes by line and column

    nodes_by_column = sorted(nodes.items(), key=lambda n: n[1].get_column_number())
    nodes_by_line = sorted(nodes_by_column, key=lambda n: n[1].get_line_number())

    for i, node in enumerate(nodes_by_line):
        node[1].order = i

    if len(nodes) > max_nodes:
        print(f"CPG cut - original nodes: {len(nodes)} to max: {max_nodes}")
        return OrderedDict(nodes_by_line[:max_nodes])

    return OrderedDict(nodes_by_line)


def filter_nodes(nodes):
    return {n_id: node for n_id, node in nodes.items() if node.has_code() and
            node.has_line_number() and
            node.label not in ["Comment", "Unknown"]}

def parse_to_nodes(cpg, max_nodes=500):
    nodes = {}
    for function in cpg["functions"]:
        func = Function(function)
        # Only nodes with code and line number are selected
        filtered_nodes = filter_nodes(func.get_nodes())
        nodes.update(filtered_nodes)

    return order_nodes(nodes, max_nodes)

In [46]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn.conv import GatedGraphConv
import torch
torch.manual_seed(2020)
import dataclasses
from dataclasses import dataclass
from typing import List

def get_conv_mp_out_size(in_size, last_layer, mps):
    size = in_size

    for mp in mps:
        size = round((size - mp["kernel_size"]) / mp["stride"] + 1)

    size = size + 1 if size % 2 != 0 else size

    return int(size * last_layer["out_channels"])


def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.Conv1d:
        torch.nn.init.xavier_uniform_(m.weight)


class Conv(nn.Module):

    def __init__(self, conv1d_1, conv1d_2, maxpool1d_1, maxpool1d_2, fc_1_size, fc_2_size):
        super(Conv, self).__init__()
        self.conv1d_1_args = conv1d_1
        self.conv1d_1 = nn.Conv1d(**conv1d_1)
        self.conv1d_2 = nn.Conv1d(**conv1d_2)

        fc1_size = get_conv_mp_out_size(fc_1_size, conv1d_2, [maxpool1d_1, maxpool1d_2])
        fc2_size = get_conv_mp_out_size(fc_2_size, conv1d_2, [maxpool1d_1, maxpool1d_2])

        # Dense layers
        self.fc1 = nn.Linear(fc1_size, 1)
        self.fc2 = nn.Linear(fc2_size, 1)

        # Dropout
        self.drop = nn.Dropout(p=0.2)

        self.mp_1 = nn.MaxPool1d(**maxpool1d_1)
        self.mp_2 = nn.MaxPool1d(**maxpool1d_2)

    def forward(self, hidden, x):
        concat = torch.cat([hidden, x], 1)
        concat_size = hidden.shape[1] + x.shape[1]
        concat = concat.view(-1, self.conv1d_1_args["in_channels"], concat_size)

        Z = self.mp_1(F.relu(self.conv1d_1(concat)))
        Z = self.mp_2(self.conv1d_2(Z))

        hidden = hidden.view(-1, self.conv1d_1_args["in_channels"], hidden.shape[1])

        Y = self.mp_1(F.relu(self.conv1d_1(hidden)))
        Y = self.mp_2(self.conv1d_2(Y))

        Z_flatten_size = int(Z.shape[1] * Z.shape[-1])
        Y_flatten_size = int(Y.shape[1] * Y.shape[-1])

        Z = Z.view(-1, Z_flatten_size)
        Y = Y.view(-1, Y_flatten_size)
        res = self.fc1(Z) * self.fc2(Y)
        res = self.drop(res)
        # res = res.mean(1)
        # print(res, mean)
        sig = torch.sigmoid(torch.flatten(res))
        return sig

def encode_input(text, tokenizer):
    max_length = 512
    input = tokenizer(text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
#     print(input.keys())
    return input.input_ids, input.attention_mask
    

class BertGGCN(nn.Module):

    def __init__(self, gated_graph_conv_args, conv_args, emb_size, device):
        super(BertGGCN, self).__init__()
        self.k = 0.3
        self.ggc = GatedGraphConv(**gated_graph_conv_args).to(device)
        self.conv = Conv(**conv_args,
                         fc_1_size=gated_graph_conv_args["out_channels"] + emb_size,
                         fc_2_size=gated_graph_conv_args["out_channels"]).to(device)
        self.nb_class = 2
        # self.conv.apply(init_weights)

    def forward(self, data):
        # the DataLoader format
        # DataBatch(x=[1640, 101], edge_index=[2, 933], y=[8], func=[8], batch=[1640], ptr=[9])
        
        x, edge_index = data.x, data.edge_index
        x = self.ggc(x, edge_index)
        x = self.conv(x, data.x)

        return x

    def save(self, path):
        print(path)
        torch.save(self.state_dict(), path)
        print("save!!!!!!")

    def load(self, path):
        self.load_state_dict(torch.load(path))

def softmax_accuracy(probs, all_labels):
    acc = (torch.argmax(probs) == all_labels).sum()
    acc = torch.div(acc, len(all_labels) + 0.0)
    return acc

class Stat:
    def __init__(self, outs=None, loss=0.0, acc=0.0, labels=None):
        if labels is None:
            labels = []
        if outs is None:
            outs = []
        self.outs = outs
        self.labels = labels
        self.loss = loss
        self.acc = acc

    def __add__(self, other):
        return Stat(self.outs + other.outs, self.loss + other.loss, self.acc + other.acc, self.labels + other.labels)

    def __str__(self):
        return f"Loss: {round(self.loss, 4)}; Acc: {round(self.acc, 4)};"


@dataclass
class Stats:
    name: str
    results: List[Stat] = dataclasses.field(default_factory=list)
    total: Stat = Stat()

    def __call__(self, stat):
        self.total += stat
        self.results.append(stat)

    def __str__(self):
        return f"{self.name} {self.mean()}"

    def __len__(self):
        return len(self.results)

    def mean(self):
        res = Stat()
        res += self.total
        res.loss /= len(self)
        res.acc /= len(self)

        return res

    def loss(self):
        return self.mean().loss

    def acc(self):
        return self.mean().acc

    def outs(self):
        return self.total.outs

    def labels(self):
        return self.total.labels


class Step:
    # Performs a step on the loader and returns the result
    def __init__(self, model, loss_function, optimizer):
        self.model = model
        self.criterion = loss_function
        self.optimizer = optimizer

    def __call__(self, i, x, y):
        out = self.model(x)
        # y= y.type(torch.LongTensor)
        loss = self.criterion(out, y.float())
        acc = softmax_accuracy(out, y.float())

        if self.model.training:
            # calculates the gradient
            loss.backward()
            # and performs a parameter update based on it
            self.optimizer.step()
            # clears old gradients from the last step
            self.optimizer.zero_grad()

        # print(f"\tBatch: {i}; Loss: {round(loss.item(), 4)}", end="")
        return Stat(out.tolist(), loss.item(), acc.item(), y.tolist())

    def train(self):
        self.model.train()

    def eval(self):
        self.model.eval()

class LoaderStep:
    def __init__(self, name, data_loader, device):
        self.name = name
        self.loader = data_loader
        self.size = len(data_loader)
        self.device = device

    def __call__(self, step):
        self.stats = Stats(self.name)

        for i, batch in enumerate(self.loader):
            batch.to(self.device)
            stat: Stat = step(i, batch, batch.y)
            self.stats(stat)

        return self.stats

class Devign(Step):
    def __init__(self,
                 path: str,
                 device: str,
                 model: dict,
                 learning_rate: float,
                 weight_decay: float,
                 loss_lambda: float):
        self.path = path
        self.lr = learning_rate
        self.wd = weight_decay
        self.ll = loss_lambda
        log_info('devign', f"LR: {self.lr}; WD: {self.wd}; LL: {self.ll};")
        _model = BertGGCN(**model, device=device)
        super().__init__(model=_model,
                         loss_function=lambda o, t: F.binary_cross_entropy(o, t) + F.l1_loss(o, t) * self.ll,
                         optimizer=optim.Adam(_model.parameters(), lr=self.lr, weight_decay=self.wd),
                         )


        self.count_parameters()

    def load(self):
        self.model.load(self.path)

    def save(self):
        self.model.save(self.path)

    def count_parameters(self):
        count = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print(f"The model has {count:,} trainable parameters")


In [38]:
import time
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn import metrics

class Metrics:
    def __init__(self, outs, labels):
        self.scores = outs
        self.labels = labels
        self.transform()
        print(self.predicts)

    def transform(self):
        self.series = pd.Series(self.scores)
        self.predicts = self.series.apply(lambda x: 1 if x >= 0.5 else 0)
        self.predicts.reset_index(drop=True, inplace=True)

    def __str__(self):
        confusion = confusion_matrix(y_true=self.labels, y_pred=self.predicts)
        tn, fp, fn, tp = confusion.ravel()
        string = f"\nConfusion matrix: \n"
        string += f"{confusion}\n"
        string += f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}\n"
        string += '\n'.join([name + ": " + str(metric) for name, metric in self().items()])
        return string

    def __call__(self):
        _metrics = {"Accuracy": metrics.accuracy_score(y_true=self.labels, y_pred=self.predicts),
                    "Precision": metrics.precision_score(y_true=self.labels, y_pred=self.predicts),
                    "Recall": metrics.recall_score(y_true=self.labels, y_pred=self.predicts),
                    "F-measure": metrics.f1_score(y_true=self.labels, y_pred=self.predicts),
                    "Precision-Recall AUC": metrics.average_precision_score(y_true=self.labels, y_score=self.scores),
                    "AUC": metrics.roc_auc_score(y_true=self.labels, y_score=self.scores),
                    "MCC": metrics.matthews_corrcoef(y_true=self.labels, y_pred=self.predicts),
                    "Error": self.error()}

        return _metrics

    def log(self):
        excluded = ["Precision-Recall AUC", "AUC"]
        _metrics = self()
        rounded_metrics = {name: torch.round(torch.tensor(metric) * 1000) / 1000 for name, metric in _metrics.items() if
                           name not in excluded}
        msg = ' - '.join([f"({name[:3]} {round(metric.item(), 3)})" for name, metric in rounded_metrics.items()])

        # logger.log_info('metrics', msg)

    def error(self):
        errors = [(abs(score - (1 if score >= 0.5 else 0))/score)*100 for score, label in zip(self.scores, self.labels)]

        return sum(errors)/len(errors)

class Train(object):
    def __init__(self, step, epochs, verbose=True):
        self.epochs = epochs
        self.step = step
        self.history = History()
        self.verbose = verbose

    def __call__(self, train_loader_step, val_loader_step=None, early_stopping=None):
        for epoch in range(self.epochs):
            self.step.train()
            train_stats = train_loader_step(self.step)
            self.history(train_stats, epoch + 1)

            if val_loader_step is not None:
                with torch.no_grad():
                    self.step.eval()
                    val_stats = val_loader_step(self.step)
                    self.history(val_stats, epoch + 1)

                print(self.history)

                if early_stopping is not None:
                    valid_loss = val_stats.loss()
                    # early_stopping needs the validation loss to check if it has decreased,
                    # and if it has, it will make a checkpoint of the current model
                    if early_stopping(valid_loss):
                        self.history.log()
                        return
            else:
                print(self.history)
        self.history.log()


def predict(step, test_loader_step):
    print(f"Testing")
    with torch.no_grad():
        step.eval()
        stats = test_loader_step(step)
        metrics = Metrics(stats.outs(), stats.labels())
        print(metrics)
        metrics.log()
    return metrics()["Accuracy"]


class History:
    def __init__(self):
        self.history = {}
        self.epoch = 0
        self.timer = time.time()

    def __call__(self, stats, epoch):
        self.epoch = epoch

        if epoch in self.history:
            self.history[epoch].append(stats)
        else:
            self.history[epoch] = [stats]

    def __str__(self):
        epoch = f"\nEpoch {self.epoch};"
        stats = ' - '.join([f"{res}" for res in self.current()])
        timer = f"Time: {(time.time() - self.timer)}"

        return f"{epoch} - {stats} - {timer}"

    def current(self):
        return self.history[self.epoch]

    def log(self):
        msg = f"(Epoch: {self.epoch}) {' - '.join([f'({res})' for res in self.current()])}"
        log_info("history", msg)


In [39]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""

    def __init__(self, model, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.model = model

    def __call__(self, val_loss):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
                print("Early stopping")
        else:
            self.best_score = score
            self.save_checkpoint(val_loss)
            self.counter = 0

        return self.early_stop

    def save_checkpoint(self, val_loss):
        """
            Saves model when validation loss decrease.
        """
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        self.model.save()
        self.val_loss_min = val_loss

In [43]:
import subprocess
import os.path
from gensim.models.word2vec import Word2Vec

def embed_task():
    context = Embed()
    # Tokenize source code into tokens
    dataset_files = get_directory_files(PATHS.cpg)
    w2vmodel = Word2Vec(**context.w2v_args)
    w2v_init = True
    for pkl_file in dataset_files:
        file_name = pkl_file.split(".")[0]
        cpg_dataset = load(PATHS.cpg, pkl_file)
        tokens_dataset = tokenize(cpg_dataset)
        write(tokens_dataset, PATHS.tokens, f"{file_name}_{FILES.tokens}")
        # word2vec used to learn the initial embedding of each token
        w2vmodel.build_vocab(tokens_dataset.tokens, update=not w2v_init)
        w2vmodel.train(tokens_dataset.tokens, total_examples=w2vmodel.corpus_count, epochs=1)
        if w2v_init:
            w2v_init = False

        # cpg_dataset = load(PATHS.cpg, pkl_file)
        # Embed cpg to node representation and pass to graph data structure
        cpg_dataset["nodes"] = cpg_dataset.apply(lambda row: parse_to_nodes(row.cpg, context.nodes_dim), axis=1)
        # remove rows with no nodes
        # Use key_to_index instead of vocab
        cpg_dataset["input"] = cpg_dataset.apply(lambda row: nodes_to_input(row.nodes, row.target, context.nodes_dim,
                                  w2vmodel.wv, context.edge_type), axis=1)

        drop(cpg_dataset, ["nodes"])
        print(f"Saving input dataset {file_name} with size {len(cpg_dataset)}.")
        # write(cpg_dataset[["input", "target"]], PATHS.input, f"{file_name}_{FILES.input}")

        # 为适应 codebert 的输入，我们需要额外的保存一行 func
        # write(cpg_dataset[["input", "target","func"]], PATHS.input, f"{file_name}_{FILES.input}")
        write(cpg_dataset[["input", "target", "func"]], PATHS.input, f"{file_name}_{FILES.input}")
        
        del cpg_dataset
        gc.collect()
    print("Saving w2vmodel.")
    # w2vmodel.save(f"{PATHS.w2v}/{FILES.w2v}")


def process_task(stopping):
    context = Process()
    devign = Devign_class()
    # model_path = PATHS.model + FILES.model

    model_path ="Codebert_ggnn_verison_1.pth"
    model = Devign(path=model_path, device=DEVICE, model=devign.model, learning_rate=devign.learning_rate,
                           weight_decay=devign.weight_decay,
                           loss_lambda=devign.loss_lambda)
    train = Train(model, context.epochs)
    input_dataset = loads(PATHS.input)
    # split the dataset and pass to DataLoader with batch size
    train_loader, val_loader, test_loader = list(
        map(lambda x: x.get_loader(context.batch_size, shuffle=context.shuffle),
            train_val_test_split(input_dataset, shuffle=context.shuffle)))
    train_loader_step = LoaderStep("Train", train_loader, DEVICE)
    val_loader_step = LoaderStep("Validation", val_loader, DEVICE)
    test_loader_step = LoaderStep("Test", test_loader, DEVICE)

    if stopping:
        early_stopping = EarlyStopping(model, patience=context.patience)
        train(train_loader_step, val_loader_step, early_stopping)
        model.load()
    else:
        train(train_loader_step, val_loader_step)
        print("Saving modeling !!!")
        model.save()

    predict(model, test_loader_step)

In [44]:
embed_task()

<class 'pandas.core.frame.DataFrame'>
Index: 298 entries, 900 to 1199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  298 non-null    int64 
 1   func    298 non-null    object
 2   Index   298 non-null    int64 
 3   cpg     298 non-null    object
dtypes: int64(2), object(2)
memory usage: 264.9 KB
CPG cut - original nodes: 237 to max: 205
CPG cut - original nodes: 212 to max: 205
Saving input dataset 3_cpg with size 298.
<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 2100 to 2399
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  297 non-null    int64 
 1   func    297 non-null    object
 2   Index   297 non-null    int64 
 3   cpg     297 non-null    object
dtypes: int64(2), object(2)
memory usage: 265.1 KB
Saving input dataset 7_cpg with size 297.
<class 'pandas.core.frame.DataFrame'>
Index: 294 entries, 2700 to 2999
Data columns (total 4 

In [47]:
process_task(True)

The model has 515,542 trainable parameters
<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 299
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   297 non-null    object
 1   target  297 non-null    int64 
 2   func    297 non-null    object
dtypes: int64(1), object(2)
memory usage: 214.5 KB
<class 'pandas.core.frame.DataFrame'>
Index: 301 entries, 3000 to 3299
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   301 non-null    object
 1   target  301 non-null    int64 
 2   func    301 non-null    object
dtypes: int64(1), object(2)
memory usage: 216.2 KB
<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 3300 to 3599
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   297 non-null    object
 1   target  297 non-null    int64 
 2   func    297 non-null    object
dtypes: in

  return F.conv1d(input, weight, bias, self.stride,



Epoch 1; - Train Loss: 0.6933; Acc: 0.1665; - Validation Loss: 0.6903; Acc: 0.1122; - Time: 6.619290828704834
Codebert_ggnn_verison_1.pth
save!!!!!!

Epoch 2; - Train Loss: 0.6912; Acc: 0.1285; - Validation Loss: 0.6934; Acc: 0.1199; - Time: 9.893531560897827
EarlyStopping counter: 1 out of 10

Epoch 3; - Train Loss: 0.6836; Acc: 0.1362; - Validation Loss: 0.6998; Acc: 0.1224; - Time: 13.108749866485596
EarlyStopping counter: 2 out of 10

Epoch 4; - Train Loss: 0.6616; Acc: 0.1344; - Validation Loss: 0.7412; Acc: 0.0612; - Time: 16.382174968719482
EarlyStopping counter: 3 out of 10

Epoch 5; - Train Loss: 0.6296; Acc: 0.1274; - Validation Loss: 0.7873; Acc: 0.1378; - Time: 19.71784734725952
EarlyStopping counter: 4 out of 10

Epoch 6; - Train Loss: 0.5965; Acc: 0.1315; - Validation Loss: 0.7699; Acc: 0.1531; - Time: 23.188239812850952
EarlyStopping counter: 5 out of 10

Epoch 7; - Train Loss: 0.5589; Acc: 0.1108; - Validation Loss: 0.8558; Acc: 0.0689; - Time: 26.71869921684265
EarlyS

In [19]:
process_task(True)

  return self.fget.__get__(instance, owner)()


The model has 125,162,712 trainable parameters
<class 'pandas.core.frame.DataFrame'>
Index: 497 entries, 2 to 3330
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   497 non-null    object
 1   target  497 non-null    int64 
 2   func    497 non-null    object
dtypes: int64(1), object(2)
memory usage: 345.9 KB
<class 'pandas.core.frame.DataFrame'>
Index: 497 entries, 3331 to 6789
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   497 non-null    object
 1   target  497 non-null    int64 
 2   func    497 non-null    object
dtypes: int64(1), object(2)
memory usage: 366.3 KB
Splitting Dataset

Epoch 1; - Train Loss: 0.9325; Acc: 0.16; - Validation Loss: 0.698; Acc: 0.0769; - Time: 3.0846588611602783
/hy-tmp/data/model/Codebert_ggnn_verison_1.pth
save!!!!!!

Epoch 2; - Train Loss: 0.6988; Acc: 0.18; - Validation Loss: 0.6947; Acc: 0.0769; - Time: 6.2476174831

## Details of embed_task

In [30]:
context = Embed()
# Tokenize source code into tokens
dataset_files = get_directory_files(PATHS.cpg)
w2vmodel = Word2Vec(**context.w2v_args)
w2v_init = True

In [31]:
dataset_files

['0_cpg.pkl', '1_cpg.pkl']

In [32]:
pkl_file = dataset_files[0]

def tokenize(data_frame: pd.DataFrame):
    data_frame.func = data_frame.func.apply(tokenizer)
    # Change column name
    data_frame = rename(data_frame, 'func', 'tokens')
    # Keep just the tokens
    return data_frame[["tokens"]]
    
file_name = pkl_file.split(".")[0]
cpg_dataset = load(PATHS.cpg, pkl_file)
tokens_dataset = tokenize(cpg_dataset)

<class 'pandas.core.frame.DataFrame'>
Index: 497 entries, 2 to 3330
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  497 non-null    int64 
 1   func    497 non-null    object
 2   Index   497 non-null    int64 
 3   cpg     497 non-null    object
dtypes: int64(2), object(2)
memory usage: 439.0 KB


In [33]:
cpg_dataset

Unnamed: 0,target,func,Index,cpg
2,0,"[static, void, FUN1, (, void, *, VAR1, ,, uint...",2,{'functions': [{'function': 'v4l2_free_buffer'...
4,0,"[int, FUN1, (, cl_mem, VAR1, ,, uint8_t, *, VA...",4,{'functions': [{'function': 'av_opencl_buffer_...
5,1,"[static, int, FUN1, (, AVFormatContext, *, VAR...",5,"{'functions': [{'function': 'r3d_read_rdvo', '..."
11,1,"[void, FUN1, (, AVDictionary, *, VAR1, ), {, A...",11,{'functions': [{'function': 'assert_avoptions'...
19,0,"[av_cold, void, FUN1, (, AVCodecContext, *, VA...",19,{'functions': [{'function': 'ff_af_queue_init'...
...,...,...,...,...
3315,0,"[static, int, FUN1, (, FTPContext, *, VAR1, ),...",3315,{'functions': [{'function': 'ftp_flush_control...
3323,1,"[int64_t, FUN1, (, int64_t, VAR1, ,, int64_t, ...",3323,"{'functions': [{'function': 'av_gcd', 'id': 'n..."
3324,1,"[static, inline, int, FUN1, (, APEContext, *, ...",3324,{'functions': [{'function': 'ape_decode_value_...
3325,1,"[int, FUN1, (, AVCodecContext, *, VAR1, ,, AVP...",3325,{'functions': [{'function': 'avcodec_decode_vi...


In [34]:
tokens_dataset

Unnamed: 0,tokens
2,"[static, void, FUN1, (, void, *, VAR1, ,, uint..."
4,"[int, FUN1, (, cl_mem, VAR1, ,, uint8_t, *, VA..."
5,"[static, int, FUN1, (, AVFormatContext, *, VAR..."
11,"[void, FUN1, (, AVDictionary, *, VAR1, ), {, A..."
19,"[av_cold, void, FUN1, (, AVCodecContext, *, VA..."
...,...
3315,"[static, int, FUN1, (, FTPContext, *, VAR1, ),..."
3323,"[int64_t, FUN1, (, int64_t, VAR1, ,, int64_t, ..."
3324,"[static, inline, int, FUN1, (, APEContext, *, ..."
3325,"[int, FUN1, (, AVCodecContext, *, VAR1, ,, AVP..."


In [37]:
w2vmodel.build_vocab(tokens_dataset.tokens, update=not w2v_init)
w2vmodel.train(tokens_dataset.tokens, total_examples=w2vmodel.corpus_count, epochs=1)
if w2v_init:
    w2v_init = False
# Embed cpg to node representation and pass to graph data structure
cpg_dataset["nodes"] = cpg_dataset.apply(lambda row: parse_to_nodes(row.cpg, context.nodes_dim), axis=1)
# remove rows with no nodes

CPG cut - original nodes: 264 to max: 205
CPG cut - original nodes: 263 to max: 205
CPG cut - original nodes: 236 to max: 205
CPG cut - original nodes: 275 to max: 205
CPG cut - original nodes: 215 to max: 205
CPG cut - original nodes: 215 to max: 205


In [40]:
# Use key_to_index instead of vocab
cpg_dataset["input"] = cpg_dataset.apply(lambda row: nodes_to_input(row.nodes, row.target, context.nodes_dim,
                          w2vmodel.wv, context.edge_type), axis=1)

In [41]:
cpg_dataset

Unnamed: 0,target,func,Index,cpg,nodes,input
0,0,"[FUN1, (, int, VAR1, ,, int, VAR2, ,, int, VAR...",0,"{'functions': [{'function': 'clear_area', 'id'...",{'MethodReturn[label=METHOD_RETURN; id=1808504...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."
1,0,"[FUN1, (, VAR1, *, VAR2, ), {, VAR1, *, VAR3, ...",1,{'functions': [{'function': 'ReconstructDuList...,{'MethodReturn[label=METHOD_RETURN; id=2712756...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."
2,0,"[FUN1, (, void, ), {, if, (, VAR1, ), FUN2, (,...",2,"{'functions': [{'function': 'free_speaker', 'i...",{'MethodReturn[label=METHOD_RETURN; id=1265953...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."
3,0,"[FUN1, (, struct, mlx4_dev, *, VAR1, ), {, str...",3,{'functions': [{'function': 'mlx4_register_dev...,{'MethodReturn[label=METHOD_RETURN; id=2260630...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."
4,1,"[FUN1, (, void, ), {, char, *, VAR1, =, FUN2, ...",4,"{'functions': [{'function': 'Parse_Env_Var', '...",{'MethodReturn[label=METHOD_RETURN; id=3255307...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."
...,...,...,...,...,...,...
95,0,"[FUN1, (, tree, VAR1, ,, gimple, VAR2, ,, bool...",95,"{'functions': [{'function': 'create_access', '...",{'MethodReturn[label=METHOD_RETURN; id=8771245...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."
96,0,"[FUN1, (, VAR1, *, VAR2, ,, VAR3, *, VAR4, ,, ...",96,{'functions': [{'function': 'mailimap_body_ext...,{'MethodReturn[label=METHOD_RETURN; id=8861671...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."
97,0,"[FUN1, (, struct, i2c_client, *, VAR1, ), {, u...",97,"{'functions': [{'function': 'i2c_read_le16', '...",{'MethodReturn[label=METHOD_RETURN; id=8952096...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."
98,1,"[FUN1, (, int, VAR1, ), {, char, VAR2, [, 20, ...",98,"{'functions': [{'function': 'draw_keys', 'id':...",{'MethodReturn[label=METHOD_RETURN; id=9042521...,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782..."


In [43]:
cpg_dataset[["input", "target","func"]]

Unnamed: 0,input,target,func
0,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",0,"[FUN1, (, int, VAR1, ,, int, VAR2, ,, int, VAR..."
1,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",0,"[FUN1, (, VAR1, *, VAR2, ), {, VAR1, *, VAR3, ..."
2,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",0,"[FUN1, (, void, ), {, if, (, VAR1, ), FUN2, (,..."
3,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",0,"[FUN1, (, struct, mlx4_dev, *, VAR1, ), {, str..."
4,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",1,"[FUN1, (, void, ), {, char, *, VAR1, =, FUN2, ..."
...,...,...,...
95,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",0,"[FUN1, (, tree, VAR1, ,, gimple, VAR2, ,, bool..."
96,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",0,"[FUN1, (, VAR1, *, VAR2, ,, VAR3, *, VAR4, ,, ..."
97,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",0,"[FUN1, (, struct, i2c_client, *, VAR1, ), {, u..."
98,"[(x, [tensor([ 6.6000e+01, 2.0672e-03, 9.782...",1,"[FUN1, (, int, VAR1, ), {, char, VAR2, [, 20, ..."


## Details of process_task

In [37]:
context = Process()
data_sets_dir = PATHS.input
data_sets_dir

'/hy-tmp/data/input/'

In [38]:
data_sets_files = sorted([f for f in listdir(data_sets_dir) if isfile(join(data_sets_dir, f))])

In [39]:
data_sets_files

['0_cpg_input.pkl', '1_cpg_input.pkl']

In [43]:
def load(path, pickle_file, ratio=1):
    dataset = pd.read_pickle(path + pickle_file)
    # dataset.info(memory_usage='deep')
    if ratio < 1:
        dataset = get_ratio(dataset, ratio)

    return dataset

In [42]:
dataset = load(data_sets_dir, data_sets_files[0])
data_sets_files.remove(data_sets_files[0])

<class 'pandas.core.frame.DataFrame'>
Index: 497 entries, 2 to 3330
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   497 non-null    object
 1   target  497 non-null    int64 
 2   func    497 non-null    object
dtypes: int64(1), object(2)
memory usage: 662.1 KB


In [45]:
# Data: 这表示该对象是一个 PyTorch Geometric 中的数据对象。
# x=[205, 101]: x 是数据对象的特征矩阵，它的形状是 [205, 101]，意味着有 205 个样本（节点）和每个样本有 101 个特征。
# edge_index=[2, 96]: edge_index 表示图的边索引，它的形状是 [2, 96]，表示图中有 96 条边，其中每条边由两个节点索引构成。
# y=[1]: y 是目标变量，它的形状是 [1]，表示有一个目标值。

dataset["input"][4]

Data(x=[205, 101], edge_index=[2, 128], y=[1])

In [46]:
type(dataset["input"][4])

torch_geometric.data.data.Data

In [47]:
# 相当于将所有的 data 从 pikle 中加载出来然后全部拼接起来
for ds_file in data_sets_files:
    #dataset = dataset.append(load(data_sets_dir, ds_file))
    # 使用 pd.concat 来连接两个 DataFrame
    dataset = pd.concat([dataset, load(data_sets_dir, ds_file)])

In [48]:
num_data = len(dataset)
print("数据集中的数据数量：", num_data)

数据集中的数据数量： 994


In [49]:
dataset

Unnamed: 0,input,target,func
2,"[(x, [tensor([ 1.5000e+01, 4.6916e-04, 1.431...",0,"[static, void, FUN1, (, void, *, VAR1, ,, uint..."
4,"[(x, [tensor([ 1.5000e+01, 2.6397e-04, -1.506...",0,"[int, FUN1, (, cl_mem, VAR1, ,, uint8_t, *, VA..."
5,"[(x, [tensor([ 1.5000e+01, -1.0787e-03, -6.084...",1,"[static, int, FUN1, (, AVFormatContext, *, VAR..."
11,"[(x, [tensor([ 1.5000e+01, 2.5857e-03, 7.315...",1,"[void, FUN1, (, AVDictionary, *, VAR1, ), {, A..."
19,"[(x, [tensor([ 1.5000e+01, 1.5140e-03, -9.458...",0,"[av_cold, void, FUN1, (, AVCodecContext, *, VA..."
...,...,...,...
6773,"[(x, [tensor([ 1.5000e+01, -1.3791e-03, -3.776...",1,"[int, FUN1, (, AVIOContext, *, VAR1, ,, unsign..."
6774,"[(x, [tensor([ 1.5000e+01, -2.2478e-03, -1.673...",1,"[static, int, FUN1, (, struct, playlist, *, VA..."
6776,"[(x, [tensor([ 1.5000e+01, 1.4559e-03, -7.945...",1,"[void, FUN1, (, unsigned, VAR1, ,, unsigned, *..."
6786,"[(x, [tensor([ 1.5000e+01, -3.5779e-04, 3.149...",1,"[static, void, FUN1, (, CinepakEncContext, *, ..."


In [50]:
type(dataset)

pandas.core.frame.DataFrame

In [51]:
# split the dataset and pass to DataLoader with batch size

train_loader, val_loader, test_loader = list(
        map(lambda x: x.get_loader(context.batch_size, shuffle=context.shuffle),
            train_val_test_split(dataset, shuffle=context.shuffle)))

Splitting Dataset


In [52]:
train_loader

<torch_geometric.loader.dataloader.DataLoader at 0x7f75b27f5dc0>

In [53]:
# x=[1640, 101]: x 是节点特征矩阵，形状为 [1640, 101]。其中，1640 是批次中的节点数量，101 是每个节点的特征维度。
# edge_index=[2, 933]: edge_index 是边索引矩阵，形状为 [2, 933]。其中，2 表示每列有两个元素，表示一条边的起始节点和目标节点的索引。933 是批次中的边数量。
# y=[8]: y 是标签，形状为 [8]。这里的 8 可能表示批次中的样本数量。
# batch=[1640]: batch 是用于指示节点属于哪个图批次的索引数组，形状为 [1640]。这里的 1640 可能表示批次中的节点数量。
# ptr=[9]: ptr 是一个指针数组，指示批次中每个图的节点范围，形状为 [9]。这里的 9 可能表示批次中的图数量


batch = next(iter(train_loader))  # 获取第一个批次
print(batch)

DataBatch(x=[1640, 101], edge_index=[2, 952], y=[8], func=[8], batch=[1640], ptr=[9])


In [54]:
false = dataset[dataset.target == 0]
true = dataset[dataset.target == 1]
shuffle = True

train_false, test_false = train_test_split(false, test_size=0.2, shuffle=shuffle)
test_false, val_false = train_test_split(test_false, test_size=0.5, shuffle=shuffle)
train_true, test_true = train_test_split(true, test_size=0.2, shuffle=shuffle)
test_true, val_true = train_test_split(test_true, test_size=0.5, shuffle=shuffle)

# train = train_false.append(train_true)
train = pd.concat([train_false, train_true])

# val = val_false.append(val_true)
val = pd.concat([val_false, val_true])

# test = test_false.append(test_true)
test =pd.concat([test_false, test_true])

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

### reset index and pack the dataset

In [41]:
train

Unnamed: 0,input,target,func
0,"[(x, [tensor([ 6.6000e+01, 1.9931e-03, 9.583...",0,"[FUN1, (, struct, histogram_s, *, VAR1, ,, flo..."
1,"[(x, [tensor([ 6.6000e+01, -2.6061e-03, 1.380...",0,"[FUN1, (, int, VAR1, ,, int, VAR2, ,, int, *, ..."
2,"[(x, [tensor([ 6.6000e+01, 1.6988e-03, 9.591...",0,"[VAR1, (, JNIEnv, *, VAR2, ,, jclass, VAR3, ,,..."
3,"[(x, [tensor([ 6.6000e+01, 1.6988e-03, 9.591...",0,"[FUN1, (, const, libcec_alert, VAR1, ,, const,..."
4,"[(x, [tensor([ 6.6000e+01, 2.8835e-04, 1.114...",0,"[FUN1, (, int, VAR1, ,, Datum, VAR2, ), {, FIL..."
...,...,...,...
826,"[(x, [tensor([ 6.6000e+01, 2.9198e-03, 4.542...",1,"[FUN1, (, char, *, VAR1, ,, int, class, ), {, ..."
827,"[(x, [tensor([ 6.6000e+01, 1.4122e-03, 1.009...",1,"[FUN1, (, Btree, *, VAR1, ,, MemPage, *, VAR2,..."
828,"[(x, [tensor([ 6.6000e+01, 1.8965e-03, 9.435...",1,"[FUN1, (, cpl_table, *, VAR1, ,, const, cpl_pr..."
829,"[(x, [tensor([ 6.6000e+01, 1.9931e-03, 9.583...",1,"[FUN1, (, int, VAR1, ,, char, *, VAR2, ), {, V..."


In [42]:
class InputDataset(TorchDataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        # return self.dataset.iloc[index].input
        data = self.dataset.iloc[index].input
        data.func = self.dataset.iloc[index].func  # 添加现有的func属性, 为 codebert 的输入准备
        return data

    def get_loader(self, batch_size, shuffle=True):
        return DataLoader(dataset=self, batch_size=batch_size, shuffle=shuffle)

In [43]:
train_loader, val_loader, test_loader = list(
        map(lambda x: x.get_loader(context.batch_size, shuffle=context.shuffle),
            train_val_test_split(dataset, shuffle=context.shuffle)))

Splitting Dataset


In [44]:
for i, batch in enumerate(train_loader):
    print("Batch", i)
    print(batch)
    break;

Batch 0
DataBatch(x=[1640, 101], edge_index=[2, 933], y=[8], func=[8], batch=[1640], ptr=[9])


In [45]:
batch

DataBatch(x=[1640, 101], edge_index=[2, 933], y=[8], func=[8], batch=[1640], ptr=[9])

In [46]:
x, edge_index, func = batch.x, batch.edge_index, batch.func

In [47]:
func[0]
result = ''.join(func[0])
print(result)

FUN1(intVAR1,intVAR2,intVAR3,intVAR4){intVAR5;FUN2(,VAR1,VAR2,VAR3,VAR4);while(VAR4>0){VAR5=VAR3;while(VAR5>0){FUN3(VAR2+VAR4-2,VAR1+VAR5-2,);VAR5--;}VAR4--;}}


In [48]:
text = []
for var in func:
    result = ''.join(var)
    text.append(result)

In [55]:
out = model(input_ids.to("cuda"), attention_mask.to("cuda"))
out

tensor([[0.7876, 0.2214],
        [0.7635, 0.2402],
        [0.8344, 0.1882],
        [0.6971, 0.3014],
        [0.6575, 0.2543],
        [0.7082, 0.3818],
        [0.7562, 0.2437],
        [0.7343, 0.3045]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [56]:
th.nn.Softmax(dim=1)(out)

tensor([[0.6379, 0.3621],
        [0.6279, 0.3721],
        [0.6562, 0.3438],
        [0.5977, 0.4023],
        [0.5994, 0.4006],
        [0.5809, 0.4191],
        [0.6254, 0.3746],
        [0.6058, 0.3942]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [57]:
torch.sigmoid(out)

tensor([[0.6873, 0.5551],
        [0.6821, 0.5598],
        [0.6973, 0.5469],
        [0.6676, 0.5748],
        [0.6587, 0.5632],
        [0.6700, 0.5943],
        [0.6805, 0.5606],
        [0.6757, 0.5755]], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [71]:
out.size()

torch.Size([8, 2])

In [64]:
batch.y

tensor([0., 0., 0., 0., 0., 0., 0., 0.])