In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


from __future__ import absolute_import, division, print_function

import argparse
from calendar import c
import glob
import logging
import os
from collections import Counter
import sys
import pickle
import random
import re
import shutil
from sklearn.metrics import f1_score
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler,OneSidedSelection
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
import torch.nn.functional as F
import json
from tqdm import tqdm, trange
import multiprocessing
from model import Model, Model_BCE
cpu_cont = multiprocessing.cpu_count()
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          BertConfig, BertForMaskedLM, BertTokenizer,
                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                          DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)


In [2]:
class InputFeatures(object):
    """A single training/test features for a example."""

    def __init__(self,
                 input_tokens,
                 input_ids,
                 idx,
                 label,

                 ):
        self.input_tokens = input_tokens
        self.input_ids = input_ids
        self.idx = str(idx)
        self.label = label
def convert_examples_to_features(js, tokenizer, args):
    # source
    code = ' '.join(js['func'].split())
    code_tokens = tokenizer.tokenize(code)[:args.block_size - 2]
    source_tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    padding_length = args.block_size - len(source_ids)
    source_ids += [tokenizer.pad_token_id] * padding_length
    return InputFeatures(source_tokens, source_ids, js['idx'], js['target']),code

class TextDataset(Dataset):
    def __init__(self, tokenizer, args, file_path=None):
        label_count = []
        self.examples = []
        self.codes = []
        with open(file_path) as f:
            for line in f:
                js = json.loads(line.strip())
                example,code = convert_examples_to_features(js, tokenizer, args)
                label_count.append(example.label)
                self.examples.append(example)
                self.codes.append(code)
#         logger.info("label ratio: {}".format(Counter(label_count)))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i].input_ids), torch.tensor(self.examples[i].label)

In [3]:
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
tf.config.experimental.set_visible_devices(devices=gpus[1], device_type='GPU')
tf.config.experimental.set_memory_growth(gpus[1],True)
clf = tf.keras.models.load_model('lime_output/latent_clf/oss_keras_model.pkl')

output_dir = 'msr_output/origin/saved_models_5/checkpoint-best/model.bin'
test_data_file = '../msr_dataset/origin/data_split_5/test.jsonl'
msr_line_file = '../msr_dataset/msr_line.jsonl'


config_class, model_class, tokenizer_class = RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

config = config_class.from_pretrained('microsoft/codebert-base')
config.num_labels = 1
tokenizer = tokenizer_class.from_pretrained('microsoft/codebert-base')

model = model_class(config)

class Args:
    block_size = 400
    
args = Args()
model = Model_BCE(model,config, tokenizer, args=None)


model.load_state_dict(torch.load(output_dir, map_location=device))


model.to(device)


2022-08-11 21:32:46.211493: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 21:32:46.212472: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 21:32:46.213452: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 21:32:46.219041: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 21:32:46.220005: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

Model_BCE(
  (encoder): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_

In [24]:
def get_js_with_index(index_i,test_data_file):
    with open(test_data_file) as f:
        line = f.readlines()[index_i]
        js = json.loads(line.strip())
        return js

def get_line_with_index(index_i,msr_line_file):
    with open(msr_line_file) as f:
        line = f.readlines()[index_i]
        js = json.loads(line.strip())
        return js['lines_before']
    
def func_to_input(func):
    code = ' '.join(func.split())
    code_tokens = tokenizer.tokenize(code)[:args.block_size - 2]
    source_tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    padding_length = args.block_size - len(source_ids)
    source_ids += [tokenizer.pad_token_id] * padding_length
    return source_ids

with open(test_data_file) as f:
    total_test = len(f.readlines())


class model_pipeline:
    def __init__(self,model,clf):
        self.model = model
        self.clf = clf
    def pred(self,funcs):
        # print(f'batch size:{len(funcs)}')
        clf_outputs = []
        dataloader = DataLoader(funcs, batch_size=32, shuffle=False)
        for funcs in dataloader:
            input_idxs = []
            for index,func in enumerate(funcs):
                input_idx = func_to_input(func)
                input_idxs.append(input_idx)
            model_input = torch.tensor(input_idxs,device=device)
            model_output = self.model(model_input,get_latent=True)
            vector = model_output[0][:, 0, :].cpu().detach().numpy()
            clf_output = self.clf.predict(vector,verbose=0)
            clf_outputs.append(clf_output)
            del model_input,model_output
        clf_outputs = np.row_stack(clf_outputs)
        diff = 1-clf_outputs
        ret_prob = np.concatenate((diff,clf_outputs),axis=1)
        return ret_prob

In [27]:
pipeline = model_pipeline(model,clf)

from lime.lime_text import LimeTextExplainer
class_names = ['non_vul', 'vul']
prob_list = []
exp_tup_list = []
line_list = []
for test_index in range(0,total_test):
    js = get_js_with_index(test_index,test_data_file)
    idx = js['idx']
    if js['target'] == 1: # for all Vulnearbles
        line = get_line_with_index(idx,msr_line_file)
        if line != None:
            # print(line)
            func = js['func']
            prob = pipeline.pred([func])
            # break
            if prob[0][1] > 0.5:
                # try:
                    print(f'{len(prob_list)} true positive {prob[0][1]}, curr: {test_index}')
                    # print(func)
                    print(len(func.split()))
                    # true positive
                    # now see if he really see the importance vulnerable line!!!!
                    explainer = LimeTextExplainer(class_names=class_names)
                    exp = explainer.explain_instance(func, pipeline.pred, num_features=len(func.split()))
                    break

batch size:1
[[0.4876389]]
[[0.5123611]]
(1, 2)
[[0.5123611 0.4876389]]
batch size:1
[[0.76221186]]
[[0.23778814]]
(1, 2)
[[0.23778814 0.76221186]]
0 true positive 0.7622118592262268, curr: 27
16
batch size:5000
[[0.4249147 ]
 [0.44021294]
 [0.46523538]
 ...
 [0.34916142]
 [0.48503193]
 [0.42332944]]
[[0.5750853 ]
 [0.55978703]
 [0.53476465]
 ...
 [0.6508386 ]
 [0.51496804]
 [0.5766705 ]]
(5000, 2)
[[0.5750853  0.4249147 ]
 [0.55978703 0.44021294]
 [0.53476465 0.46523538]
 ...
 [0.6508386  0.34916142]
 [0.51496804 0.48503193]
 [0.5766705  0.42332944]]


In [None]:
exp.as_pyplot_figure()