In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


from __future__ import absolute_import, division, print_function

import argparse
from calendar import c
import glob
import logging
import os
from collections import Counter
import sys
import pickle
import random
import re
import shutil
from sklearn.metrics import f1_score
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler,OneSidedSelection
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
import torch.nn.functional as F
import json
from tqdm import tqdm, trange
import multiprocessing
from model import Model, Model_BCE
cpu_cont = multiprocessing.cpu_count()
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          BertConfig, BertForMaskedLM, BertTokenizer,
                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                          DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)


In [2]:
class InputFeatures(object):
    """Plain record holding the encoded form of one example.

    Attributes:
        input_tokens: the token sequence, stored as given.
        input_ids: the id sequence, stored as given.
        idx: the example identifier, always converted to ``str``.
        label: the example label, stored as given.
    """

    def __init__(self, input_tokens, input_ids, idx, label):
        self.input_tokens, self.input_ids = input_tokens, input_ids
        # The identifier is normalised to a string regardless of the incoming type.
        self.idx, self.label = str(idx), label
def convert_examples_to_features(js, tokenizer, args):
    """Encode one JSON record as a fixed-length id sequence.

    Args:
        js: mapping read for ``'func'`` (source text), ``'idx'`` and ``'target'``.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.
        args: object whose ``block_size`` is the total output length.

    Returns:
        A pair ``(features, code)``: an ``InputFeatures`` built from the tokens,
        padded ids, ``js['idx']`` and ``js['target']``, plus the
        whitespace-normalised source string.
    """
    # Collapse every run of whitespace (newlines included) into a single space.
    code = ' '.join(js['func'].split())

    # Two positions are reserved for the CLS and SEP markers.
    max_code_tokens = args.block_size - 2
    code_tokens = tokenizer.tokenize(code)
    source_tokens = [tokenizer.cls_token] + code_tokens[:max_code_tokens] + [tokenizer.sep_token]

    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    # Right-pad with the pad id up to block_size.
    pad_count = args.block_size - len(source_ids)
    source_ids += [tokenizer.pad_token_id] * pad_count

    features = InputFeatures(source_tokens, source_ids, js['idx'], js['target'])
    return features, code

class TextDataset(Dataset):
    """Dataset of encoded examples read from a JSON-lines file.

    Each non-blank line of ``file_path`` is parsed as JSON and passed through
    ``convert_examples_to_features``. The resulting features are kept in
    ``self.examples`` and the whitespace-normalised sources in ``self.codes``,
    both in file order.
    """

    def __init__(self, tokenizer, args, file_path=None):
        """Load and encode every record of ``file_path``.

        Args:
            tokenizer: forwarded to ``convert_examples_to_features``.
            args: forwarded to ``convert_examples_to_features``.
            file_path: path of the JSON-lines file to read.
        """
        self.examples = []
        self.codes = []
        with open(file_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. an extra newline at the end of
                    # the file) instead of failing inside json.loads.
                    continue
                js = json.loads(line)
                example, code = convert_examples_to_features(js, tokenizer, args)
                self.examples.append(example)
                self.codes.append(code)

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return ``(input_ids, label)`` of example ``i`` as two tensors."""
        example = self.examples[i]
        return torch.tensor(example.input_ids), torch.tensor(example.label)

In [3]:
# Checkpoint whose state dict is loaded into the model.
# NOTE(review): despite the name this is a file path (model.bin), and 'msr_outout'
# looks like a typo of 'msr_output' — confirm it matches the directory on disk.
output_dir = 'msr_outout/ros_2x/checkpoint-best/model.bin'
# JSON-lines test split; records are read for their 'idx', 'target' and 'func' keys.
test_data_file = '../msr_dataset/origin/data_split_0/test.jsonl'
# JSON-lines file whose records are read for their 'lines_before' key.
msr_line_file = '../msr_dataset/msr_line.jsonl'
# Hugging Face classes used to build the config, the encoder and the tokenizer.
config_class, model_class, tokenizer_class = RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer

In [4]:
# Prefer the second GPU ("cuda:1"). When only one GPU is visible, "cuda:1" would be
# an invalid device ordinal, so fall back to "cuda:0"; without CUDA, use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

In [5]:
# Start from the published CodeBERT configuration ...
config = config_class.from_pretrained('microsoft/codebert-base')
# ... and request a single output label.
# NOTE(review): presumably one score per input for the BCE-style wrapper — confirm against model.py.
config.num_labels = 1
# Tokenizer matching the same pretrained checkpoint name.
tokenizer = tokenizer_class.from_pretrained('microsoft/codebert-base')

In [6]:
# Built from the config only (no from_pretrained call here); the weights come from
# the checkpoint state dict that is loaded into the wrapped model in a later cell.
model = model_class(config)

In [7]:
class Args:
    """Minimal stand-in for a parsed-arguments object; only ``block_size`` is defined."""
    # Total length of every encoded input (CLS + code tokens + SEP + padding).
    block_size = 400
    
args = Args()

In [8]:
# Wrap the encoder in the project's Model_BCE module. This rebinds `model`, so
# re-running this cell on its own would wrap the wrapper again — re-run the cell
# that builds the encoder first.
# NOTE(review): args=None is passed even though an `args` object is defined in this
# notebook — presumably Model_BCE does not need it for inference; confirm against model.py.
model = Model_BCE(model,config, tokenizer, args=None)

In [9]:
# Load the fine-tuned weights. map_location remaps tensors that were saved from
# another device (e.g. a GPU that does not exist on this machine) onto `device`,
# so the checkpoint also loads on CPU-only or single-GPU hosts.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load(output_dir, map_location=device)
model.load_state_dict(state_dict)
model.to(device)
# Switch to inference mode; as the last expression this also displays the module tree.
model.eval()

Model_BCE(
  (encoder): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_

In [10]:
def _read_jsonl_record(index_i, path):
    """Return the parsed JSON object on line ``index_i`` of the file at ``path``.

    Non-negative indices are resolved by streaming the file and stopping at the
    requested line, so the whole file is never held in memory. Negative indices
    count from the end and therefore still read every line.

    Raises:
        IndexError: if the file has no line at ``index_i``.
    """
    with open(path, encoding='utf-8') as f:
        if index_i < 0:
            line = f.readlines()[index_i]
        else:
            for line_no, line in enumerate(f):
                if line_no == index_i:
                    break
            else:
                # Loop ended without reaching the requested line.
                raise IndexError(f'line index {index_i} out of range for {path}')
    return json.loads(line.strip())


def get_js_with_index(index_i,test_data_file):
    """Return the JSON record stored on line ``index_i`` (0-based) of ``test_data_file``."""
    return _read_jsonl_record(index_i, test_data_file)

def get_line_with_index(index_i,msr_line_file):
    """Return the ``'lines_before'`` value of the record on line ``index_i`` of ``msr_line_file``."""
    return _read_jsonl_record(index_i, msr_line_file)['lines_before']
    
def func_to_input(func):
    """Encode one source string as a list of ``args.block_size`` token ids.

    Uses the notebook-level ``tokenizer`` and ``args``. The text is
    whitespace-normalised, tokenised, truncated so that CLS and SEP still fit,
    converted to ids and right-padded with the tokenizer's pad id.
    """
    normalized = ' '.join(func.split())
    # Leave room for the CLS and SEP markers.
    body_tokens = tokenizer.tokenize(normalized)[:args.block_size - 2]
    framed_tokens = [tokenizer.cls_token] + body_tokens + [tokenizer.sep_token]
    ids = tokenizer.convert_tokens_to_ids(framed_tokens)
    missing = args.block_size - len(ids)
    ids += [tokenizer.pad_token_id] * missing
    return ids

In [11]:
# get_js_with_index(10905,test_data_file)

In [12]:
# Number of lines (one record per line) in the test split.
with open(test_data_file) as f:
    total_test = sum(1 for _ in f)

In [13]:
class model_pipeline:
    """Adapter that exposes the classifier as a ``texts -> class probabilities`` callable."""

    def __init__(self,model):
        """Store the callable ``model`` that ``pred`` applies to each id batch."""
        self.model = model

    def pred(self,funcs, batch_size=32):
        """Score a sequence of source strings.

        Args:
            funcs: sequence of source-code strings.
            batch_size: number of strings encoded and scored per forward pass.

        Returns:
            ``numpy.ndarray`` with one row per input and two columns:
            column 0 is ``1 - score`` and column 1 is the model's score.
            An empty input yields an array of shape ``(0, 2)``.
        """
        # NOTE(review): assumes self.model returns a (batch, 1) tensor holding the
        # positive-class probability — confirm against Model_BCE.
        batch_scores = []
        dataloader = DataLoader(funcs, batch_size=batch_size, shuffle=False)
        # Inference only: disabling autograd avoids building a graph for every
        # batch, which matters because explainers call this with many samples.
        with torch.no_grad():
            for batch in dataloader:
                input_idxs = [func_to_input(func) for func in batch]
                model_input = torch.tensor(input_idxs,device=device)
                batch_scores.append(self.model(model_input).cpu().detach().numpy())
        if not batch_scores:
            # Nothing to stack; keep the two-column contract for empty input.
            return np.empty((0, 2), dtype=np.float32)
        # np.vstack is the supported spelling of the deprecated np.row_stack alias.
        clf_outputs = np.vstack(batch_scores)
        return np.concatenate((1 - clf_outputs, clf_outputs), axis=1)
    

In [14]:
# Wrap the loaded model so that pipeline.pred maps a list of source strings to a
# two-column probability array.
pipeline = model_pipeline(model)

In [15]:
from lime.lime_text import LimeTextExplainer
# Names follow the column order of pipeline.pred: column 0 is (1 - score), column 1 is the score.
class_names = ['non_vul', 'vul']

In [None]:
# Parallel result lists: entry k of each list describes the same explained sample.
prob_list = []
exp_tup_list = []
line_list = []
index_list = []
# Loop-invariant, so it is built once and reused for every explanation.
explainer = LimeTextExplainer(class_names=class_names)
# Stream the test split once instead of re-reading the file for every index.
with open(test_data_file) as test_f:
    for test_index, raw_line in enumerate(tqdm(test_f, total=total_test)):
        js = json.loads(raw_line.strip())
        idx = js['idx']
        if js['target'] != 1:  # only vulnerable samples are explained
            continue
        line = get_line_with_index(idx,msr_line_file)
        if line is None:
            continue
        func = js['func']
        prob = pipeline.pred([func])
        if not prob[0][1] > 0.5:
            continue
        # True positive: check whether the explanation picks up the vulnerable line(s).
        try:
            print(f'{len(prob_list)} true positive {prob[0][1]}, curr: {test_index}/{total_test}')
            exp = explainer.explain_instance(func, pipeline.pred, num_features=len(func.split()))
            exp = exp.as_list()
        except Exception as exc:
            # Best effort: report the failure and move on. KeyboardInterrupt is
            # not caught, so the long run can still be stopped by hand.
            print(f'error in {func} - {test_index}: {exc!r}')
            continue
        # All four lists are appended exactly once per successfully explained
        # sample, so they stay aligned when zipped together later.
        exp_tup_list.append(exp)
        line_list.append(line)
        prob_list.append(prob[0][1])
        index_list.append(test_index)
            

  0%|                                                                                                 | 29/37728 [00:01<24:44, 25.39it/s]

0 true positive 0.995524525642395, curr: 30/37728


  0%|▏                                                                                                | 84/37728 [00:38<41:07, 15.25it/s]

1 true positive 0.9994556307792664, curr: 84/37728


  0%|▎                                                                                             | 134/37728 [01:22<1:01:28, 10.19it/s]

2 true positive 0.998969316482544, curr: 138/37728


  0%|▎                                                                                             | 134/37728 [01:40<1:01:28, 10.19it/s]

In [None]:
# with open(f'exp_output/2x_TP_list.txt', 'w') as f:
#     f.write(str(index_list))

In [None]:
# len(index_list)

In [None]:
# with open(f'exp_output/ros_TP_list.txt', 'r') as f:
#     x = f.read()

In [None]:
# len(x)

In [None]:
# list(set(index_list) & set(x))

In [None]:
# Bundle each explanation with its 'lines_before' entry, predicted probability and test index.
# zip() is a one-shot iterator: sorted() below consumes it, leaving zipped_result exhausted.
zipped_result = zip(exp_tup_list,line_list,prob_list,index_list)
# Highest predicted probability first (element 2 of each tuple is the probability).
sorted_zip = sorted(zipped_result, key=lambda x: x[2],reverse=True)

In [None]:
import pickle
# Persist the ranked explanations. The directory is created if missing so a long
# run is not lost at the final step, and the context manager closes the file
# handle once the dump completes.
lime_output_path = 'lime_output/msr_2x_zip_exp_line_prob.pkl'
os.makedirs(os.path.dirname(lime_output_path), exist_ok=True)
with open(lime_output_path, "wb") as pkl_file:
    pickle.dump(sorted_zip, pkl_file)

In [None]:
# temp_list = []
# for index in index_list:
#     js = get_js_with_index(index,test_data_file)
#     if len(js['func'].splitlines()) <= 30:
#         temp_list.append(index)

In [None]:
# temp_list = [4993, 5210, 5258, 5274, 5355, 5417, 5510, 5677, 5708, 5798, 5935, 6040, 6351, 6370, 6380, 6493, 6775, 6810, 6836, 7049, 7417, 7533, 7592, 7601, 7620, 7718, 8373, 8434, 8449, 8477, 8560, 8655, 8693, 8694, 8792, 8813, 9046, 9295, 9879, 10049, 10057, 10150, 10245, 10252, 10310, 10357, 10399, 10507, 10519, 10553, 10634, 10740, 10905, 10959, 11042, 11054, 11274, 11467, 11475, 11749, 12003, 12220, 12372, 12437, 12642, 12660, 12680, 12899, 12915, 13103, 13104, 13197, 13279, 13712, 13743, 13753, 13766, 13783, 13976, 14331, 14367, 14769, 14858, 15212, 15267, 15351, 15615, 15961, 15979, 15991, 16030, 16147, 16183, 16208, 16261, 16274, 16328, 16384, 16507, 16647, 16714, 16797, 16828, 16892, 16922, 17011, 17059, 17089, 17095, 17126, 17472, 17607, 17675, 17710, 17776, 18033, 18067, 18112, 18403, 18612, 18814, 18816, 18911, 19038, 19241, 19252, 19460, 19468, 19498, 19575, 19605, 19656, 19667, 20357, 20393, 20430, 20466, 20661, 20878, 20965, 20974, 21004, 21087, 21183, 21289, 21517, 21574, 21833, 22065, 22083, 22151, 22331, 22373, 22563, 22601, 22778, 23036, 23398, 23482, 23838, 23898, 24184, 24834, 24901, 24904, 24920, 25005, 25171, 25192, 25254, 25351, 25445, 25447, 25450, 25453, 25507, 25910, 25979, 26102, 26427, 26578, 26743, 27176, 27378, 27636, 27773, 27815, 27880, 28100, 28119, 28140, 28299, 28737, 29013, 29154, 29181, 29314, 29741, 29927, 30021, 30030, 30127, 30270, 30537, 30830, 31000, 31160, 31168, 31341, 31511, 31558, 31562, 31572, 31617, 31799, 32000, 32260, 32308, 32361, 32639, 32692, 33200, 33325, 33348, 33410, 33510, 34104, 34113, 34144, 34210, 34236, 34238, 34651, 34700, 34992, 35028, 35079, 35087, 35143, 35358, 35506, 35580, 35800, 36061, 36249, 36460, 36548, 36650, 36670, 36727, 36802, 36903, 36943, 37072, 37368, 37458]

In [None]:
# for index in tqdm(temp_list):
#     js = get_js_with_index(index,test_data_file)
#     func = js['func']
#     line = get_line_with_index(js['idx'],msr_line_file)
#     explainer = LimeTextExplainer(class_names=class_names)
#     exp = explainer.explain_instance(func, pipeline.pred, num_features=len(func.split()))

#     exp.local_exp[1] = [tup for tup in exp.local_exp[1] if tup[1] > 0][:5]
#     hit = False
#     for exp_token,prob in exp.as_list():
#         if exp_token in line:
#             hit = True
#     if not hit:
#         break
#     # exp.save_to_file(f'exp_output/{index}_exp.html')
#     # with open(f'exp_output/{index}_line.txt', 'w') as f:
#     #     f.write(line)

In [None]:
# exp.as_list()

In [None]:
# exp.save_to_file(f'exp_output/failhit_{index}_exp.html')
# with open(f'exp_output/failhit_{index}_line.txt', 'w') as f:
#     f.write(line)

In [None]:
# index  # disabled: `index` is only assigned inside the commented-out loops above, so evaluating it on a fresh kernel raises NameError