In [1]:
## Install a pinned transformers release (4.33.2) from PyPI, together with peft and accelerate,
## and bitsandbytes from the TestPyPI index.
!pip install transformers==4.33.2  # we need latest transformers for this
!pip install peft
!pip install accelerate
!pip install -i https://test.pypi.org/simple/ bitsandbytes
## AutoTokenizer automatically selects and loads the tokenizer that matches a given model.
from transformers import AutoTokenizer
import transformers
import torch

#import accelerate

Collecting transformers==4.33.2
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.33.0
    Uninstalling transformers-4.33.0:
      Successfully uninstalled transformers-4.33.0
Successfully installed transformers-4.33.2
Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub>=0.17.0 (from peft)
  Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.4/346.4 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, peft
  Attempting un

In [2]:
#释放GPU
!pip install numba

import os
import shutil

from numba import cuda

# Reset the current CUDA device through numba (the cell's stated purpose is to free the GPU).
device = cuda.get_current_device()
device.reset()

# Drop checkpoints left over from a previous run. Using shutil instead of a
# shell command avoids spawning a shell; a missing directory is not an error.
shutil.rmtree("/kaggle/working/checkpoint", ignore_errors=True)





0

In [3]:
import torch

class Config(object):
    """Central container for the paths and hyper-parameters used by this notebook."""

    def __init__(self):
        # Data paths (fold 0 of the files written by divide_dataset)
        self.data_train_path = '/kaggle/working/data/data_code_train_0.pkl'
        self.data_test_path = '/kaggle/working/data/data_code_test_0.pkl'
        # Directory where model checkpoints are saved
        self.model_save_path = '/kaggle/working/checkpoint'

        # Paths used when testing a trained model
        self.data_test = '/kaggle/working/output'
        self.model_test_path = '/kaggle/working/checkpoint/epoch_2.pt'

        # How buggy/fixed code longer than the model limit is truncated.
        # Alternatives: 'head', 'tail', 'mid'.
        self.cutMethod = 'headTail'

        # How the buggy and fixed embeddings are combined.
        # Alternatives: 'add', 'sub', 'mul', 'mix'.
        self.splicingMethod = 'cat'

        # Device selection: first CUDA device when available, otherwise CPU
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.device_ids = [0]
        # Number of attention heads
        self.num_attention_heads=8
        # Whether CUDA is in use; derived from the detected device so the two
        # settings cannot contradict each other on a CPU-only machine.
        self.use_cuda = self.device.type == 'cuda'

        # Pretrained checkpoint. This notebook loads it with AutoModelForCausalLM.
        # Previously tried alternatives: 'bert-base-uncased',
        # 'microsoft/codebert-base', 'microsoft/graphcodebert-base', 'gpt2'.
        self.model_path = 'deepseek-ai/deepseek-coder-1.3b-base'

        # Checkpoint to resume from; read by the training driver when start_epoch > 0
        self.pretrained_model_path = ''
        # Whether the pretrained encoder is frozen
        self.freeze_bert = False
        # Maximum input length in tokens (the original note: 512 for BERT, 4096 for Longformer)
        self.max_length = 512

        # LSTM input feature size (the original note ties it to the BERT token embedding size)
        self.input_size = 768
        # LSTM hidden size
        self.hidden_size = 768
        # Number of stacked LSTM layers
        self.num_layers = 2
        # Dropout probability, used to reduce over-fitting
        self.dropout = 0.5

        # Output size of the final linear layer (one logit for binary classification)
        self.num_classes = 1

        # Total number of training epochs
        self.num_epoch = 20
        # Epoch to resume training from; 0 starts from scratch
        self.start_epoch = 0
        # Training batch size
        self.train_batch_size = 1
        # Evaluation batch size
        self.test_batch_size = 1

In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import  StratifiedKFold

# 配置类
import tqdm
!pip install transformers
from transformers import AutoTokenizer

# Shared configuration object for this cell.
config = Config()
# Tokenizer for the configured pretrained model; to_data() reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(config.model_path)


def read_file_to_str(path):
    """Return the entire contents of the UTF-8 text file at ``path`` as a ``str``."""
    # The ``with`` block already closes the file, so no explicit close() is needed,
    # and read() in text mode already returns a str.
    with open(path, 'r', encoding='utf8') as f:
        return f.read()


def to_data(root, output_path):
    """Build the raw dataset pickle from a directory of patch folders.

    Every sub-directory of ``root`` that contains both ``bug.java`` and
    ``fixed.java`` yields one sample: each file is tokenized with the
    module-level ``tokenizer`` and its tokens are joined with single spaces.
    A directory whose name contains ``"INCORRECT"`` is labelled 0, any other
    directory is labelled 1. The tuple ``(buggy_arr, fixed_arr, label_arr)``
    is pickled to ``output_path``.
    """
    buggy_arr, fixed_arr, label_arr = [], [], []
    # Pairs already emitted. De-duplication must be done on the (buggy, fixed)
    # PAIR: testing the two lists independently would also drop a new patch
    # whose buggy side and fixed side each happened to appear in *different*
    # earlier samples. A set also makes the check O(1) instead of O(n).
    seen_pairs = set()

    for tran_dir in tqdm.tqdm(os.listdir(root)):
        sub_path = os.path.join(root, tran_dir)
        bug_path = os.path.join(sub_path, "bug.java")
        patch_path = os.path.join(sub_path, "fixed.java")
        if not (os.path.exists(bug_path) and os.path.exists(patch_path)):
            continue

        tran_buggy = ' '.join(tokenizer.tokenize(read_file_to_str(bug_path)))
        tran_fixed = ' '.join(tokenizer.tokenize(read_file_to_str(patch_path)))

        pair = (tran_buggy, tran_fixed)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)

        buggy_arr.append(tran_buggy)
        fixed_arr.append(tran_fixed)
        label_arr.append(0 if "INCORRECT" in tran_dir else 1)

    data = buggy_arr, fixed_arr, label_arr
    with open(output_path, 'wb') as f:
        pd.to_pickle(data, f)


def divide_dataset(data_path, output_path, n_splits=5, random_state=None):
    """Split the dataset into stratified cross-validation folds and pickle each fold.

    Args:
        data_path: pickle holding ``(buggy_arr, fixed_arr, label_arr)``.
        output_path: existing directory that receives ``data_code_train_<i>.pkl``
            and ``data_code_test_<i>.pkl`` for every fold ``i``.
        n_splits: number of folds (default 5, the original behaviour).
        random_state: seed forwarded to ``StratifiedKFold``. The default
            ``None`` keeps the original unseeded shuffle; pass an int to make
            the folds reproducible across runs.
    """
    # Load the full dataset
    with open(data_path, 'rb') as f:
        buggy_arr, fixed_arr, label_arr = pd.read_pickle(f)
    texts_1, texts_2, labels = np.array(buggy_arr), np.array(fixed_arr), np.array(label_arr)

    # Stratify on the labels so every fold keeps the class ratio
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for i, (train_index, test_index) in enumerate(skf.split(texts_1, labels)):
        # Write the train part and then the test part of fold i
        for split_name, index in (('train', train_index), ('test', test_index)):
            split_path = os.path.join(output_path, 'data_code_' + split_name + '_' + str(i) + '.pkl')
            split_data = texts_1[index], texts_2[index], labels[index]
            with open(split_path, 'wb') as f:
                pd.to_pickle(split_data, f)


import shutil

data_path = "/kaggle/working/data/data.pkl"
output_path = '/kaggle/working/data'
# Recreate the output directory from scratch without going through a shell:
# a missing directory is fine, and missing parent directories are created.
shutil.rmtree(output_path, ignore_errors=True)
os.makedirs(output_path, exist_ok=True)
# Tokenize every patch folder, then write the cross-validation folds.
to_data("/kaggle/input/patch-zip/patch", data_path)
divide_dataset(data_path, output_path)




tokenizer_config.json:   0%|          | 0.00/793 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

100%|██████████| 2277/2277 [01:00<00:00, 37.55it/s]


In [5]:
from torch.utils.data import Dataset
import torch


def remove_context(buggy_code, fixed_code):
    """Strip the tokens that two space-separated code strings share at both ends.

    Both strings are stripped and split on single spaces. The longest common
    leading run of tokens and then the longest common trailing run are removed,
    and what remains of each side is returned.

    Args:
        buggy_code: space-separated tokens of the buggy code.
        fixed_code: space-separated tokens of the fixed code.

    Returns:
        ``(s1, s2)``: the differing middle part of the buggy and of the fixed
        code, each joined with single spaces (``""`` when nothing remains).
    """
    buggy_tokens = buggy_code.strip().split(" ")
    fixed_tokens = fixed_code.strip().split(" ")
    shortest = min(len(buggy_tokens), len(fixed_tokens))

    head = 0
    while head < shortest and buggy_tokens[head] == fixed_tokens[head]:
        head += 1

    # The common suffix may only use tokens that the common prefix has not
    # already consumed. Without this bound the two runs can overlap and real
    # changes are dropped, e.g. "a" vs "a a" would yield ("", "").
    tail = 0
    while tail < shortest - head and buggy_tokens[-1 - tail] == fixed_tokens[-1 - tail]:
        tail += 1

    s1 = ' '.join(buggy_tokens[head:len(buggy_tokens) - tail]).strip()
    s2 = ' '.join(fixed_tokens[head:len(fixed_tokens) - tail]).strip()
    return s1, s2


class MyDataset(Dataset):
    """Dataset of (text_1, text_2, label) triples that are encoded on access.

    ``func(text, tokenizer, max_length)`` must return a mapping of field name
    to a value accepted by ``torch.tensor``. Fields whose name starts with
    ``token_type_ids`` are dropped.
    """

    def __init__(self, func, tokenizer, max_length, texts_1, texts_2, labels,names=None):
        self.tokenizer, self.func, self.max_length = tokenizer, func, max_length
        self.texts_1, self.texts_2 = texts_1, texts_2
        self.labels = labels
        self.names = names

    def _encode(self, text):
        """Encode one text with the configured function, tokenizer and length."""
        return self.func(text, self.tokenizer, self.max_length)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Encoded fields of the first text get the suffix ``_1`` and those of
        the second text ``_2``. The label is stored under ``labels`` and, when
        names were supplied, the sample name under ``name``.
        """
        sources = (self.texts_1[idx], self.texts_2[idx])
        label = self.labels[idx]
        encodings = [self._encode(text) for text in sources]

        item = {}
        for suffix, encoding in zip(('_1', '_2'), encodings):
            for key, val in encoding.items():
                if not key.startswith("token_type_ids"):
                    item[key + suffix] = torch.tensor(val)
        item['labels'] = torch.tensor(label)
        if self.names is not None:
            item["name"] = self.names[idx]
        return item



In [6]:
import torch


def add(text1, text2):
    """Return the element-wise sum of the two inputs via ``torch.add``."""
    total = torch.add(text1, text2)
    return total


def subtraction(text1, text2):
    """Return the element-wise difference ``text1 - text2`` via ``torch.sub``."""
    difference = torch.sub(text1, text2)
    return difference


def multiplication(text1, text2):
    """Return the element-wise product of the two inputs via ``torch.mul``."""
    product = torch.mul(text1, text2)
    return product


def cosion(text1, text2):
    """Return ``torch.cosine_similarity(text1, text2)`` with its default arguments."""
    similarity = torch.cosine_similarity(text1, text2)
    return similarity


def euclid(text1, text2):
    """Return ``torch.pairwise_distance(text1, text2)`` with its default arguments."""
    distance = torch.pairwise_distance(text1, text2)
    return distance


def cat_features(text1, text2):
    """Concatenate both inputs with their sum, difference and product along dim 2.

    The result is ``cat(text1, text2, text1 + text2, text1 - text2,
    text1 * text2)``; because the concatenation uses ``dim=2`` the inputs
    must have at least three dimensions.
    """
    parts = (
        text1,
        text2,
        add(text1, text2),
        subtraction(text1, text2),
        multiplication(text1, text2),
    )
    return torch.cat(parts, dim=2)


# 新段落

In [7]:
import torch.nn as nn

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)

from transformers import AutoModelForCausalLM


class Model(torch.nn.Module):
    """Patch classifier built on a pretrained causal language model.

    The buggy code and the patched code are encoded separately by the same
    backbone. Each sequence is max-pooled over its non-padding positions, the
    two pooled vectors are concatenated, and a linear layer maps the result to
    ``config.num_classes`` logits.

    Args:
        config: object providing ``splicingMethod``, ``model_path`` and
            ``num_classes``. An optional ``hidden_layer_index`` attribute
            selects which entry of the backbone's ``hidden_states`` tuple is
            used as token representation (default 1).
    """

    def __init__(self, config):
        super(Model, self).__init__()
        self.splicingMethod = config.splicingMethod
        # Entry of ``hidden_states`` used for BOTH inputs. Per the transformers
        # output convention, index 0 is the embedding output and index i the
        # output of the i-th layer. The previous code read index 0 for the buggy
        # input and index 1 for the patch, so the two sides were not comparable.
        self.hidden_layer_index = getattr(config, 'hidden_layer_index', 1)

        # Load the checkpoint named in the config so the backbone always
        # matches the tokenizer, which is loaded from the same setting.
        model = AutoModelForCausalLM.from_pretrained(config.model_path, trust_remote_code=True)
        # Experiments that are currently disabled: freezing all backbone
        # parameters, prepare_model_for_int8_training, and LoRA adapters
        # (r=8, lora_alpha=16, lora_dropout=0.01, bias="none",
        # task_type="CAUSAL_LM", targets q_proj/k_proj/v_proj/o_proj).

        # Pretrained backbone
        self.model = model

        # Two pooled vectors of the backbone's hidden size are concatenated.
        hidden_size = model.config.hidden_size
        self.fc = nn.Linear(hidden_size * 2, config.num_classes)

    def _encode(self, input_ids, attention_mask):
        """Return the max-pooled representation of one batch of sequences."""
        outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        hidden = outputs.hidden_states[self.hidden_layer_index]
        # Exclude padding from the pooling: padded positions are set to the
        # smallest representable value so they can never win the max.
        keep = attention_mask.to(torch.bool).unsqueeze(-1)
        hidden = hidden.masked_fill(~keep, torch.finfo(hidden.dtype).min)
        pooled, _ = torch.max(hidden, dim=1)
        return pooled

    def forward(self, bug_input_ids, bug_attention_mask, patch_input_ids, patch_attention_mask):
        """Return one logit per sample (``num_classes`` logits when it is not 1)."""
        bug_vector = self._encode(bug_input_ids, bug_attention_mask)
        patch_vector = self._encode(patch_input_ids, patch_attention_mask)

        # Concatenate the two pooled vectors along the feature dimension
        out = torch.cat((bug_vector, patch_vector), dim=1)

        # Fully connected classification head
        out = self.fc(out).squeeze(1)
        return out



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /opt/conda/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 6.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so...


  warn(msg)
  warn(msg)


In [8]:
import numpy as np
from transformers import AutoTokenizer, AdamW
from torch.utils.data import DataLoader
import torch
import os
import shutil

import pandas as pd
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, precision_score


def tokenizer_head_tail(text, tokenizer, max_length):
    """Encode ``text`` to exactly ``max_length`` tokens, keeping head and tail.

    A sequence longer than ``max_length`` keeps its first
    ``max_length - max_length // 2`` tokens and its last ``max_length // 2``
    tokens. A shorter sequence is right-padded with 0 in both ``input_ids``
    and ``attention_mask``.

    Returns:
        The tokenizer's encoding with ``input_ids`` and ``attention_mask``
        replaced by the fixed-length lists.
    """
    encoding = tokenizer(text, padding=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    if len(input_ids) > max_length:
        # Split the budget so that head + tail is exactly max_length, also for
        # odd values. Explicit start indices avoid the ``[-0:]`` pitfall,
        # which would return the whole list.
        tail_length = max_length // 2
        head_length = max_length - tail_length
        encoding['input_ids'] = input_ids[:head_length] + input_ids[len(input_ids) - tail_length:]
        encoding['attention_mask'] = (attention_mask[:head_length]
                                      + attention_mask[len(attention_mask) - tail_length:])
    else:
        encoding['input_ids'] = input_ids + [0] * (max_length - len(input_ids))
        encoding['attention_mask'] = attention_mask + [0] * (max_length - len(attention_mask))
    return encoding


def tokenizer_head(text, tokenizer, max_legnth):
    """Encode ``text`` to ``max_legnth`` tokens, keeping the head.

    A sequence longer than ``max_legnth`` keeps its first ``max_legnth - 1``
    tokens followed by its last token. A shorter sequence is right-padded with
    0 in both ``input_ids`` and ``attention_mask``.

    The misspelled parameter name is kept for backward compatibility.
    """
    encoding = tokenizer(text, padding=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    if len(input_ids) > max_legnth:
        encoding['input_ids'] = input_ids[:max_legnth - 1] + input_ids[-1:]
        encoding['attention_mask'] = attention_mask[:max_legnth - 1] + attention_mask[-1:]
    else:
        # Pad to the length passed in. The previous code read a module-level
        # ``max_length`` here, which only worked when that global happened to exist.
        encoding['input_ids'] = input_ids + [0] * (max_legnth - len(input_ids))
        encoding['attention_mask'] = attention_mask + [0] * (max_legnth - len(attention_mask))
    return encoding


def tokenizer_tail(text, tokenizer, max_legnth):
    """Encode ``text`` to ``max_legnth`` tokens, keeping the tail.

    A sequence longer than ``max_legnth`` keeps its first token followed by
    its last ``max_legnth - 1`` tokens. A shorter sequence is right-padded
    with 0 in both ``input_ids`` and ``attention_mask``.

    The misspelled parameter name is kept for backward compatibility.
    """
    encoding = tokenizer(text, padding=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    if len(input_ids) > max_legnth:
        # Explicit start indices: ``[-max_legnth + 1:]`` becomes ``[0:]`` (the
        # whole list) when max_legnth is 1.
        tail_length = max_legnth - 1
        encoding['input_ids'] = input_ids[:1] + input_ids[len(input_ids) - tail_length:]
        encoding['attention_mask'] = attention_mask[:1] + attention_mask[len(attention_mask) - tail_length:]
    else:
        # Pad to the length passed in. The previous code read a module-level
        # ``max_length`` here, which only worked when that global happened to exist.
        encoding['input_ids'] = input_ids + [0] * (max_legnth - len(input_ids))
        encoding['attention_mask'] = attention_mask + [0] * (max_legnth - len(attention_mask))
    return encoding

def tokenizer_mid(text, tokenizer, max_legnth):
    """Encode ``text`` to ``max_legnth`` tokens, keeping the middle.

    A sequence longer than ``max_legnth`` keeps the centred window of
    ``max_legnth`` tokens. A shorter sequence is right-padded with 0 in both
    ``input_ids`` and ``attention_mask``.

    The misspelled parameter name is kept for backward compatibility.
    """
    encoding = tokenizer(text, padding=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    total = len(input_ids)
    if total > max_legnth:
        # Compute the window once from the ORIGINAL length and apply it to both
        # lists. The previous code re-measured input_ids after truncating it,
        # so the attention mask was cut from the head instead of the middle.
        start = (total - max_legnth) // 2
        end = (total + max_legnth) // 2
        encoding['input_ids'] = input_ids[start:end]
        encoding['attention_mask'] = attention_mask[start:end]
    else:
        # Pad to the length passed in rather than to a module-level ``max_length``.
        encoding['input_ids'] = input_ids + [0] * (max_legnth - total)
        encoding['attention_mask'] = attention_mask + [0] * (max_legnth - len(attention_mask))
    return encoding


def save(model, optimizer, PATH, index):
    """Write the model and optimizer state dicts to ``PATH/epoch_best.pt``.

    Args:
        model: object whose ``state_dict()`` is stored under ``model_state_dict``.
        optimizer: object whose ``state_dict()`` is stored under ``optimizer_state_dict``.
        PATH: target directory; created (including parents) when missing.
        index: accepted for interface compatibility; the file name does not
            depend on it, so every call overwrites the same checkpoint.
    """
    # makedirs also handles nested paths and an already existing directory.
    os.makedirs(PATH, exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, os.path.join(PATH, 'epoch_best.pt'))
    print("保存模型参数")


def load(model, PATH):
    """Load ``model_state_dict`` from the checkpoint at ``PATH`` into ``model``.

    Works both for a plain module and for a wrapper that exposes the real
    module as ``.module`` (such as ``torch.nn.DataParallel``). Loading is
    non-strict, so missing or unexpected keys are ignored.

    Returns:
        The same ``model`` object that was passed in.
    """
    # Map to CPU first so a checkpoint written on a GPU also loads on a
    # CPU-only machine; load_state_dict copies into the model's own device.
    checkpoint = torch.load(PATH, map_location='cpu')
    # The DataParallel wrap is optional in this notebook, so do not assume
    # that ``.module`` exists.
    target = model.module if hasattr(model, 'module') else model
    target.load_state_dict(checkpoint['model_state_dict'], False)
    print("加载further pretrained模型成功")
    return model


def evl(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print classification metrics.

    Each batch is moved to the module-level ``device``. The sigmoid of the
    model output is used as score and 0.5 as decision threshold. Accuracy,
    precision, recall, F1 and AUC are printed.

    Returns:
        The accuracy.
    """
    y_true = []
    y_score = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids_1 = batch['input_ids_1'].to(device)
            attention_mask_1 = batch['attention_mask_1'].to(device)
            input_ids_2 = batch['input_ids_2'].to(device)
            attention_mask_2 = batch['attention_mask_2'].to(device)
            labels = batch['labels'].to(device)

            out = torch.sigmoid(model(input_ids_1, attention_mask_1, input_ids_2, attention_mask_2))

            # Flatten instead of calling .item() so batches with more than one
            # sample are handled as well.
            for true_value, score in zip(labels.reshape(-1).tolist(), out.reshape(-1).tolist()):
                y_true.append(true_value)
                y_score.append(score)
                print("y_true={}, y_score={}".format(true_value, score))

    fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_score, pos_label=1)
    auc_ = auc(fpr, tpr)
    y_pred = [1 if p >= 0.5 else 0 for p in y_score]
    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    prc = precision_score(y_true=y_true, y_pred=y_pred)
    rc = recall_score(y_true=y_true, y_pred=y_pred)
    # F1 is 0 when precision and recall are both 0; guard the division so that
    # case does not produce NaN or raise.
    f1 = 2 * prc * rc / (prc + rc) if (prc + rc) > 0 else 0.0
    print('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (acc, prc, rc, f1, auc_))
    return acc


def train(model, train_loader, test_loader, optim, loss_function, max_epoch, start_epoch, data_id):
    """Train ``model`` and keep the checkpoint with the best evaluation accuracy.

    Epochs below ``start_epoch`` are skipped. After every epoch the mean
    training loss is printed, the model is evaluated with ``evl`` and, when
    the accuracy improves on the best so far, saved with ``save`` to
    ``config.model_save_path``. Batches are moved to the module-level
    ``device``. ``data_id`` is accepted but not used.
    """
    best_acc = 0
    input_keys = ('input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2')
    print('-------------- start training ---------------', '\n')
    for epoch in range(max(start_epoch, 0), max_epoch):
        print("========= epoch:", epoch, '==============')
        epoch_losses = []
        for step, batch in enumerate(train_loader, start=1):
            # Reset the gradients accumulated by the previous step
            optim.zero_grad()

            inputs = [batch[key].to(device) for key in input_keys]
            labels = batch['labels'].to(device)
            # Forward pass and loss
            out = model(*inputs)
            loss = loss_function(out, labels.float())

            loss_value = loss.item()
            print('[', step, '/', len(train_loader), ']', "loss:", format(loss_value, '.10f'))
            epoch_losses.append(loss_value)

            # Backward pass and parameter update
            loss.backward()
            optim.step()
        # Mean loss of this epoch
        print(np.mean(epoch_losses))

        # Evaluate after every epoch, then switch back to training mode
        model.eval()
        acc = evl(model=model, test_loader=test_loader)
        model.train()
        if acc > best_acc:
            best_acc = acc
            save(model, optim, config.model_save_path, epoch)


if __name__ == '__main__':
    # Configuration, tokenizer (EOS doubles as the padding token) and length limit
    config = Config()
    tokenizer = AutoTokenizer.from_pretrained(config.model_path)
    tokenizer.pad_token = tokenizer.eos_token
    max_length = config.max_length

    def _read_split(path):
        """Load one pickled split, lower-case the texts and invert the labels."""
        with open(path, 'rb') as f:
            texts_1, texts_2, labels = pd.read_pickle(f)
        texts_1 = [text.lower() for text in list(texts_1)]
        texts_2 = [text.lower() for text in list(texts_2)]
        # Stored label 1 becomes 0 and every other label becomes 1
        # (original note: overfitting detection / correct patch detection).
        labels = [0 if label == 1 else 1 for label in list(labels)]
        return texts_1, texts_2, labels

    for i in range(1):
        # Load the data
        data_train_path = config.data_train_path
        data_test_path = config.data_test_path
        train_texts_1, train_texts_2, train_labels = _read_split(data_train_path)
        test_texts_1, test_texts_2, test_labels = _read_split(data_test_path)
        print("训练集:", len(train_labels))
        print("测试集:", len(test_labels))

        # Truncation function selected by config.cutMethod
        tokenizer_func = {'headTail': tokenizer_head_tail, 'head': tokenizer_head, 'tail': tokenizer_tail, 'mid': tokenizer_mid}
        cut_func = tokenizer_func[config.cutMethod]
        train_dataset = MyDataset(cut_func, tokenizer, max_length, train_texts_1, train_texts_2, train_labels)
        test_dataset = MyDataset(cut_func, tokenizer, max_length, test_texts_1, test_texts_2, test_labels)

        # Data loaders for training and evaluation
        train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=config.test_batch_size, shuffle=True)

        # Build the model and move it to the configured device.
        # Multi-GPU DataParallel wrapping is currently disabled.
        model = Model(config)
        device = config.device
        model.to(device)
        # Resume from an existing checkpoint when requested
        if config.start_epoch > 0:
            model = load(model, config.pretrained_model_path)
        # Training mode
        model.train()
        max_epoch = config.num_epoch
        start_epoch = config.start_epoch
        # Optimizer and loss
        optim = AdamW(model.parameters(), lr=5e-5)
        loss_function = torch.nn.BCEWithLogitsLoss()

        # Start training
        train(model=model, train_loader=train_loader, test_loader=test_loader, optim=optim, loss_function=loss_function,
              max_epoch=max_epoch, start_epoch=start_epoch, data_id=str(i))








训练集: 1815
测试集: 454


config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]



-------------- start training --------------- 

[ 1 / 1815 ] loss: 0.5170321465
[ 2 / 1815 ] loss: 1.1076493263
[ 3 / 1815 ] loss: 0.4292587936
[ 4 / 1815 ] loss: 0.4283834994
[ 5 / 1815 ] loss: 0.3963963985
[ 6 / 1815 ] loss: 1.2995798588
[ 7 / 1815 ] loss: 1.2193784714
[ 8 / 1815 ] loss: 1.1656150818
[ 9 / 1815 ] loss: 1.0323544741
[ 10 / 1815 ] loss: 0.5087714791
[ 11 / 1815 ] loss: 0.8811730146
[ 12 / 1815 ] loss: 0.7511590719
[ 13 / 1815 ] loss: 0.6825889349
[ 14 / 1815 ] loss: 0.5940843225
[ 15 / 1815 ] loss: 0.9782400727
[ 16 / 1815 ] loss: 0.4429288805
[ 17 / 1815 ] loss: 1.0815622807
[ 18 / 1815 ] loss: 1.1139416695
[ 19 / 1815 ] loss: 1.1626797915
[ 20 / 1815 ] loss: 1.1252602339
[ 21 / 1815 ] loss: 1.0390474796
[ 22 / 1815 ] loss: 0.9363522530
[ 23 / 1815 ] loss: 0.8767023087
[ 24 / 1815 ] loss: 0.8543988466
[ 25 / 1815 ] loss: 0.6751266122
[ 26 / 1815 ] loss: 0.7144076824
[ 27 / 1815 ] loss: 0.6871364117
[ 28 / 1815 ] loss: 0.5848787427
[ 29 / 1815 ] loss: 0.7375845313
[ 30

Token indices sequence length is longer than the specified maximum sequence length for this model (28862 > 16384). Running this sequence through the model will result in indexing errors


[ 72 / 1815 ] loss: 0.5031528473
[ 73 / 1815 ] loss: 0.5081035495
[ 74 / 1815 ] loss: 0.5071948171
[ 75 / 1815 ] loss: 0.5020895600
[ 76 / 1815 ] loss: 0.4625045061
[ 77 / 1815 ] loss: 0.5317173004
[ 78 / 1815 ] loss: 0.6539822221
[ 79 / 1815 ] loss: 0.9199340343
[ 80 / 1815 ] loss: 1.0473511219
[ 81 / 1815 ] loss: 0.9192110300
[ 82 / 1815 ] loss: 0.7171917558
[ 83 / 1815 ] loss: 0.5156245232
[ 84 / 1815 ] loss: 0.4354316592
[ 85 / 1815 ] loss: 0.5269269347
[ 86 / 1815 ] loss: 0.9063034058
[ 87 / 1815 ] loss: 0.6431201696
[ 88 / 1815 ] loss: 0.5426645279
[ 89 / 1815 ] loss: 0.4374531806
[ 90 / 1815 ] loss: 0.6701725125
[ 91 / 1815 ] loss: 0.7366411090
[ 92 / 1815 ] loss: 0.8331606388
[ 93 / 1815 ] loss: 0.8673111200
[ 94 / 1815 ] loss: 0.6623965502
[ 95 / 1815 ] loss: 0.6038627625
[ 96 / 1815 ] loss: 0.5986911654
[ 97 / 1815 ] loss: 0.8337820768
[ 98 / 1815 ] loss: 0.4650237560
[ 99 / 1815 ] loss: 0.7261592746
[ 100 / 1815 ] loss: 0.9382820129
[ 101 / 1815 ] loss: 0.9323613644
[ 102 / 

In [None]:
from transformers import AutoTokenizer,AutoModel
import transformers
import torch

# Checkpoint to inspect; other checkpoints can be substituted here. Kept in
# its own name so ``model`` always refers to the loaded model object.
model_name = "codellama/CodeLlama-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.eos_token_id)

model = AutoModel.from_pretrained(model_name)

# Input text
text = "This is an example sentence."

# Encode the input text
input_ids = tokenizer.encode(text, return_tensors="pt", padding=True)
print(input_ids)
print(input_ids.shape)
# Run the model without tracking gradients
with torch.no_grad():
    outputs = model(input_ids)

# Hidden states of the last layer
last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)

# Id of the EOS token
eos_token_id = tokenizer.eos_token_id

# Locate EOS in the input. nonzero() returns one (row, column) pair per match,
# and the column is the position inside the sequence.
eos_matches = (input_ids == eos_token_id).nonzero()
print(eos_matches.shape)
print(eos_matches)
if eos_matches.shape[0] > 0:
    eos_position = eos_matches[0, 1].item()
else:
    # The encoded text may contain no EOS token at all; fall back to the last
    # position instead of failing on an empty match list.
    eos_position = input_ids.shape[1] - 1

# Last-layer hidden vector at the EOS (or fallback) position
eos_hidden_vector = last_hidden_state[:, eos_position, :]

print("EOS hidden vector:", eos_hidden_vector)


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

2


config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer, utils

from torch.utils.data import DataLoader
import torch
import pandas as pd
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, precision_score


def tokenizer_head_tail(text, tokenizer, max_length):
    """Encode ``text`` to exactly ``max_length`` tokens, keeping head and tail.

    A sequence longer than ``max_length`` keeps its first
    ``max_length - max_length // 2`` tokens and its last ``max_length // 2``
    tokens. A shorter sequence is right-padded with 0 in both ``input_ids``
    and ``attention_mask``.

    Returns:
        The tokenizer's encoding with ``input_ids`` and ``attention_mask``
        replaced by the fixed-length lists.
    """
    encoding = tokenizer(text, padding=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    if len(input_ids) > max_length:
        # Split the budget so that head + tail is exactly max_length, also for
        # odd values. Explicit start indices avoid the ``[-0:]`` pitfall,
        # which would return the whole list.
        tail_length = max_length // 2
        head_length = max_length - tail_length
        encoding['input_ids'] = input_ids[:head_length] + input_ids[len(input_ids) - tail_length:]
        encoding['attention_mask'] = (attention_mask[:head_length]
                                      + attention_mask[len(attention_mask) - tail_length:])
    else:
        encoding['input_ids'] = input_ids + [0] * (max_length - len(input_ids))
        encoding['attention_mask'] = attention_mask + [0] * (max_length - len(attention_mask))
    return encoding


def tokenizer_head(text, tokenizer, max_legnth):
    """Encode ``text`` to ``max_legnth`` tokens, keeping the head.

    A sequence longer than ``max_legnth`` keeps its first ``max_legnth - 1``
    tokens followed by its last token. A shorter sequence is right-padded with
    0 in both ``input_ids`` and ``attention_mask``.

    The misspelled parameter name is kept for backward compatibility.
    """
    encoding = tokenizer(text, padding=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    if len(input_ids) > max_legnth:
        encoding['input_ids'] = input_ids[:max_legnth - 1] + input_ids[-1:]
        encoding['attention_mask'] = attention_mask[:max_legnth - 1] + attention_mask[-1:]
    else:
        # Pad to the length passed in. The previous code read a module-level
        # ``max_length`` here, which only worked when that global happened to exist.
        encoding['input_ids'] = input_ids + [0] * (max_legnth - len(input_ids))
        encoding['attention_mask'] = attention_mask + [0] * (max_legnth - len(attention_mask))
    return encoding


def tokenizer_tail(text, tokenizer, max_legnth):
    """Encode ``text`` to ``max_legnth`` tokens, keeping the tail.

    A sequence longer than ``max_legnth`` keeps its first token followed by
    its last ``max_legnth - 1`` tokens. A shorter sequence is right-padded
    with 0 in both ``input_ids`` and ``attention_mask``.

    The misspelled parameter name is kept for backward compatibility.
    """
    encoding = tokenizer(text, padding=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    if len(input_ids) > max_legnth:
        # Explicit start indices: ``[-max_legnth + 1:]`` becomes ``[0:]`` (the
        # whole list) when max_legnth is 1.
        tail_length = max_legnth - 1
        encoding['input_ids'] = input_ids[:1] + input_ids[len(input_ids) - tail_length:]
        encoding['attention_mask'] = attention_mask[:1] + attention_mask[len(attention_mask) - tail_length:]
    else:
        # Pad to the length passed in. The previous code read a module-level
        # ``max_length`` here, which only worked when that global happened to exist.
        encoding['input_ids'] = input_ids + [0] * (max_legnth - len(input_ids))
        encoding['attention_mask'] = attention_mask + [0] * (max_legnth - len(attention_mask))
    return encoding


def tokenizer_mid(text, tokenizer, max_legnth):
    """Encode ``text`` to ``max_legnth`` tokens, keeping the middle.

    A sequence longer than ``max_legnth`` keeps the centred window of
    ``max_legnth`` tokens. A shorter sequence is right-padded with 0 in both
    ``input_ids`` and ``attention_mask``.

    The misspelled parameter name is kept for backward compatibility.
    """
    encoding = tokenizer(text, padding=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    total = len(input_ids)
    if total > max_legnth:
        # Compute the window once from the ORIGINAL length and apply it to both
        # lists. The previous code re-measured input_ids after truncating it,
        # so the attention mask was cut from the head instead of the middle.
        start = (total - max_legnth) // 2
        end = (total + max_legnth) // 2
        encoding['input_ids'] = input_ids[start:end]
        encoding['attention_mask'] = attention_mask[start:end]
    else:
        # Pad to the length passed in rather than to a module-level ``max_length``.
        encoding['input_ids'] = input_ids + [0] * (max_legnth - total)
        encoding['attention_mask'] = attention_mask + [0] * (max_legnth - len(attention_mask))
    return encoding


def load(model, PATH):
    """Restore ``model`` from the ``model_state_dict`` entry of the checkpoint at ``PATH``.

    Loading is strict (the ``load_state_dict`` default). The same ``model``
    object is returned.
    """
    state = torch.load(PATH)['model_state_dict']
    model.load_state_dict(state)
    return model


data=[]
def _first_divergence_rank(ranked_lines, other_sentences):
    """Return the first index where ``ranked_lines[i][0]`` differs from
    ``other_sentences[i]`` (or ``other_sentences`` has run out), else -1."""
    for idx, entry in enumerate(ranked_lines):
        if idx >= len(other_sentences) or entry[0] != other_sentences[idx]:
            return idx
    return -1


def _f1_from_precision_recall(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are zero."""
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * precision * recall / total


def test(model, test_loader, device):
    """Evaluate ``model`` on ``test_loader`` and print classification metrics.

    For each batch the model output is passed through a sigmoid to obtain a
    score. The attention-ranked line lists of both inputs are then compared to
    find the first rank at which they differ, and the two relative ranks are
    averaged into an "attention score" that can override the thresholded
    prediction for the ``*_new`` metrics.

    Args:
        model: Called as ``model(ids_1, mask_1, ids_2, mask_2,
            output_attentions=True, output_tokens=True)`` and expected to
            return ``(out, bug_attentions, patch_attentions, bug_tokens,
            patch_tokens)``.
        test_loader: Iterable of batches with keys ``input_ids_1``,
            ``attention_mask_1``, ``input_ids_2``, ``attention_mask_2`` and
            ``labels``. ``labels.item()`` / ``out.item()`` are used, so each
            batch must hold exactly one sample.
        device: Device the batch tensors are moved to.

    Side effects:
        Prints progress and metrics, and appends the first-divergence rank of
        the first input's line list to the module-level ``data`` list.
    """
    y_true = []
    y_score = []
    attention_score = []
    # Override thresholds: an attention score above p_top forces label 1,
    # below p_bottom forces label 0; anything else falls back to the score.
    p_top, p_bottom = 0.5, -1

    # Inference must run in eval mode so layers such as dropout are disabled.
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            input_ids_1 = batch['input_ids_1'].to(device)
            attention_mask_1 = batch['attention_mask_1'].to(device)
            input_ids_2 = batch['input_ids_2'].to(device)
            attention_mask_2 = batch['attention_mask_2'].to(device)
            labels = batch['labels'].to(device)

            out, bug_attentions, patch_attentions, bug_tokens, patch_tokens = model(
                input_ids_1, attention_mask_1,
                input_ids_2, attention_mask_2,
                output_attentions=True,
                output_tokens=True)

            out = torch.sigmoid(out)

            y_true.append(labels.item())
            y_score.append(out.item())

            # Per-line attention ranking for each of the two inputs.
            bug_attention_result = cal_attention(attentions=bug_attentions[0][0], tokens=bug_tokens[0])
            patch_attention_result = cal_attention(attentions=patch_attentions[0][0], tokens=patch_tokens[0])

            bug_sentences = [x[0] for x in bug_attention_result]
            patch_sentences = [x[0] for x in patch_attention_result]

            print("-------------------------")

            patch_attention_rank = _first_divergence_rank(patch_attention_result, bug_sentences)
            if patch_attention_rank >= 0:
                print(patch_attention_rank, "/", len(patch_attention_result))

            bug_attention_rank = _first_divergence_rank(bug_attention_result, patch_sentences)
            if bug_attention_rank >= 0:
                print(bug_attention_rank, "/", len(bug_attention_result))
                data.append(bug_attention_rank)

            if bug_attention_rank < 0 or patch_attention_rank < 0:
                # No divergence found on at least one side: neutral score.
                attention_score.append(0.5)
            else:
                attention_score.append(
                    (bug_attention_rank / len(bug_attention_result)
                     + patch_attention_rank / len(patch_attention_result)) / 2)
            print(attention_score[-1])

            # Attention visualisation (optional):
            # from bertviz import model_view
            # model_view(patch_attentions[0], patch_tokens[0])

    fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_score, pos_label=1)
    auc_ = auc(fpr, tpr)
    y_pred = [1 if x >= 0.5 else 0 for x in y_score]

    y_pred_new = []
    for att, score in zip(attention_score, y_score):
        if att > p_top:
            y_pred_new.append(1)
        elif att < p_bottom:
            y_pred_new.append(0)
        elif score >= 0.5:
            y_pred_new.append(1)
        else:
            y_pred_new.append(0)

    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    prc = precision_score(y_true=y_true, y_pred=y_pred)
    rc = recall_score(y_true=y_true, y_pred=y_pred)
    f1 = _f1_from_precision_recall(prc, rc)

    acc_new = accuracy_score(y_true=y_true, y_pred=y_pred_new)
    prc_new = precision_score(y_true=y_true, y_pred=y_pred_new)
    rc_new = recall_score(y_true=y_true, y_pred=y_pred_new)
    # Must be derived from the *_new precision/recall, not the baseline ones.
    f1_new = _f1_from_precision_recall(prc_new, rc_new)

    print('Accuracy: %f -- Precision: %f -- Recall: %f -- F1: %f -- AUC: %f' % (acc, prc, rc, f1, auc_))
    # AUC is computed from the raw scores, so the same value is reported here.
    print('Accuracy_new: %f -- Precision_new: %f -- Recall_new: %f -- F1_new: %f -- AUC_new: %f' % (acc_new, prc_new, rc_new, f1_new, auc_))


def cal_attention(attentions, tokens):
    """Turn raw attention values and tokens into a ranked list of line scores.

    Args:
        attentions: Indexable collection; each element is itself summed with
            the builtin ``sum`` and the results are added together.
            NOTE(review): presumably one layer's per-head attention matrices
            for a single sample; confirm against the model's output.
        tokens: Token strings aligned with the resulting per-token scores.

    Returns:
        The output of ``get_all_lines_score``: ``[line, score, line_idx]``
        entries sorted by descending score.
    """
    # Ordered marker substitutions; the order matters because " Ä" can only
    # appear after "Ġ" has been turned into a space.
    substitutions = (
        ("Ġ", " "),
        ("ĉ", "Ċ"),
        ("<s>", ""),
        ("</s>", ""),
        ("<pad>", ""),
        (" Ä", ""),
        ("¡", " "),
    )
    cleaned_tokens = []
    for tok in tokens:
        for old, new in substitutions:
            tok = tok.replace(old, new)
        cleaned_tokens.append(tok)

    # Collapse each element to one vector, then accumulate across elements.
    per_token_total = None
    for group in attentions:
        collapsed = sum(group)
        if per_token_total is None:
            per_token_total = collapsed
        else:
            per_token_total += collapsed

    # Zero out the scores that belong to the leading/trailing special tokens.
    per_token_total = clean_special_token_values(per_token_total, padding=True)
    token_scores = get_word_att_scores(all_tokens=cleaned_tokens, att_scores=per_token_total)
    return get_all_lines_score(token_scores)


def clean_special_token_values(all_values, padding=False):
    """Zero the scores of the leading and trailing special tokens in place.

    Args:
        all_values: Mutable, indexable sequence of per-token scores.
        padding: When true, the trailing special token is taken to be the last
            non-zero entry (everything after it is assumed to be zero-valued
            padding); otherwise it is the final element.

    Returns:
        The same ``all_values`` object after modification. Empty input and
        input with no non-zero entry left are returned without raising.
    """
    if len(all_values) == 0:
        return all_values

    # Special token at the beginning of the sequence.
    all_values[0] = 0
    if padding:
        # Walk backwards to the last non-zero score and clear it; if there is
        # none, there is nothing more to clear.
        for idx in range(len(all_values) - 1, -1, -1):
            if all_values[idx] != 0:
                all_values[idx] = 0
                break
    else:
        # Special token at the end of the sequence.
        all_values[-1] = 0
    return all_values


def get_word_att_scores(all_tokens: list, att_scores: list) -> list:
    """Pair every token with the score stored at the same position.

    Returns a list of ``[token, score]`` lists, one per entry of
    ``all_tokens``; ``att_scores`` is indexed by position, so it must be at
    least as long as ``all_tokens``.
    """
    return [[tok, att_scores[pos]] for pos, tok in enumerate(all_tokens)]


def get_all_lines_score(word_att_scores: list):
    """Group token scores into lines and rank the lines by mean score.

    Args:
        word_att_scores: ``[[token, score], ...]``; scores must support
            ``.item()`` once accumulated (e.g. 0-d tensors).

    Returns:
        A list of ``[line_text, score, line_idx]`` sorted by ``score`` in
        descending order. ``score`` is the accumulated token score (including
        the token that closes the line) divided by the distance between this
        closing position and the previous one.
    """
    line_breaks = ("Ċ", " Ċ", "ĊĊ", " ĊĊ", "ĭ")
    final_pos = len(word_att_scores) - 1

    ranked = []
    running_total = 0
    text_so_far = ""
    prev_close_pos = 0

    for pos, entry in enumerate(word_att_scores):
        piece = entry[0]
        is_break = piece in line_breaks
        # A line is closed by a separator or by the very last token, but only
        # if something has been accumulated for it.
        if (is_break or pos == final_pos) and running_total != 0:
            running_total += entry[1]
            span = pos - prev_close_pos
            # The line index is simply how many lines were emitted so far.
            ranked.append([text_so_far, float(running_total.item()) / span, len(ranked)])
            text_so_far = ""
            running_total = 0
            prev_close_pos = pos
        elif not is_break:
            # Ordinary token: extend the current line and its score.
            text_so_far += piece
            running_total += entry[1]

    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked


if __name__ == '__main__':
    # Configuration
    config = Config()
    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config.model_path)
    # Maximum model input length
    max_length = config.max_length

    data_test_path = config.data_test_path
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_test_path, 'rb') as f:
        test_texts_1, test_texts_2, test_labels, test_names = pd.read_pickle(f)

    # Post-processing does not need the file handle, so it runs after close.
    test_texts_1 = [text.lower() for text in list(test_texts_1)]
    test_texts_2 = [text.lower() for text in list(test_texts_2)]
    test_names = list(test_names)
    # Flip the label polarity (overfitting detection vs. correct-patch
    # detection): 1 becomes 0, everything else becomes 1.
    test_labels = [0 if label == 1 else 1 for label in list(test_labels)]

    tokenizer_func = {'headTail': tokenizer_head_tail, 'head': tokenizer_head, 'tail': tokenizer_tail,
                      'mid': tokenizer_mid}
    test_dataset = MyDataset(tokenizer_func[config.cutMethod], tokenizer, max_length, test_texts_1, test_texts_2,
                             test_labels, test_names)

    # Evaluation DataLoader. Shuffling is disabled: it does not affect the
    # metrics but makes the printed log and the collected `data` list
    # irreproducible between runs.
    test_loader = DataLoader(test_dataset, batch_size=config.test_batch_size, shuffle=False)

    # Model
    model = Model(config, tokenizer=tokenizer)
    model = load(model, config.model_test_path)
    # Select GPU/CPU
    device = config.device
    model.to(device)
#     # Multi-GPU data parallelism
#     model = torch.nn.DataParallel(model, device_ids=config.device_ids)

    # Run evaluation
    test(model, test_loader, device=device)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# # 生成一组随机数据
# data = np.random.randn(100)
fig, ax = plt.subplots()

# One-dimensional scatter: x is the position inside `data`, y is the value.
positions = range(len(data))
ax.scatter(positions, data)

# Axis labels and integer y ticks from 0 to 29.
ax.set_xlabel('Index')
ax.set_ylabel('line')
ax.set_yticks(range(30))

# Render the figure.
plt.show()