In [1]:
import copy
import json
import os
import re
import sys
import argparse
import torch
from tqdm import tqdm
from peft import PeftModel,PeftConfig
from tqdm import tqdm
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer

In [2]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [34]:
with open(r'C:\Jupyter\大模型\DoRA-main\commonsense_reasoning\dataset\openbookqa\test.json', 'r') as file:
    dataset = json.load(file)

In [4]:
dataset[50]

{'instruction': 'Please choose the correct answer to the question: Some berries may be eaten by\n\nAnswer1: a bear or person Answer2: a bear or shark Answer3: a bear or lion Answer4: a bear or wolf\n\nAnswer format: answer1/answer2/answer3/answer4',
 'input': '',
 'output': 'the correct answer is answer1',
 'answer': 'answer1'}

In [5]:
base_model = "C:\Jupyter\Model\Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = 0

In [6]:
model = LlamaForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        device_map="auto",
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
peft_model_id = r"C:\Jupyter\大模型\DoRA-main\commonsense_reasoning\dora"
model = PeftModel.from_pretrained(model, peft_model_id)

In [8]:
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_

In [9]:
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

                ### Instruction:
                {instruction}

                ### Input:
                {input}

                ### Response:
                """
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

                ### Instruction:
                {instruction}

                ### Response:
                """

In [35]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
instructions = [data.get('instruction',input) for data in dataset]
prompts = [generate_prompt(instruction) for instruction in instructions]
generation_config = GenerationConfig()
answers = []
batch_size = 32
for i in tqdm(range(0, len(prompts), batch_size)):
    batch_prompts = prompts[i:i+batch_size]
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True,truncation=True, max_length=256)
    input_ids = inputs["input_ids"].to(device)
    with torch.no_grad():
        generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                max_new_tokens=32,
            )
    outputs = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
    result = [o.split("### Response:")[-1].strip() for o in outputs]
    answers.extend(result)
    torch.cuda.empty_cache()

100%|██████████| 40/40 [01:14<00:00,  1.86s/it]


In [11]:
instructions = [data.get('instruction') for data in dataset]
prompts = [generate_prompt(instruction, input) for instruction in instructions]
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_p=0.75,
    top_k=40,
        )
answers = []

for question in tqdm(prompts):
    inputs = tokenizer(question, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"].to(device)

    with torch.no_grad():
        generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                max_new_tokens=32,
            )
    outputs = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
    result = [o.split("### Response:")[-1].strip() for o in outputs]
    print(result)
    answers.append(result)

  0%|          | 1/500 [00:01<09:17,  1.12s/it]

['the correct answer is answer2']


  0%|          | 2/500 [00:02<09:01,  1.09s/it]

['the correct answer is answer3']


  1%|          | 3/500 [00:03<08:53,  1.07s/it]

['the correct answer is answer3']


  1%|          | 4/500 [00:04<08:49,  1.07s/it]

['the correct answer is answer3']


  1%|          | 5/500 [00:05<08:48,  1.07s/it]

['the correct answer is answer3']


  1%|          | 6/500 [00:06<08:48,  1.07s/it]

['the correct answer is answer3']


  1%|▏         | 7/500 [00:07<08:52,  1.08s/it]

['the correct answer is answer2']





KeyboardInterrupt: 

In [41]:
answers

['the correct answer is option2',
 'the correct answer is option2',
 'the correct answer is option2',
 'the correct answer is option1',
 'the correct answer is option1',
 'the correct answer is option1',
 'the correct answer is option1',
 'the correct answer is option2',
 'the correct answer is option2',
 'the correct answer is option2',
 'the correct answer is option1',
 'the correct answer is option2',
 'the correct answer is option1',
 'the correct answer is option1',
 'the correct answer is option2',
 'the the correct answer is option1',
 'the correct answer is option1',
 'the correct answer is option2',
 'the correct answer is option1',
 'the correct answer is option2',
 'the correct answer is option2',
 'the correct answer is option2',
 'the correct answer is option2',
 'the correct answer is option1',
 'the correct answer is option2',
 'the correct answer is option1',
 'the correct answer is option1',
 'the the correct answer is option1',
 'the correct answer is option1',
 'the 

In [36]:
len(answers),len(dataset)

(1267, 1267)

In [37]:
def extract_answer(sentence: str, dataset="boolq"):
    sentence_ = sentence.strip()
    if dataset == 'boolq':
        
        pred_answers = re.findall(r'true|false', sentence_)
        if pred_answers:
            return pred_answers[0]
    # 你可以根据不同的数据集扩展这里的规则
        return ""
    elif dataset == 'piqa':
        if "Instruction:" in sentence_:
            pred_answer = re.findall(r'1|2', sentence_)
            if pred_answer:
                return "solution"+(pred_answer[-1])
        pred_answers = re.findall(r'solution1|solution2', sentence_)
        if pred_answers:
            return pred_answers[0]
        else:
            pred_answer = re.findall(r'1|2', sentence_)
            if pred_answer:
                return "solution"+(pred_answer[0])
        return ""
    elif dataset in ['social_i_qa', 'ARC-Challenge', 'ARC-Easy', 'openbookqa']:
        if "Instruction:" in sentence_:
            pred_answer = re.findall(r'1|2|3|4|5', sentence_)
            if pred_answer:
                return "answer"+(pred_answer[-1])
        pred_answers = re.findall(r'answer1|answer2|answer3|answer4|answer5', sentence_)
        if not pred_answers:
            pred_answer = re.findall(r'1|2|3|4|5', sentence_)
            if pred_answer:
                return "answer"+(pred_answer[0])
            return ""
        return pred_answers[0]
    elif dataset == 'hellaswag':
        if "Instruction:" in sentence_:
            pred_answer = re.findall(r'1|2|3|4', sentence_)
            if pred_answer:
                return "ending"+(pred_answer[-1])
            return ""
        pred_answers = re.findall(r'ending1|ending2|ending3|ending4', sentence_)
        if not pred_answers:
            pred_answer = re.findall(r'1|2|3|4', sentence_)
            if pred_answer:
                return "ending"+(pred_answer[0])
            return ""
        return pred_answers[0]
    elif dataset == 'winogrande':
        sentence_ = sentence.strip()
        pred_answers = re.findall(r'option1|option2', sentence_)
        if not pred_answers:
            pred_answer = re.findall(r'1|2', sentence_)
            if pred_answer:
                return "option"+(pred_answer[0])
            return ""
        return pred_answers[0]

In [38]:
output_data = []
for data, output in tqdm(zip(dataset, answers)):
    label = data.get('answer')
    predict = extract_answer(output,'winogrande')
    output_data.append({
        'instruction': data["instruction"],
        'output_pred': output,
        'pred': predict,
        'label': label,
        'flag': label == predict
    })
    

1267it [00:00, 633394.89it/s]


In [39]:
output_data[0]

{'instruction': 'Please choose the correct answer to fill in the blank to complete the given sentence: Sarah was a much better surgeon than Maria so _ always got the easier cases.\n\nOption1: Sarah Option2: Maria Answer format: option1/option2',
 'output_pred': 'the correct answer is option2',
 'pred': 'option2',
 'label': 'option2',
 'flag': True}

In [32]:
correct = sum(1 for item in output_data if item['flag'] == True)

In [28]:
# openbookqa
print(correct/len(dataset))

0.794


In [16]:
# ARC-Challenge
print(correct/len(dataset))

0.71160409556314


In [17]:
# ARC-Easy
print(correct/len(dataset))

0.8661616161616161


In [24]:
# winogrande
print(correct/len(dataset))

0.8224151539068666


In [54]:
# hellaswag
print(correct/len(dataset))

0.8897629954192392


In [39]:
# social_i_qa
print(correct/len(dataset))

0.8142272262026612


In [19]:
#piqa
print(correct/len(dataset))

0.8335146898803046


In [18]:
#BoolQ
print(correct/len(dataset))

0.7198776758409786


# dora16

In [55]:
# openbookqa
print(correct/len(dataset))

0.81


In [47]:
# ARC-Challenge
print(correct/len(dataset))

0.7218430034129693


In [39]:
# ARC-Easy
print(correct/len(dataset))

0.8674242424242424


In [29]:
# winogrande
print(correct/len(dataset))

0.8626677190213102


In [18]:
# hellaswag
print(correct/len(dataset))

0.890161322445728


In [40]:
# social_i_qa
print(correct/len(dataset))

0.8142272262026612


In [32]:
#piqa
print(correct/len(dataset))

0.8351468988030468


In [21]:
#BoolQ
print(correct/len(dataset))

0.7195718654434251


In [23]:
with open("evaluate/llama2_7B_Dora+/output_data_dora+_openbookqa.json", "w", encoding="utf-8") as file:
    json.dump(output_data, file, ensure_ascii=False, indent=4)