In [None]:
import os
import warnings
# Limit CUDA to device 0 for this kernel. The variable is set in this first
# cell, i.e. before torch is imported by the cell that follows.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Optional: uncomment the next line to silence library warnings.
# warnings.filterwarnings('ignore')

In [None]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
from peft import PeftModel

In [None]:
# Load the base LLaMA-13B checkpoint from the local directory. 8-bit loading
# is requested, float16 is passed as the torch dtype, and device placement is
# delegated to device_map='auto'.
model = LlamaForCausalLM.from_pretrained(
    '../llama-13b/',
    torch_dtype=torch.float16,
    device_map='auto',
    load_in_8bit=True,
)

In [None]:
# Attach the LoRA adapter stored under lora_model_path to the base model.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell
# without re-running the base-model cell would wrap the wrapper again.
lora_model_path = '../models/GOAT_001_13B_Lora/'
model = PeftModel.from_pretrained(
    model,
    lora_model_path,
    device_map={'': 0},
    torch_dtype=torch.float16,
)

In [None]:
# The tokenizer is read from the same directory as the base checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_model_name_or_path='../llama-13b/',
)

In [None]:
# copied from fastchat/train.py
def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):
    """Add special tokens to the tokenizer and resize the model embeddings.

    The tokens in ``special_tokens_dict`` are registered with ``tokenizer`` and
    the model's token embeddings are resized to ``len(tokenizer)``. When at
    least one token was actually added, the trailing rows of both the input
    and the output embedding weights (one row per new token) are overwritten
    with the mean of all preceding rows.

    Note: This is the unoptimized version that may make your embedding size
    not be divisible by 64.

    Args:
        special_tokens_dict: Mapping passed to ``tokenizer.add_special_tokens``.
        tokenizer: Object providing ``add_special_tokens`` and ``__len__``.
        model: Object providing ``resize_token_embeddings``,
            ``get_input_embeddings`` and ``get_output_embeddings``.

    Returns:
        None. ``tokenizer`` and ``model`` are modified in place.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    # Nothing new was registered: the existing rows are left as they are.
    if added <= 0:
        return

    # Fetch both weight tensors up front, then patch them one after another.
    weight_tables = (
        model.get_input_embeddings().weight.data,
        model.get_output_embeddings().weight.data,
    )
    for table in weight_tables:
        # The mean is taken over the rows that precede the new tokens; the
        # (1, dim) result broadcasts across all freshly added rows.
        table[-added:] = table[:-added].mean(dim=0, keepdim=True)

# 2023.04.06: register a dedicated '[PAD]' token and resize the embeddings so
# the model matches the enlarged tokenizer.
smart_tokenizer_and_embedding_resize(
    special_tokens_dict={'pad_token': '[PAD]'},
    tokenizer=tokenizer,
    model=model,
)
# Map the eos / bos / unk roles onto the same "</s>" string.
# NOTE(review): all three roles share one token here; presumably this mirrors
# the tokenizer setup used when the adapter was trained -- confirm.
add_token = "</s>"
tokenizer.add_special_tokens(
    dict.fromkeys(("eos_token", "bos_token", "unk_token"), add_token)
)

In [None]:
# 2023.04.04: prompt construction for supervised training data
def generate_alpaca_prompt(example):
    """Render an Alpaca-style record as a chat-format prompt.

    Args:
        example: Mapping with 'instruction', 'input' and 'output' entries.
            When 'input' is falsy (e.g. an empty string) only the instruction
            is placed in the human turn; otherwise the input follows the
            instruction, separated by a single space.

    Returns:
        dict: ``{'example': (prompt_with_answer, prompt_only)}``.
        ``prompt_only`` ends with ``"### Assistant: "`` and
        ``prompt_with_answer`` is that same text followed by the output.
    """
    system_prompt = (
        "A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to human's question."
    )

    # 'input' is looked up first so a malformed record fails on the same key
    # as it always did.
    extra_context = example['input']
    if extra_context:
        human_turn = f"{example['instruction']} {extra_context}"
    else:
        human_turn = f"{example['instruction']}"

    prompt = f"{system_prompt}\n### Human: {human_turn}\n### Assistant: "
    answer = f"{example['output']}"
    return {'example': (prompt + answer, prompt)}

In [None]:
# Build the prompt for a single demo instruction. The output field is empty,
# so the rendered text ends right after "### Assistant: ". Element [0] of the
# returned pair is the prompt followed by the (empty) answer.
text = generate_alpaca_prompt({
    "instruction": "编辑以下句子并使其更自然。我想告诉你，我昨天晚上走了一条适合散步的小路。",
    "input": "",
    "output": "",
})['example'][0]

In [None]:
# Tokenize the prompt into PyTorch tensors, then move the token ids to the GPU.
inputs = tokenizer(text, return_tensors='pt')
input_ids = inputs['input_ids'].to('cuda')

In [None]:
from transformers import GenerationConfig

# Decoding settings. temperature / top_p / top_k are sampling parameters and
# only take effect when do_sample=True; do_sample defaults to False, so
# without it these three values were silently ignored and plain beam search
# was run. It is enabled explicitly so the configuration behaves as written
# (beam search with multinomial sampling across 4 beams).
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=50,
    num_beams=4,
    max_new_tokens=256,
    repetition_penalty=2.0,
)

# Inference only: make sure dropout layers are in evaluation mode.
model.eval()

# Sampling is stochastic; fix the seed so re-running the notebook reproduces
# the same completion.
torch.manual_seed(0)

with torch.no_grad():
    preds = model.generate(
        input_ids=input_ids,
        # Pass the mask produced by the tokenizer (`inputs` comes from the
        # tokenization cell) instead of letting generate() infer one.
        attention_mask=inputs['attention_mask'].to(input_ids.device),
        generation_config=generation_config,
    )

In [None]:
# Turn every generated id sequence back into text. Special tokens are kept
# and each decoded string starts with the prompt that was fed in.
output = [tokenizer.decode(sequence) for sequence in preds]
output