In [1]:
import os
from modelscope import snapshot_download, AutoModel, AutoTokenizer
from langchain.llms.base import LLM
from typing import Any, List, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

gpu = 7


class LLaMA3_LLM(LLM):
    # 基于本地 llama3 自定义 LLM 类
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, mode_name_or_path: str):
        super().__init__()
        # print("正在从本地加载模型...")
        device = torch.device(f'cuda:{gpu}')
        self.tokenizer = AutoTokenizer.from_pretrained(
            mode_name_or_path, use_fast=False)
        self.model = AutoModelForCausalLM.from_pretrained(
            mode_name_or_path, torch_dtype=torch.bfloat16).to(device)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # print(f"模型加载到设备: {device}")
        # print("完成本地模型的加载")

    def bulid_input(self, prompt, history=[]):
        user_format = 'user\n\n{content}'
        assistant_format = 'assistant\n\n{content}'
        history.append({'role': 'user', 'content': prompt})
        prompt_str = ''
        # 拼接历史对话
        for item in history:
            if item['role'] == 'user':
                prompt_str += user_format.format(content=item['content'])
            else:
                prompt_str += assistant_format.format(content=item['content'])
        return prompt_str

    def _call(self, prompt: str, stop: Optional[List[str]] = None,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any):
        device = torch.device(f'cuda:{gpu}')
        # input_str = self.bulid_input(prompt=prompt)
        input_str = prompt
        input_ids = self.tokenizer.encode(
            input_str, add_special_tokens=False, return_tensors='pt'
        ).to(device)

        # print(f"输入张量加载到设备: {input_ids.device}")
        # print(f"模型所在设备: {self.model.device}")

        outputs = self.model.generate(
            input_ids=input_ids, 
            max_new_tokens=512, 
            do_sample=True,
            top_p=0.9, 
            temperature=0.5, 
            repetition_penalty=1.1, 
            eos_token_id=self.tokenizer.encode('')[0]
        )

        outputs = outputs.tolist()[0][len(input_ids[0]):]
        response = self.tokenizer.decode(outputs).strip().replace('', "").strip()
        return response

    @property
    def _llm_type(self) -> str:
        return "LLaMA3_LLM"


model_dir = snapshot_download('LLM-Research/Meta-Llama-3-8B',
                              cache_dir='/datasets/zhouyufan/modelscope', revision='master')


llm = LLaMA3_LLM(
    mode_name_or_path="/datasets/zhouyufan/modelscope/LLM-Research/Meta-Llama-3-8B-Instruct")

2024-08-20 23:21:36,378 - modelscope - INFO - PyTorch version 2.3.1 Found.
2024-08-20 23:21:36,380 - modelscope - INFO - Loading ast index from /home/zhouyufan/.cache/modelscope/ast_indexer
2024-08-20 23:21:36,416 - modelscope - INFO - Loading done! Current index file version is 1.11.0, with md5 7c57b3ab5796d2ca43c2935f7a0b61c8 and a total number of 953 components indexed
  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.01s/it]


In [2]:
def get_last_line(text):
    # 使用 splitlines() 方法将字符串按行分割，然后获取最后一行
    lines = text.splitlines()
    return lines[-1] if lines else ''
def remove_trailing_char(input_str, char_to_remove):
    return input_str.rstrip(char_to_remove)

title = 'Make Jello Shots'
true_action = 'pour jello powder,pour alcohol,stir mixture'
old_action = 'pour jello powder,stir mixture,pour alcohol'
user_text = (
    f"I need help sorting actions in a specific task. "
    f"For example, if there is a task about frying eggs with three actions, "
    f"which are 'stir fry eggs', 'beat eggs', 'put eggs in the pan', "
    f"the correct order would be: 'beat eggs', 'put eggs in the pan', 'stir fry eggs'. "
    f"Now, I have a task related to {title}. "
    f"I have a sequence of actions: {old_action}. "
    f"Please sort these actions chronologically and logically, "
    f"and provide the correct order in a single line, with each action separated by a comma, "
    f"without adding any extra actions, punctuation, or sentences."
)



response = llm('Do you know China?')

# 获取模型回复
print('question:\n'+user_text)

# 打印回复
print('answer:\n'+response)

print('last line:\n'+get_last_line(response))
# 保存回复到文件
with open('response.txt', 'w') as file:
    file.write(remove_trailing_char(response,'<|end_of_text|><|begin_of_text|>'))



  warn_deprecated(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128000 for open-end generation.


question:
I need help sorting actions in a specific task. For example, if there is a task about frying eggs with three actions, which are 'stir fry eggs', 'beat eggs', 'put eggs in the pan', the correct order would be: 'beat eggs', 'put eggs in the pan', 'stir fry eggs'. Now, I have a task related to Make Jello Shots. I have a sequence of actions: pour jello powder,stir mixture,pour alcohol. Please sort these actions chronologically and logically, and provide the correct order in a single line, with each action separated by a comma, without adding any extra actions, punctuation, or sentences.
answer:
I have a friend who is going there for work. He's been to Beijing and Shanghai, but he wants to see more of the country. Do you have any recommendations?

I'm a bit jealous of your trip! But I'd love to help your friend plan his next adventure in China.

China is a vast and diverse country with a rich history and culture. There are many amazing places to visit beyond Beijing and Shanghai. 

In [1]:
import transformers
import torch

model_id = "/data/zhaobo/zhouyufan/PDPP-Optimize/LLM-Research/Meta-Llama-3-8B-Instruct"

# 指定使用 GPU 0
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map={"": 4}  
)

response = pipeline("Hey how are you doing today?")
print(response)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards:  50%|█████     | 2/4 [22:07<22:07, 663.82s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU  has a total capacity of 23.65 GiB of which 69.38 MiB is free. Process 1035172 has 1.51 GiB memory in use. Process 2649739 has 1.54 GiB memory in use. Process 2649735 has 1.60 GiB memory in use. Process 2989196 has 6.06 GiB memory in use. Including non-PyTorch memory, this process has 12.84 GiB memory in use. Of the allocated memory 12.35 GiB is allocated by PyTorch, and 113.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)