###

### 模型下载到本地

In [2]:
import torch

# Report which PyTorch build is active and where it was imported from.
torch_version = torch.__version__
torch_cuda_build = torch.version.cuda
print("Torch version:", torch_version, torch_cuda_build)
print("Torch file:", torch.__file__)

# functorch / vmap availability check
has_top_level_vmap = hasattr(torch, "vmap")
print("Has torch.vmap:", has_top_level_vmap)
try:
    import torch._functorch.apis as apis
    has_apis_vmap = hasattr(apis, "vmap")
    print("has functorch.apis.vmap:", has_apis_vmap)
except Exception as import_err:
    # Best-effort probe: report the failure instead of aborting the cell.
    print("functorch.apis import error:", import_err)

import transformers

# Report which transformers build is active and where it was imported from.
transformers_version = transformers.__version__
print("Transformers version:", transformers_version)
print("Transformers file:", transformers.__file__)


Torch version: 2.1.2+cu121 12.1
Torch file: /usr/local/lib/python3.10/dist-packages/torch/__init__.py
Has torch.vmap: True
has functorch.apis.vmap: True
Transformers version: 4.32.0
Transformers file: /usr/local/lib/python3.10/dist-packages/transformers/__init__.py


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

# Summarise the PyTorch build and, when a CUDA device is usable,
# the CUDA / cuDNN versions PyTorch reports.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    cuda_version = torch.version.cuda
    cudnn_version = torch.backends.cudnn.version()
    print(f"CUDA version: {cuda_version}")
    print(f"CUDNN version: {cudnn_version}")


PyTorch version: 2.1.2+cu121
CUDA available: True
CUDA version: 12.1
CUDNN version: 8902


SyntaxError: invalid syntax (1618472228.py, line 1)

### 模型加载

In [5]:
import os
import sys

# Hub-related environment variables are read when huggingface_hub /
# transformers are imported, so they are set BEFORE those imports here.
# NOTE(review): if either library was already imported earlier in this kernel
# session, these assignments come too late to affect it; the explicit
# `local_files_only=True` below is what keeps loading off the network.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'  # mirror endpoint for any hub download
os.environ['TRANSFORMERS_OFFLINE'] = '1'  # request offline mode

#from modelscope import snapshot_download
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer

# To download the checkpoint instead of using the local copy:
# model_dir = snapshot_download('qwen/Qwen-VL')
# model_dir = snapshot_download(repo_id='qwen/Qwen-VL-Chat')
model_dir = "models/Qwen-VL"

# Make the Python code shipped in the local model directory importable.
# Guarded so that re-running this cell does not pile up duplicate entries.
if model_dir not in sys.path:
    sys.path.append(model_dir)

# Load the local checkpoint.
# trust_remote_code stays True because the model/tokenizer classes come from
# the code inside model_dir rather than from the transformers package.
tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    trust_remote_code=True,
    local_files_only=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="cuda",
    trust_remote_code=True,
    local_files_only=True,
).eval()

print("Model and tokenizer loaded successfully from local path.")

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Loading checkpoint shards: 100%|██████████| 10/10 [00:46<00:00,  4.65s/it]

Model and tokenizer loaded successfully from local path.





In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch

# Seed torch's global random number generator.
torch.manual_seed(1234)

# Read the generation hyperparameters stored alongside the checkpoint in
# model_dir and attach them to the already-loaded model.
generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True)
model.generation_config = generation_config

### 对话

In [14]:
# First dialogue turn: combine an image reference and a text question
# into a single query string, then ask the model.
prompt_parts = [
    {'image': 'N1_data/image_1.png'},  # either a local path or a URL
    {'text': '描述一下这个画面?'},
]
query = tokenizer.from_list_format(prompt_parts)
print("query:", query)
print("tokenizer:", tokenizer)

# history=None starts a new conversation; the returned history is reused
# by the follow-up turn.
response, history = model.chat(tokenizer, query=query, history=None)
print(response)

query: Picture 1: <img>N1_data/image_1.png</img>
描述一下这个画面?
tokenizer: QWenTokenizer(name_or_path='/mnt/ali-sh-1/usr/yujing1/workspaces/MLLM/Qwen-VL-from-scratch/Qwen-VL/models/Qwen-VL', vocab_size=151860, model_max_length=8192, is_fast=False, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True)
这个画面中，有三个人，其中最显眼的是一个穿着黄色工作服、戴着VR眼镜的男子。这个男子的头发是棕色的，穿着黑色的裤子。画面的背景是一个房间，有两台电脑放在桌子上。


In [13]:
# Second dialogue turn: continue the conversation and ask for a bounding box.
response, history = model.chat(tokenizer, '框出图中戴眼镜的人的人脸', history=history)
print(response)

# A grounded answer is expected to look like:
#   <ref>label</ref><box>(x1,y1),(x2,y2)</box>
# Draw any returned box on the most recent picture in the history.
image = tokenizer.draw_bbox_on_latest_picture(response, history)
if not image:
    print("no box")
else:
    image.save('N1_data/image_1_box.jpg')

<ref>戴眼镜的人人脸</ref><box>(339,175),(607,375)</box>
