# glm-4-9b-chat + vllm 部署 + 直接调用 
- 速度快，显存在用在 17.5635 GB （vllm）

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from modelscope import AutoTokenizer
from vllm import LLM, SamplingParams
from modelscope import snapshot_download

# GLM-4-9B-Chat
max_model_len, tp_size = 8192, 1
model_name = '/mntdata/wangql43/A000Files/A003Model/ZhipuAI/glm-4-9b-chat/'
prompt = '你好'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = LLM(
    model=model_name,
    tensor_parallel_size=tp_size,
    max_model_len=max_model_len,
    trust_remote_code=True,
    enforce_eager=True,
    gpu_memory_utilization=0.8, 
    # GLM-4-9B-Chat-1M 如果遇见 OOM 现象，建议开启下述参数
    # enable_chunked_prefill=True,
    # max_num_batched_tokens=8192
)

2024-06-05 07:06:56,841 - modelscope - INFO - PyTorch version 2.3.0 Found.
2024-06-05 07:06:56,847 - modelscope - INFO - TensorFlow version 2.8.4 Found.
2024-06-05 07:06:56,848 - modelscope - INFO - Loading ast index from /home/wangql43/.cache/modelscope/ast_indexer
2024-06-05 07:06:56,932 - modelscope - INFO - Loading done! Current index file version is 1.13.3, with md5 ad9a4e0d356e8d55478b76171d3ca60c and a total number of 972 components indexed
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-05 07:07:02 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='/mntdata/wangql43/A000Files/A003Model/ZhipuAI/glm-4-9b-chat/', speculative_config=None, tokenizer='/mntdata/wangql43/A000Files/A003Model/ZhipuAI/glm-4-9b-chat/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/mntdata/wangql43/A000Files/A003Model/ZhipuAI/glm-4-9b-chat/)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-05 07:07:03 utils.py:660] Found nccl from library /home/wangql43/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 06-05 07:07:03 selector.py:27] Using FlashAttention-2 backend.
INFO 06-05 07:07:50 model_runner.py:175] Loading model weights took 17.5635 GB
INFO 06-05 07:07:51 gpu_executor.py:114] # GPU blocks: 71280, # CPU blocks: 6553


In [2]:
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(
                                temperature=0.45,
                                max_tokens=2048,
                                top_p=0.7,
                                n=1,
                                stop_token_ids=stop_token_ids
)

inputs = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], add_generation_prompt=True)[0]
outputs = llm.generate(prompt_token_ids=[inputs], sampling_params=sampling_params)

generated_text = [output.outputs[0].text for output in outputs]
print(generated_text)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.29it/s]

['\n你好👋！很高兴见到你，有什么可以帮助你的吗？']





In [3]:
def generate_response(tokenizer, llm, sampling_params, prompt): 
    inputs = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], add_generation_prompt=True)[0]
    outputs = llm.generate(prompt_token_ids=[inputs], sampling_params=sampling_params)

    generated_text = [output.outputs[0].text for output in outputs]
    return generated_text

# Lib

In [4]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all'

import glob
import pandas as pd
import json
import re 

import random 
random.seed(0)

## 读取数据

In [5]:
## 读取数据的内容 
def read_train_valid_test(path): 
    data = []
    
    assert path.endswith('.txt')
    # 打开并逐行读取txt文件
    with open(path, 'r') as f:
        for line in f:
            # 使用json.loads将每一行转换为字典
            data.append(json.loads(line))
            
    ## 转换成 df 格式 
    data = pd.DataFrame(data)
    return data 

def read_json_to_df(json_path): 
    # 打开json文件
    with open(json_path, 'r') as file:
        # 解析json文件
        data = json.load(file)
    ## json --> df 
    data = pd.DataFrame(data).T.reset_index(names=['pids'])
    return data

## 清洗数据 
def clean_body_remove_symbol(text): 
    ## clean_body_remove_symbol(text) 
    text = re.sub('<[^<]+?>', ' ', text).replace('\n', '').strip()
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('http://', '').replace('https://', '').replace('.com', '').replace('.cn', '')
    return text 

## data
trpath = 'data/AQA/qa_train.txt'
train = read_train_valid_test(trpath)

valpath = 'data/AQA/qa_valid_wo_ans.txt'
valid = read_train_valid_test(valpath)

testpath = 'data/AQA-test-public/qa_test_wo_ans_new.txt'
test = read_train_valid_test(testpath)

## json  
json_path = 'data/AQA/pid_to_title_abs_new.json'
df_json_old = read_json_to_df(json_path) 

json_path = 'data/AQA-test-public/pid_to_title_abs_update_filter.json'
df_json_new = read_json_to_df(json_path) 

len(set(df_json_new['pids']).difference(set(df_json_old['pids'])))
df_json = pd.merge(df_json_new, df_json_old, how='outer', on=['pids', 'title', 'abstract'])

del df_json_new, df_json_old

##
train.isnull().sum()
valid.isnull().sum()
df_json.isnull().sum()

## train 中的 body 内容给定 
train['body'] = train['body'].apply(clean_body_remove_symbol)
valid['body'] = valid['body'].apply(clean_body_remove_symbol)
test['body'] = test['body'].apply(clean_body_remove_symbol)

## passage 文章清洗 
df_json['title'] = df_json['title'].fillna('None').apply(clean_body_remove_symbol) 
df_json['abstract'] = df_json['abstract'].apply(clean_body_remove_symbol) 

df_json = df_json.reset_index()
df_json = df_json.rename(columns={'index':'id'}) 

70575

question    0
body        0
pids        0
dtype: int64

question    0
body        0
dtype: int64

pids        0
title       3
abstract    0
dtype: int64

## 生成 keywords 

In [6]:
from tqdm import tqdm
from langchain.prompts import PromptTemplate

In [7]:
keywords_template = '''# CONTEXT #
user's question: {question}
context information: {body}

#############

# OBJECTIVE #
You are an outstanding technical expert in the field of research paper/document Q&A, with particular expertise in SCI (Science Citation Index), EI (The Engineering Index), IEEE (The Institute of Electrical and Electronics Engineers), Science and etc.

Extract keywords from the user's question and context information.

Attention: The requirement is to output the keywords only !!!

#############

# STYLE #
Science, Technology, Engineering, Mathematics, 
physics, chemistry, atmospheric science, biology
Natural Language Processing, Machine Learning, Deep Learning
Computer Science, Materials, mechanical engineering, electrical and electronic engineering, telecommunications, instrumentation, systems engineering

#############

# TONE #
Keywords, Professional, Serious, Responsible, Accurate, and Precise.

#############'''

In [8]:
row = test.loc[0] 
keywords_prompt_template = PromptTemplate(input_variables=["question", 'body'], template=keywords_template)  
keywords_formatted_prompt = keywords_prompt_template.format(question=row.question, body=row.body) 

In [9]:
for idx, row in tqdm(test.iterrows(), total=len(test)): 
    ## 没有映射走模型  
    keywords_formatted_prompt = keywords_prompt_template.format(question=row.question, body=row.body) 
    response = generate_response(tokenizer, llm, sampling_params, keywords_formatted_prompt)[0]
    test.loc[idx, 'keywords'] = response.replace('Keywords', '').replace('keywords', '').replace(':', '') 


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s][A
 96%|█████████▋| 2889/3000 [46:43<01:43,  1.07it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s][A
 96%|█████████▋| 2890/3000 [46:43<01:42,  1.08it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s][A
 96%|█████████▋| 2891/3000 [46:44<01:33,  1.17it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s][A
 96%|█████████▋| 2892/3000 [46:45<01:33,  1.15it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.36s/it][A
 96%|█████████▋| 2893/3000 [46:46<01:49,  1.02s/it]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s][A
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it][A
 96%|█████████▋| 2894/3000 [46:

In [10]:
test.to_parquet('outslgb/test_with_aiResponse_withKeywords_by_glm9bVllmPost.parquet', index=False) 