In [None]:
import os
import json
import torch
import numpy as np
import random
from tqdm.notebook import tqdm
from datasets import load_from_disk
from vllm import LLM, SamplingParams
import gc
from collections import Counter
import re
import string
import jieba
from fuzzywuzzy import fuzz
from rouge import Rouge

# Ensure only one GPU is used
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['CUDA_LAUNCH_BLOCKING']= "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

In [2]:
def seed_everything(seed, gpu_id=None):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if gpu_id is not None:
        torch.cuda.manual_seed(seed)
        torch.cuda.set_device(gpu_id)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
def get_pred(gpu_id, data_subset, model_path, out_path):
    seed_everything(42, gpu_id)
    sampling_params = SamplingParams(temperature=0.0, max_tokens=3000)
    llm = LLM(model=model_path, gpu_memory_utilization=0.9, dtype=torch.float16, device=torch.device(f'cuda:{gpu_id}'))  # Set dtype to float16 and specify GPU
    predictions = []
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    for num, json_obj in enumerate(data_subset):
        prompt = json_obj['input'] + ': ' + json_obj['context'] + ' .Выводи только аннотацию'
        print(prompt[:100])
        messages = [{"role": "user", "content": prompt[:8192]}]
        prompt = llm.llm_engine.tokenizer.tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)
        outputs = llm.generate([prompt], sampling_params, use_tqdm=False)
        for output in outputs:
            generated_text = output.outputs[0].text.strip()
            result = {
                "pred_clust": generated_text,
                "answers": json_obj["answers"],
                "all_classes": json_obj["all_classes"],
                "length": json_obj["length"]
            }
            predictions.append(result)
            with open(out_path, "a", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False)
                f.write('\n')
        # Free memory after each generation
        print(f'GENERATION NUM {num} : {generated_text[:100]}')
        del prompt, messages, outputs, generated_text
        torch.cuda.empty_cache()
        gc.collect()
    return predictions

if __name__ == "__main__":
    seed_everything(42)

    model_path = "/home/asperekhodov/LLaMA-Factory/suzume_lora_merged"
    
    dataset = "QasperSumInstructLONGBENCH"
    data = load_from_disk("QasperSUM.hf")
    path = 'pred_clust'
    out_path = f"{path}/{dataset}.jsonl"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    data_all = [data_sample for data_sample in data]

    predictions = get_pred(0, data_all, model_path, out_path)

    evaluate_predictions(path)
    print('All done')


INFO 05-26 17:29:18 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='/home/asperekhodov/LLaMA-Factory/suzume_lora_merged', speculative_config=None, tokenizer='/home/asperekhodov/LLaMA-Factory/suzume_lora_merged', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/home/asperekhodov/LLaMA-Factory/suzume_lora_merged)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-26 17:29:19 utils.py:660] Found nccl from library /home/asperekhodov/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-26 17:29:22 selector.py:69] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 05-26 17:29:22 selector.py:32] Using XFormers backend.
INFO 05-26 17:30:26 model_runner.py:175] Loading model weights took 14.9595 GB
INFO 05-26 17:30:29 gpu_executor.py:114] # GPU blocks: 5643, # CPU blocks: 2048
INFO 05-26 17:30:31 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-26 17:30:31 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-26 17:30:39 model_runner.py:1017] Graph

# SKIP BELOW

In [None]:
# def normalize_answer(s):
#     """Lower text and remove punctuation, articles and extra whitespace."""

#     def remove_articles(text):
#         return re.sub(r"\b(a|an|the)\b", " ", text)

#     def white_space_fix(text):
#         return " ".join(text.split())

#     def remove_punc(text):
#         exclude = set(string.punctuation)
#         return "".join(ch for ch in text if ch not in exclude)

#     def lower(text):
#         return text.lower()

#     return white_space_fix(remove_articles(remove_punc(lower(s))))

# def normalize_zh_answer(s):
#     """Lower text and remove punctuation, extra whitespace."""

#     def white_space_fix(text):
#         return "".join(text.split())

#     def remove_punc(text):
#         cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
#         all_punctuation = set(string.punctuation + cn_punctuation)
#         return "".join(ch for ch in text if ch not in all_punctuation)

#     def lower(text):
#         return text.lower()

#     return white_space_fix(remove_punc(lower(s)))

# def count_score(prediction, ground_truth, **kwargs):
#     numbers = re.findall(r"\d+", prediction)
#     right_num = 0
#     for number in numbers:
#         if str(number) == str(ground_truth):
#             right_num += 1
#     final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
#     return float(final_score)

# def retrieval_score(prediction, ground_truth, **kwargs):
#     pattern = r'Paragraph (\d+)'
#     matches = re.findall(pattern, ground_truth)
#     ground_truth_id = matches[0]
#     numbers = re.findall(r"\d+", prediction)
#     right_num = 0
#     for number in numbers:
#         if str(number) == str(ground_truth_id):
#             right_num += 1
#     final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
#     return float(final_score)

# def retrieval_zh_score(prediction, ground_truth, **kwargs):
#     pattern = r'段落(\d+)'
#     matches = re.findall(pattern, ground_truth)
#     ground_truth_id = matches[0]
#     numbers = re.findall(r"\d+", prediction)
#     right_num = 0
#     for number in numbers:
#         if str(number) == str(ground_truth_id):
#             right_num += 1
#     final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
#     return float(final_score)

# def code_sim_score(prediction, ground_truth, **kwargs):
#     all_lines = prediction.lstrip('\n').split('\n')
#     prediction = ""
#     for line in all_lines:
#         if ('`' not in line) and ('#' not in line) and ('//' not in line):
#             prediction = line
#             break
#     return (fuzz.ratio(prediction, ground_truth) / 100)

# def classification_score(prediction, ground_truth, **kwargs):
#     em_match_list = []
#     all_classes = kwargs["all_classes"]
#     for class_name in all_classes:
#         if class_name in prediction:
#             em_match_list.append(class_name)
#     for match_term in em_match_list:
#         if match_term in ground_truth and match_term != ground_truth:
#             em_match_list.remove(match_term)
#     if ground_truth in em_match_list:
#         score = (1.0 / len(em_match_list))
#     else:
#         score = 0.0
#     return score
    
# def rouge_score(prediction, ground_truth, **kwargs):
#     rouge = Rouge()
#     try:
#         scores = rouge.get_scores([prediction], [ground_truth], avg=True)
#     except:
#         return 0.0
#     return scores["rouge-l"]["f"]

# def rouge_zh_score(prediction, ground_truth, **kwargs):
#     prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
#     ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False))) 
#     score = rouge_score(prediction, ground_truth)
#     return score

# def f1_score(prediction, ground_truth, **kwargs):
#     common = Counter(prediction) & Counter(ground_truth)
#     num_same = sum(common.values())
#     if num_same == 0:
#         return 0
#     precision = 1.0 * num_same / len(prediction)
#     recall = 1.0 * num_same / len(ground_truth)
#     f1 = (2 * precision * recall) / (precision + recall)
#     return f1

# def qa_f1_score(prediction, ground_truth, **kwargs):
#     normalized_prediction = normalize_answer(prediction)
#     normalized_ground_truth = normalize_answer(ground_truth)

#     prediction_tokens = normalized_prediction.split()
#     ground_truth_tokens = normalized_ground_truth.split()
#     return f1_score(prediction_tokens, ground_truth_tokens)

# def qa_f1_zh_score(prediction, ground_truth, **kwargs):
#     prediction_tokens = list(jieba.cut(prediction, cut_all=False))
#     ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
#     prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
#     ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
#     prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
#     ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
#     return f1_score(prediction_tokens, ground_truth_tokens)

# dataset2metric = {
#     "qasper": qa_f1_score,
#     "QasperSumInstructLONGBENCH": rouge_score
# }

# def scorer_e(dataset, predictions, answers, lengths, all_classes):
#     scores = {"0-4k": [], "4-8k": [], "8k+": []}
#     for (prediction, ground_truths, length) in zip(predictions, answers, lengths):
#         score = 0.
#         if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
#             prediction = prediction.lstrip('\n').split('\n')[0]
#         for ground_truth in ground_truths:
#             score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
#         if length < 4000:
#             scores["0-4k"].append(score)
#         elif length < 8000:
#             scores["4-8k"].append(score)
#         else:
#             scores["8k+"].append(score)
#     for key in scores.keys():
#         scores[key] = round(100 * np.mean(scores[key]), 2)
#     return scores

# def scorer(dataset, predictions, answers, all_classes):
#     total_score = 0.
#     for (prediction, ground_truths) in zip(predictions, answers):
#         score = 0.
#         if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
#             prediction = prediction.lstrip('\n').split('\n')[0]
#         for ground_truth in ground_truths:
#             score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
#         total_score += score
#     return round(100 * total_score / len(predictions), 2)

# def evaluate_predictions(path, e_flag=False):
#     scores = dict()
#     all_files = [file for file in os.listdir(path) if file.endswith("jsonl") is True]
#     print("Evaluating on:", all_files)
#     for filename in all_files:
#         predictions, answers, lengths = [], [], []
#         dataset = filename.split('.')[0]
#         with open(f"{path}/{filename}", "r", encoding="utf-8") as f:
#             for line in f:
#                 data = json.loads(line)
#                 predictions.append(data["pred_clust"])
#                 answers.append(data["answers"])
#                 all_classes = data["all_classes"]
#                 if "length" in data:
#                     lengths.append(data["length"])
#         if e_flag:
#             score = scorer_e(dataset, predictions, answers, lengths, all_classes)
#         else:
#             score = scorer(dataset, predictions, answers, all_classes)
#             print(score)
#         scores[dataset] = score
#     result_path = f"{path}/result.json"
#     with open(result_path, "w") as f:
#         json.dump(scores, f, ensure_ascii=False, indent=4)
#     print('Evaluation done. Results saved to', result_path)


In [3]:
dataset2metric = {
    "qasper": qa_f1_score,
    "QasperSumInstructLONGBENCH": rouge_score
}
def evaluate_predictions(path, e_flag=False):
    scores = dict()
    all_files = [file for file in os.listdir(path) if file.endswith("jsonl") is True]
    print("Evaluating on:", all_files)
    for filename in all_files:
        predictions, answers, lengths = [], [], []
        dataset = filename.split('.')[0]
        with open(f"{path}/{filename}", "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                predictions.append(data["pred_clust"])
                answers.append(data["answers"])
                all_classes = data["all_classes"]
                if "length" in data:
                    lengths.append(data["length"])
        if e_flag:
            score = scorer_e(dataset, predictions, answers, lengths, all_classes)
        else:
            score = scorer(dataset, predictions, answers, all_classes)
            print(score)
        scores[dataset] = score
    result_path = f"{path}/result.json"
    with open(result_path, "w") as f:
        json.dump(scores, f, ensure_ascii=False, indent=4)
    print('Evaluation done. Results saved to', result_path)

evaluate_predictions(path)

Evaluating on: ['QasperSumInstructLONGBENCH.jsonl']
0.16
Evaluation done. Results saved to pred_clust/result.json


In [2]:
def seed_everything(seed, gpu_id=None):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if gpu_id is not None:
        torch.cuda.manual_seed(seed)
        torch.cuda.set_device(gpu_id)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def get_pred(gpu_id, data_subset, model_path, out_path):
    seed_everything(42, gpu_id)
    sampling_params = SamplingParams(temperature=0.0, max_tokens=3000)
    llm = LLM(model=model_path, dtype=torch.float16, device=torch.device(f'cuda:{gpu_id}'), gpu_memory_utilization=0.9)  # Set dtype to float16 and specify GPU
    predictions = []
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    for num, json_obj in enumerate(data_subset):
        prompt = json_obj['input'] + ': ' + json_obj['context'] + ' . Выводи только ответ и отдельно доказательство.'
        print(prompt[:100])
        messages = [{"role": "user", "content": prompt[:8192]}]
        prompt = llm.llm_engine.tokenizer.tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)
        outputs = llm.generate([prompt], sampling_params, use_tqdm=False)
        for output in outputs:
            generated_text = output.outputs[0].text.strip()
            result = {
                "pred_clust": generated_text,
                "answers": json_obj["answers"],
                "all_classes": json_obj["all_classes"],
                "length": json_obj["length"]
            }
            predictions.append(result)
            with open(out_path, "a", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False)
                f.write('\n')
        # Free memory after each generation
        print(f'GENERATION NUM {num} : {generated_text[:100]}')
        del prompt, messages, outputs, generated_text
        torch.cuda.empty_cache()
        gc.collect()
    return predictions

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def normalize_zh_answer(s):
    """Lower text and remove punctuation, extra whitespace."""

    def white_space_fix(text):
        return "".join(text.split())

    def remove_punc(text):
        cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
        all_punctuation = set(string.punctuation + cn_punctuation)
        return "".join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(s)))

def count_score(prediction, ground_truth, **kwargs):
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def retrieval_score(prediction, ground_truth, **kwargs):
    pattern = r'Paragraph (\d+)'
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def retrieval_zh_score(prediction, ground_truth, **kwargs):
    pattern = r'段落(\d+)'
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def code_sim_score(prediction, ground_truth, **kwargs):
    all_lines = prediction.lstrip('\n').split('\n')
    prediction = ""
    for line in all_lines:
        if ('`' not in line) and ('#' not in line) and ('//' not in line):
            prediction = line
            break
    return (fuzz.ratio(prediction, ground_truth) / 100)

def classification_score(prediction, ground_truth, **kwargs):
    em_match_list = []
    all_classes = kwargs["all_classes"]
    for class_name in all_classes:
        if class_name in prediction:
            em_match_list.append(class_name)
    for match_term in em_match_list:
        if match_term in ground_truth and match_term != ground_truth:
            em_match_list.remove(match_term)
    if ground_truth in em_match_list:
        score = (1.0 / len(em_match_list))
    else:
        score = 0.0
    return score
    
def rouge_score(prediction, ground_truth, **kwargs):
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    except:
        return 0.0
    return scores["rouge-l"]["f"]

def rouge_zh_score(prediction, ground_truth, **kwargs):
    prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
    ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False))) 
    score = rouge_score(prediction, ground_truth)
    return score

def f1_score(prediction, ground_truth, **kwargs):
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def qa_f1_score(prediction, ground_truth, **kwargs):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    return f1_score(prediction_tokens, ground_truth_tokens)

def qa_f1_zh_score(prediction, ground_truth, **kwargs):
    prediction_tokens = list(jieba.cut(prediction, cut_all=False))
    ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
    prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
    ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
    prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
    ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
    return f1_score(prediction_tokens, ground_truth_tokens)

dataset2metric = {
    "qasper": qa_f1_score,
    "QasperQAInstructLONGBENCH": qa_f1_score
}

def scorer_e(dataset, predictions, answers, lengths, all_classes):
    scores = {"0-4k": [], "4-8k": [], "8k+": []}
    for (prediction, ground_truths, length) in zip(predictions, answers, lengths):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        if length < 4000:
            scores["0-4k"].append(score)
        elif length < 8000:
            scores["4-8k"].append(score)
        else:
            scores["8k+"].append(score)
    for key in scores.keys():
        scores[key] = round(100 * np.mean(scores[key]), 2)
    return scores

def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)

def evaluate_predictions(path, e_flag=False):
    scores = dict()
    all_files = [file for file in os.listdir(path) if file.endswith("jsonl") is True]
    print("Evaluating on:", all_files)
    for filename in all_files:
        predictions, answers, lengths = [], [], []
        dataset = filename.split('.')[0]
        with open(f"{path}/{filename}", "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                predictions.append(data["pred_clust"])
                answers.append(data["answers"])
                all_classes = data["all_classes"]
                if "length" in data:
                    lengths.append(data["length"])
        if e_flag:
            score = scorer_e(dataset, predictions, answers, lengths, all_classes)
        else:
            score = scorer(dataset, predictions, answers, all_classes)
            print(score)
        scores[dataset] = score
    result_path = f"{path}/result.json"
    with open(result_path, "w") as f:
        json.dump(scores, f, ensure_ascii=False, indent=4)
    print('Evaluation done. Results saved to', result_path)

if __name__ == "__main__":
    seed_everything(42)

    model_path = "/home/asperekhodov/LLaMA-Factory/suzume_lora_merged"

    dataset = "QasperQAInstructLONGBENCH"
    data = load_from_disk("QasperQA.hf")
    path = 'pred_clust'
    out_path = f"{path}/{dataset}.jsonl"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    data_all = [data_sample for data_sample in data]

    predictions = get_pred(0, data_all, model_path, out_path)

    evaluate_predictions(path)
    print('All done')


INFO 05-26 09:23:37 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='/home/asperekhodov/.cache/huggingface/hub/models--lightblue--suzume-llama-3-8B-multilingual/snapshots/a91a26333d22b2382025e4e1f5a7869142a683c3', speculative_config=None, tokenizer='/home/asperekhodov/.cache/huggingface/hub/models--lightblue--suzume-llama-3-8B-multilingual/snapshots/a91a26333d22b2382025e4e1f5a7869142a683c3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/home/asperekhodov/.cache/huggingface/hub/models--lightblue--suzume-llama-3-8B-multilingual/snapshots/a91a26333d22b

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-26 09:23:38 utils.py:660] Found nccl from library /home/asperekhodov/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-26 09:24:02 selector.py:69] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 05-26 09:24:02 selector.py:32] Using XFormers backend.
INFO 05-26 09:26:35 model_runner.py:175] Loading model weights took 14.9595 GB
INFO 05-26 09:26:41 gpu_executor.py:114] # GPU blocks: 5643, # CPU blocks: 2048
INFO 05-26 09:26:43 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-26 09:26:43 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-26 09:26:51 model_runner.py:1017] Graph

In [3]:
lol

NameError: name 'lol' is not defined