In [1]:
import json
import re

def extract_json_from_string(input_string):
    """
    Extract JSON-like content from a string, handling extra or mismatched braces, 
    and parse it into a Python dictionary.

    Args:
    input_string (str): The string containing JSON-like content.

    Returns:
    dict: Parsed JSON content as a dictionary, or an empty dictionary if parsing fails.
    """
    try:
        # Use regex to find JSON-like content
        json_pattern = re.compile(r'{.*?}', re.DOTALL)
        match = json_pattern.search(input_string)

        if match:
            # Handle potential double braces
            json_content = match.group()
            json_content = json_content.replace('{{', '{').replace('}}', '}')
            
            # Try parsing the adjusted JSON content
            return json.loads(json_content)
    except json.JSONDecodeError:
        pass

    # Return empty dictionary if no valid JSON is found
    return {}

# Example usage
input_string_1 = """
Some random text
{
    "issue_type": "other"
}
More text here
"""

input_string_2 = """
Some text with extra braces {{
    "issue_type": "error"
}}
End of text
"""

parsed_json_1 = extract_json_from_string(input_string_1)
parsed_json_2 = extract_json_from_string(input_string_2)

parsed_json_1, parsed_json_2

({'issue_type': 'other'}, {'issue_type': 'error'})

In [2]:
# BASE_MODEL_DIR = "/data/luomingkai/issue/models/Qwen/Qwen2.5-7B-Instruct"
BASE_MODEL_DIR = "/root/autodl-tmp/Qwen2.5-14B-Instruct"

In [3]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_DIR)

# Pass the default decoding hyperparameters of Qwen2.5-7B-Instruct
# max_tokens is for the maximum length for generation.
sampling_params = SamplingParams(temperature=0.5, top_p=1.0, repetition_penalty=1.05, max_tokens=512)

# Input the model name or path. Can be GPTQ or AWQ models.
model = LLM(model=BASE_MODEL_DIR, tensor_parallel_size=2)

# Prepare your prompts
prompt = "Tell me something about large language models."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# generate outputs
outputs = model.generate([text], sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


INFO 12-27 23:18:12 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 12-27 23:18:12 config.py:1020] Defaulting to use mp for distributed inference
INFO 12-27 23:18:12 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/root/autodl-tmp/Qwen2.5-14B-Instruct', speculative_config=None, tokenizer='/root/autodl-tmp/Qwen2.5-14B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_f

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=1242)[0;0m INFO 12-27 23:18:41 model_runner.py:1077] Loading model weights took 13.9280 GB
INFO 12-27 23:18:42 model_runner.py:1077] Loading model weights took 13.9280 GB
[1;36m(VllmWorkerProcess pid=1242)[0;0m INFO 12-27 23:18:51 worker.py:232] Memory profiling results: total_gpu_memory=31.60GiB initial_memory_usage=14.34GiB peak_torch_memory=16.45GiB memory_usage_post_profile=14.38GiB non_torch_memory=0.45GiB kv_cache_size=11.54GiB gpu_memory_utilization=0.90
INFO 12-27 23:18:51 worker.py:232] Memory profiling results: total_gpu_memory=31.60GiB initial_memory_usage=14.34GiB peak_torch_memory=16.45GiB memory_usage_post_profile=14.38GiB non_torch_memory=0.45GiB kv_cache_size=11.54GiB gpu_memory_utilization=0.90
INFO 12-27 23:18:51 distributed_gpu_executor.py:57] # GPU blocks: 7878, # CPU blocks: 2730
INFO 12-27 23:18:51 distributed_gpu_executor.py:61] Maximum concurrency for 32768 tokens per request: 3.85x
INFO 12-27 23:18:54 model_runner.py:1400] Captu

Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.12s/it, est. speed input: 7.23 toks/s, output: 40.45 toks/s]

Prompt: '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nTell me something about large language models.<|im_end|>\n<|im_start|>assistant\n', Generated text: "Large language models, like myself, are sophisticated artificial intelligence systems designed to understand and generate human-like text based on the data they've been trained on. These models are typically characterized by their enormous size, often containing billions of parameters, which are the settings that the model adjusts as it learns from the training data.\n\nOne of the key features of large language models is their ability to capture complex patterns in natural language, allowing them to perform a wide range of tasks such as answering questions, generating stories, translating languages, summarizing documents, and even engaging in dialogue. This versatility arises from the vast amount of textual data used during training, which helps the model learn 




In [4]:
def get_qwen_output(
    model,
    tokenizer,
    messages_list,
    max_input_length=4096,
    max_tokens=512,
):
    text_list = []
    for messages in messages_list:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        text_list.append(text)
        
    # sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)
    # sampling_params = SamplingParams(temperature=0.5, top_p=1.0, repetition_penalty=1.05, max_tokens=512)
    sampling_params = SamplingParams(temperature=0.3, top_p=1.0, repetition_penalty=1.05, max_tokens=max_tokens)
    

    outputs = model.generate(text_list, sampling_params)
    
    # Print the outputs.
    responses = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        # print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        responses.append(generated_text)

    return responses

messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": "Give me a short introduction to large language model."}
]
get_qwen_output(model, tokenizer, [messages])

Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.33s/it, est. speed input: 11.71 toks/s, output: 40.82 toks/s]


["A large language model, like the one you're interacting with now, is an artificial intelligence system designed to process and generate human-like text based on the input it receives. These models are trained on vast amounts of textual data from the internet and other sources, allowing them to understand and generate text in various styles and contexts. They can be used for a wide range of applications, including but not limited to, answering questions, generating stories, translating languages, writing code, and providing information on a variety of topics. The key to their effectiveness lies in their ability to learn patterns and structures from the data they are trained on, enabling them to produce coherent and contextually relevant responses."]

In [5]:
# data_path = "/home/luomingkai/workspace/issue_llm/issue_classify/issue_with_comments_framework/matched_results_test.json"
data_path = "/root/issue_classify/issue_with_comments_framework/matched_results_test_modify_other_update.json"
with open(data_path, encoding="utf-8") as fp:
    issue_data = json.load(fp)
    for idx, data in enumerate(issue_data):
        # print(issue_data)
        uesr, title, body, label, author_association = data["user"]["login"], data["title"], data["body"], data["tag_labels"], data["author_association"]
        comment_list = data["comments_list"]
        if len(comment_list) > 0:
            comment_list = [(com["user"]["login"], com["author_association"], com["body"]) for com in comment_list]

        print(f"uesr: {uesr}")
        print(f"Title: {title}")
        print(f"Body: {body}")
        print(f"label: {label}")
        print(f"author_association: {author_association}")
        print(f"comment_list: {comment_list}")
        
        
        # print()
        if idx > 2:
            break
        # print(title, description, label)


uesr: RainBoltz
Title: No output displayed or it gets stuck - OpenCV issue
Body: my terminal isn't responding after executed the command below: `./build/examples/openpose/rtpose.bin --imagelevel 1 --netcaffeopenpose.sh` , which was provided by official
label: deployment
author_association: NONE
comment_list: [('bigmoumou', 'NONE', 'I have the same problem too\r\n\r\nIs there any solution ?\r\n\r\n![help](https://cloud.githubusercontent.com/assets/16252975/25697753/cddcfe9e-30ee-11e7-82a2-2af97eb8460b.png)\r\n\r\n'), ('Shawnroom', 'NONE', 'I have the same issue, and wonder if there is any solution.'), ('gineshidalgo99', 'MEMBER', 'Sorry to hear that, we are working on fixing that error. We think it is due to OpenCV compiled with Qt or different visualization support.\r\n\r\nMeanwhile, you can make it work by:\r\n1. Completely uninstalling your current OpenCV version.\r\n2. Installing the default OpenCV from the Ubuntu repository: `apt-get install libopencv-dev`, or alternatively compili

In [6]:
global_issue_prompt = r"""
### **Role**  
You are an expert in GitHub repository analysis. Your task is to classify a given GitHub Issue into one of the following categories: **error**, **performance**, **deployment**, **question**, or **other**.

Analyze the conversation context thoroughly and determine the correct classification. If the issue is unrelated to the repository’s functionality or purpose, it must be categorized as **other**.

### **Issue Categories**  

- **error**:  
  Problems directly caused by the repository’s code, configuration, or inherent incompatibilities within the repository. For example:
  - Runtime errors.
  - Exceptions.
  - Failures in repository code execution.  

- **performance**:  
  Issues where the repository’s code or configuration leads to:
  - Slow execution times.
  - Resource bottlenecks (e.g., CPU, GPU, memory, or storage).
  - Inefficient resource usage (e.g., excessive memory or storage consumption).

- **deployment**:  
  Issues arising during the installation or deployment process that are specifically caused by:
  - Defects in the repository code.
  - Incomplete or inadequate documentation.
  - Configuration problems stemming from the repository.  

  *Note*: If the issue is caused by user errors (e.g., outdated dependencies, incorrect tools) or hardware/environmental limitations, it should be categorized as **question**.

- **question**:  
  Issues originating from:
  - Misunderstandings or failure to follow documentation.
  - Incorrect usage of the repository’s features.
  - Local environment misconfigurations not caused by the repository code or documentation.  

  *Note*: This category also includes:
  - User questions about usage scenarios.
  - Discussions seeking clarification or additional guidance.

- **other**:  
  Issues outside the predefined categories, including:
  1. Topics unrelated to the repository’s functionality.
  2. Feature requests or discussions beyond the repository’s purpose.
  3. Suggestions for improving documentation, usability, or community processes.


### **Process**  
**The process must be strictly executed, and the output must adhere to the defined JSON format.**

1. **Analyze the Conversation**:  
   Review the entire conversation context and evaluate the issue based on the evidence provided.

2. **Determine the Final Classification**:  
   Assign the issue to one of the five categories:
   - **error**
   - **performance**
   - **deployment**
   - **question**
   - **other**

3. **Output**:  
   Generate a single JSON object as the result, formatted as follows:  
   ```json
   {"issue_type": "<error|performance|deployment|question|other>"}
   ```

### **Examples**  

#### Example 1: Error
*Conversation:*  
- "Running model.fit() raises a KeyError related to missing labels in the dataset."  
- "We’ll patch data_loader.py to handle missing labels."  
- "The fix is merged. Let us know if it resolves the problem."

*Final Output:*  
```json
{"issue_type": "error"}
```

#### Example 2: Question
*Conversation:*  
- "I tried running the code, but the output looks weird. Is this a bug?"  
- "Have you verified if the input data matches the format described in the README?"  
- "I missed the formatting instructions. After fixing the input, it works fine."

*Final Output:*  
```json
{"issue_type": "question"}
```

#### Example 3: Performance
*Conversation:*  
- "The code runs much slower than expected for large datasets. Is there a way to optimize?"  
- "We could optimize the data processing step using multi-threading."

*Final Output:*  
```json
{"issue_type": "performance"}
```  

#### Example 4: Deployment
*Conversation:*  
- "The installation instructions don't mention the CUDA version required. The code fails on my setup."  
- "We’ll update the documentation to specify the supported CUDA versions."

*Final Output:*  
```json
{"issue_type": "deployment"}
```  

#### Example 5: Other
*Conversation:*  
- "It would be great if this repository supported visualization tools for monitoring model training."  
- "Thanks for the suggestion. We’ll consider this for future updates."

*Final Output:*  
```json
{"issue_type": "other"}
```  
"""

# 仅使用第一条issue的模版
question_prompt = """
````
*Conversation*:
- {}: ### Title: "{}" ### Body: "{}"
````
"""

# 使用comment的版本
question_comment_prompt = """
*Conversation*:
- "{} {}"
"""

comment_prompt = """
- "{}"
"""

ROLE_MAP_BEGIN = {
    "NONE": "ISSUE RAISER",
    "MEMBER": "MEMBER",
    "COLLABORATOR": "COLLABORATOR",
    "CONTRIBUTOR": "CONTRIBUTOR",
    "OWNER": "OWNER"
}

ROLE_MAP_COMMENT = {
    "NONE": "Commenter",
    "MEMBER": "MEMBER",
    "COLLABORATOR": "COLLABORATOR",
    "CONTRIBUTOR": "CONTRIBUTOR",
    "OWNER": "OWNER"
}

In [7]:
REPO_DESC = {
    "CMU-Perceptual-Computing-Lab/openpose": {
        "description": "OpenPose is a real-time multi-person keypoint detection library developed by the Carnegie Mellon Perceptual Computing Lab. It estimates human body, face, hands, and foot keypoints from single images, providing 2D real-time multi-person keypoint detection, including 15, 18, or 25-keypoint body/foot keypoint estimation, 2x21-keypoint hand keypoint estimation, and 70-keypoint face keypoint estimation. Additionally, it offers 3D real-time single-person keypoint detection and a calibration toolbox. OpenPose is compatible with various operating systems, including Ubuntu, Windows, and macOS, and supports CUDA (Nvidia GPU), OpenCL (AMD GPU), and CPU-only versions.",
        "url": "https://github.com/CMU-Perceptual-Computing-Lab/openpose"
    },
    "CorentinJ/Real-Time-Voice-Cloning": {
        "description": "Real-Time Voice Cloning is a Python-based tool that enables the cloning of voices in real-time. It utilizes deep learning models to synthesize speech that mimics a target voice, requiring only a few seconds of audio from the desired speaker. The repository provides code and instructions to train and use the voice cloning system.",
        "url": "https://github.com/CorentinJ/Real-Time-Voice-Cloning"
    },
    "JaidedAI/EasyOCR": {
        "description": "EasyOCR is an open-source Optical Character Recognition (OCR) library that supports over 80 languages. It is designed to be easy to use and provides accurate text recognition from images. The library is built on PyTorch and offers pre-trained models for various languages and scripts.",
        "url": "https://github.com/JaidedAI/EasyOCR"
    },
    "deepfakes/faceswap": {
        "description": "Faceswap is a deep learning-based tool for face-swapping in images and videos. It allows users to train models to swap faces between different subjects, providing a platform for experimenting with deepfake technology. The repository includes code for training and using the face-swapping models.",
        "url": "https://github.com/deepfakes/faceswap"
    },
    "deezer/spleeter": {
        "description": "Spleeter is an open-source tool developed by Deezer for source separation in music tracks. It uses deep learning models to separate audio into stems, such as vocals and accompaniment, enabling applications like karaoke and remixing. The repository provides pre-trained models and code for audio source separation.",
        "url": "https://github.com/deezer/spleeter"
    },
    "dusty-nv/jetson-inference": {
        "description": "Jetson Inference is a collection of deep learning inference samples and models for NVIDIA Jetson devices. It includes code for image classification, object detection, and segmentation, optimized for Jetson hardware. The repository provides pre-trained models and examples to demonstrate the capabilities of Jetson devices in AI applications.",
        "url": "https://github.com/dusty-nv/jetson-inference"
    },
    "iperov/DeepFaceLab": {
        "description": "DeepFaceLab is a deep learning tool for creating deepfakes, focusing on face-swapping in videos. It provides a comprehensive set of tools for training and applying deep learning models to perform face-swapping tasks. The repository includes code for data preparation, model training, and face-swapping applications.",
        "url": "https://github.com/iperov/DeepFaceLab"
    },
    "junyanz/pytorch-CycleGAN-and-pix2pix": {
        "description": "This repository provides PyTorch implementations of CycleGAN and pix2pix, two popular models for image-to-image translation tasks. CycleGAN enables image translation without paired examples, while pix2pix requires paired images for training. The repository includes code and pre-trained models for various image translation tasks.",
        "url": "https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix"
    },
    "mozilla/TTS": {
        "description": "Mozilla TTS is an open-source Text-to-Speech (TTS) engine that aims to make speech synthesis more accessible. It provides implementations of state-of-the-art TTS models, including Tacotron and FastSpeech, and supports training on custom datasets. The repository includes code for training and using TTS models.",
        "url": "https://github.com/mozilla/TTS"
    },
    "streamlit/streamlit": {
        "description": "Streamlit is an open-source app framework for Machine Learning and Data Science projects. It allows users to create interactive web applications for data analysis and visualization with minimal code. The repository provides the core framework and examples for building Streamlit applications.",
        "url": "https://github.com/streamlit/streamlit"
    },
    "microsoft/recommenders": {
        "description": "Recommenders is a project under the Linux Foundation of AI and Data. This repository contains examples and best practices for building recommendation systems, provided as Jupyter notebooks. The examples detail learnings on five key tasks: preparing and loading data for each recommendation algorithm, building models using various classical and deep learning recommendation algorithms such as Alternating Least Squares (ALS) or eXtreme Deep Factorization Machines (xDeepFM), evaluating algorithms with offline metrics, tuning and optimizing hyperparameters for recommendation models, and operationalizing models in a production environment on Azure. Several utilities are provided to support common tasks such as loading datasets in the format expected by different algorithms, evaluating model outputs, and splitting training/test data. Implementations of several state-of-the-art algorithms are included for self-study and customization in your own applications.",
        "url": "https://github.com/recommenders-team/recommenders"
    }
}

In [8]:
import pandas as pd
from sklearn.metrics import classification_report

TIMES = 10
experiments = []
# 保存所有实验结果的列表
all_reports = []

data_path = "/root/issue_classify/issue_with_comments_framework/matched_results_test_modify_other_update.json"
result_path = "/root/autodl-fs/log/qwen2.5_14b_prompt.xlsx"

for t in range(TIMES):
    origin_labels = []
    pred_labels = []
    
    join_index = []
    with open(data_path, encoding="utf-8") as fp:
        issue_data = json.load(fp)
        taged_data = issue_data
        all_messages = []
        for idx, data in enumerate(issue_data):
            raise_user, title, body, label, author_association = data["user"]["login"], data["title"], data["body"], data["tag_labels"], data["author_association"]
    
            url = data["html_url"]
            match = re.search(r"github\.com/([^/]+/[^/]+)", url)
            repository = match.group(1)
            repo_desc = REPO_DESC[repository]["description"]
            
            join_index.append(idx)
            
            comment_list = data["comments_list"]
            if len(comment_list) > 0:
                comment_list = [(com["user"]["login"], com["author_association"], com["body"]) for com in comment_list]
    
            messages = [
                {
                    "role": "system",
                    "content": global_issue_prompt
                },
                {
                    "role": "user",
                    "content": question_comment_prompt.format(
                        title, 
                        body
                        )
                }
            ]
            
            origin_labels.append(label)
    
            for user, author_association, body in comment_list:
                messages.append(
                    {
                        "role": "user",
                        "content": comment_prompt.format(
                            body
                            )
                    }
                )
                    
            all_messages.append(messages)
        
        responses = get_qwen_output(model, tokenizer, all_messages, max_tokens=4096)
        for idx, response in enumerate(responses):
            label = origin_labels[idx]
            cls_result = extract_json_from_string(response)
            
            # print(f"label: {label}")
            # print(f"response: {response}")
            # print(f"cls_result: {cls_result}")
    
            if cls_result.get("issue_type") and isinstance(cls_result.get("issue_type"), str):
                cls_result = cls_result.get("issue_type")
                pred_labels.append(cls_result)
            else:
                pred_labels.append("other")


    origin_labels = [x.lower() for x in origin_labels]
    pred_labels = [x.lower() for x in pred_labels]
    tmp_pred_labels = []
    for x in pred_labels:
        if x not in ["error", "performance", "deployment", "question", "other"]:
            x = 'other'
        tmp_pred_labels.append(x)
    pred_labels = tmp_pred_labels
    
    from sklearn.metrics import classification_report
    
    labels = ["error", "performance", "deployment", "question", "other"]
    label_map = {
        "error": 0, 
        "performance": 1,
        "deployment": 2,
        "question": 3,
        "other": 4
    }
    final_origin_labels_num = [label_map[l] for l in origin_labels]
    final_pred_labels_num = [label_map[l] for l in pred_labels]
    
    report = classification_report(final_origin_labels_num, final_pred_labels_num, target_names=labels)
    print(report)
    report_dict = classification_report(final_origin_labels_num, final_pred_labels_num, target_names=labels, output_dict=True)
    # 将字典转为 DataFrame
    df_report = pd.DataFrame(report_dict).transpose()
    df_report["experiment_id"] = t + 1  # 添加实验编号
    df_report.index.name = "category"
    df_report.reset_index(inplace=True)
    
    all_reports.append(df_report)
    all_reports.append(pd.DataFrame({"category": [""]}))  # 添加空行

final_df = pd.concat(all_reports, ignore_index=True)
final_df.to_excel(result_path, index=False)

Processed prompts:   0%|          | 1/1933 [00:33<18:02:10, 33.61s/it, est. speed input: 31.06 toks/s, output: 0.36 toks/s]



Processed prompts:  47%|████▋     | 899/1933 [08:06<09:14,  1.87it/s, est. speed input: 2739.34 toks/s, output: 141.39 toks/s]



Processed prompts:  74%|███████▎  | 1421/1933 [12:21<03:06,  2.74it/s, est. speed input: 2896.39 toks/s, output: 143.76 toks/s]



Processed prompts:  80%|████████  | 1553/1933 [13:20<03:09,  2.01it/s, est. speed input: 2966.67 toks/s, output: 145.60 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:54<00:00,  2.02it/s, est. speed input: 3119.36 toks/s, output: 155.12 toks/s]


              precision    recall  f1-score   support

       error       0.48      0.89      0.62       251
 performance       0.35      0.67      0.46        60
  deployment       0.35      0.73      0.48       165
    question       0.78      0.59      0.67       921
       other       0.91      0.51      0.65       536

    accuracy                           0.62      1933
   macro avg       0.57      0.68      0.58      1933
weighted avg       0.72      0.62      0.64      1933



Processed prompts:  46%|████▋     | 898/1933 [08:01<10:27,  1.65it/s, est. speed input: 2764.02 toks/s, output: 142.74 toks/s]  



Processed prompts:  73%|███████▎  | 1417/1933 [12:15<02:58,  2.89it/s, est. speed input: 2914.66 toks/s, output: 145.42 toks/s]



Processed prompts:  80%|████████  | 1554/1933 [13:15<02:45,  2.29it/s, est. speed input: 2984.30 toks/s, output: 147.52 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:48<00:00,  2.04it/s, est. speed input: 3140.19 toks/s, output: 156.96 toks/s]


              precision    recall  f1-score   support

       error       0.47      0.89      0.62       251
 performance       0.33      0.67      0.44        60
  deployment       0.37      0.78      0.50       165
    question       0.78      0.59      0.67       921
       other       0.92      0.50      0.65       536

    accuracy                           0.62      1933
   macro avg       0.57      0.69      0.58      1933
weighted avg       0.73      0.62      0.64      1933



Processed prompts:  47%|████▋     | 899/1933 [08:00<08:34,  2.01it/s, est. speed input: 2773.28 toks/s, output: 145.19 toks/s]



Processed prompts:  73%|███████▎  | 1416/1933 [12:14<03:16,  2.63it/s, est. speed input: 2917.53 toks/s, output: 145.75 toks/s]



Processed prompts:  81%|████████  | 1558/1933 [13:15<02:38,  2.36it/s, est. speed input: 2989.20 toks/s, output: 147.88 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:48<00:00,  2.04it/s, est. speed input: 3140.52 toks/s, output: 156.18 toks/s]


              precision    recall  f1-score   support

       error       0.47      0.89      0.62       251
 performance       0.33      0.65      0.44        60
  deployment       0.37      0.76      0.50       165
    question       0.78      0.59      0.67       921
       other       0.92      0.52      0.67       536

    accuracy                           0.63      1933
   macro avg       0.57      0.68      0.58      1933
weighted avg       0.73      0.63      0.64      1933



Processed prompts:  46%|████▋     | 898/1933 [08:00<08:43,  1.98it/s, est. speed input: 2774.45 toks/s, output: 142.98 toks/s]



Processed prompts:  73%|███████▎  | 1419/1933 [12:16<02:59,  2.87it/s, est. speed input: 2917.42 toks/s, output: 144.57 toks/s]



Processed prompts:  80%|████████  | 1555/1933 [13:15<03:27,  1.82it/s, est. speed input: 2985.10 toks/s, output: 146.70 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:47<00:00,  2.04it/s, est. speed input: 3143.66 toks/s, output: 155.23 toks/s]


              precision    recall  f1-score   support

       error       0.48      0.88      0.62       251
 performance       0.34      0.70      0.46        60
  deployment       0.36      0.78      0.50       165
    question       0.78      0.59      0.67       921
       other       0.92      0.51      0.66       536

    accuracy                           0.63      1933
   macro avg       0.58      0.69      0.58      1933
weighted avg       0.73      0.63      0.64      1933



Processed prompts:  47%|████▋     | 901/1933 [08:05<07:36,  2.26it/s, est. speed input: 2748.32 toks/s, output: 141.50 toks/s]



Processed prompts:  73%|███████▎  | 1419/1933 [12:21<03:32,  2.41it/s, est. speed input: 2898.93 toks/s, output: 143.74 toks/s]



Processed prompts:  80%|████████  | 1556/1933 [13:20<02:30,  2.51it/s, est. speed input: 2969.91 toks/s, output: 146.04 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:54<00:00,  2.03it/s, est. speed input: 3120.87 toks/s, output: 155.08 toks/s]


              precision    recall  f1-score   support

       error       0.47      0.88      0.61       251
 performance       0.34      0.68      0.46        60
  deployment       0.34      0.73      0.47       165
    question       0.77      0.58      0.66       921
       other       0.91      0.50      0.65       536

    accuracy                           0.61      1933
   macro avg       0.57      0.68      0.57      1933
weighted avg       0.72      0.61      0.63      1933



Processed prompts:  47%|████▋     | 900/1933 [08:01<08:01,  2.15it/s, est. speed input: 2768.64 toks/s, output: 143.12 toks/s]



Processed prompts:  73%|███████▎  | 1419/1933 [12:17<03:51,  2.22it/s, est. speed input: 2913.30 toks/s, output: 145.41 toks/s]



Processed prompts:  80%|████████  | 1552/1933 [13:16<04:24,  1.44it/s, est. speed input: 2982.07 toks/s, output: 147.54 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:49<00:00,  2.04it/s, est. speed input: 3137.76 toks/s, output: 157.08 toks/s]


              precision    recall  f1-score   support

       error       0.48      0.89      0.63       251
 performance       0.34      0.68      0.45        60
  deployment       0.37      0.77      0.50       165
    question       0.78      0.60      0.67       921
       other       0.91      0.51      0.65       536

    accuracy                           0.63      1933
   macro avg       0.58      0.69      0.58      1933
weighted avg       0.73      0.63      0.64      1933



Processed prompts:  47%|████▋     | 899/1933 [08:03<07:41,  2.24it/s, est. speed input: 2755.05 toks/s, output: 143.69 toks/s] 



Processed prompts:  74%|███████▎  | 1422/1933 [12:20<03:05,  2.75it/s, est. speed input: 2900.65 toks/s, output: 145.80 toks/s]



Processed prompts:  80%|████████  | 1555/1933 [13:18<02:24,  2.62it/s, est. speed input: 2973.33 toks/s, output: 147.73 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:52<00:00,  2.03it/s, est. speed input: 3127.26 toks/s, output: 156.57 toks/s]


              precision    recall  f1-score   support

       error       0.48      0.88      0.62       251
 performance       0.34      0.68      0.45        60
  deployment       0.36      0.76      0.49       165
    question       0.79      0.60      0.68       921
       other       0.92      0.52      0.66       536

    accuracy                           0.63      1933
   macro avg       0.58      0.69      0.58      1933
weighted avg       0.73      0.63      0.64      1933



Processed prompts:  73%|███████▎  | 1420/1933 [12:19<03:00,  2.84it/s, est. speed input: 2902.32 toks/s, output: 144.70 toks/s]



Processed prompts:  80%|████████  | 1555/1933 [13:18<04:04,  1.55it/s, est. speed input: 2972.69 toks/s, output: 146.12 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:51<00:00,  2.03it/s, est. speed input: 3129.41 toks/s, output: 155.00 toks/s]


              precision    recall  f1-score   support

       error       0.48      0.89      0.62       251
 performance       0.33      0.67      0.44        60
  deployment       0.36      0.76      0.49       165
    question       0.77      0.58      0.67       921
       other       0.91      0.51      0.65       536

    accuracy                           0.62      1933
   macro avg       0.57      0.68      0.57      1933
weighted avg       0.72      0.62      0.63      1933



Processed prompts:  47%|████▋     | 901/1933 [08:00<07:22,  2.33it/s, est. speed input: 2773.55 toks/s, output: 141.24 toks/s]



Processed prompts:  73%|███████▎  | 1419/1933 [12:15<03:18,  2.59it/s, est. speed input: 2918.02 toks/s, output: 143.48 toks/s]



Processed prompts:  80%|████████  | 1551/1933 [13:13<06:08,  1.04it/s, est. speed input: 2990.39 toks/s, output: 144.81 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:48<00:00,  2.04it/s, est. speed input: 3140.12 toks/s, output: 154.63 toks/s]


              precision    recall  f1-score   support

       error       0.47      0.89      0.62       251
 performance       0.34      0.67      0.45        60
  deployment       0.37      0.76      0.50       165
    question       0.78      0.59      0.67       921
       other       0.91      0.52      0.66       536

    accuracy                           0.63      1933
   macro avg       0.57      0.69      0.58      1933
weighted avg       0.73      0.63      0.64      1933



Processed prompts:  47%|████▋     | 899/1933 [08:00<06:51,  2.51it/s, est. speed input: 2774.36 toks/s, output: 142.70 toks/s]



Processed prompts:  73%|███████▎  | 1416/1933 [12:14<03:04,  2.80it/s, est. speed input: 2917.05 toks/s, output: 144.85 toks/s]



Processed prompts:  80%|████████  | 1553/1933 [13:13<02:36,  2.42it/s, est. speed input: 2995.02 toks/s, output: 146.48 toks/s]



Processed prompts: 100%|██████████| 1933/1933 [15:49<00:00,  2.04it/s, est. speed input: 3136.34 toks/s, output: 155.97 toks/s]


              precision    recall  f1-score   support

       error       0.48      0.89      0.62       251
 performance       0.33      0.65      0.43        60
  deployment       0.38      0.78      0.51       165
    question       0.78      0.59      0.67       921
       other       0.90      0.52      0.66       536

    accuracy                           0.63      1933
   macro avg       0.57      0.69      0.58      1933
weighted avg       0.72      0.63      0.64      1933



ModuleNotFoundError: No module named 'openpyxl'

In [10]:
final_df = pd.concat(all_reports, ignore_index=True)
final_df.to_excel(result_path, index=False)

In [11]:
result_path

'/root/autodl-fs/log/qwen2.5_14b_prompt.xlsx'

In [9]:
!pip install openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting openpyxl
  Downloading http://mirrors.aliyun.com/pypi/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting et-xmlfile (from openpyxl)
  Downloading http://mirrors.aliyun.com/pypi/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
[0m

In [None]:
import gc
import torch

# 假设你的 vllm 模型对象是 `model`
del model  # 删除模型对象
torch.cuda.empty_cache()  # 清空 GPU 的缓存
gc.collect()  # 强制进行垃圾回收

import sys
# 删除 `vllm` 和相关依赖
del sys.modules["vllm"]
torch.cuda.empty_cache()
gc.collect()

In [27]:
# all question类别打标之后的
from collections import Counter
print(Counter(pred_labels))
origin_labels = [x.lower() for x in origin_labels]
pred_labels = [x.lower() for x in pred_labels]
tmp_pred_labels = []
for x in pred_labels:
    if x not in ["error", "performance", "deployment", "question", "other"]:
        x = 'other'
    tmp_pred_labels.append(x)
pred_labels = tmp_pred_labels

from sklearn.metrics import classification_report

labels = ["error", "performance", "deployment", "question", "other"]
label_map = {
    "error": 0, 
    "performance": 1,
    "deployment": 2,
    "question": 3,
    "other": 4
}
final_origin_labels_num = [label_map[l] for l in origin_labels]
final_pred_labels_num = [label_map[l] for l in pred_labels]

report = classification_report(final_origin_labels_num, final_pred_labels_num, target_names=labels)
print(report)

Counter({'question': 1030, 'error': 322, 'other': 298, 'deployment': 242, 'performance': 37, 'feature request': 2, 'feature_request': 2})
              precision    recall  f1-score   support

       error       0.53      0.68      0.59       251
 performance       0.43      0.27      0.33        60
  deployment       0.42      0.62      0.50       165
    question       0.69      0.77      0.73       921
       other       0.79      0.44      0.57       536

    accuracy                           0.64      1933
   macro avg       0.57      0.55      0.54      1933
weighted avg       0.66      0.64      0.63      1933

