In [1]:
import json
import re

def extract_json_from_string(input_string):
    """
    Extract the first JSON object embedded in a string and parse it into a dict.

    The text is scanned for ``{`` characters and a real JSON parse is attempted
    at each one, so nested objects and braces inside string values are handled
    correctly. An object wrapped in doubled braces (``{{ ... }}``) is recovered
    as well, because the outer ``{`` fails to parse and the inner one succeeds.

    Args:
    input_string (str): The string containing JSON-like content.

    Returns:
    dict: Parsed JSON content as a dictionary, or an empty dictionary if the
    input is not a string or contains no parsable JSON object.
    """
    if not isinstance(input_string, str):
        return {}

    decoder = json.JSONDecoder()
    start = input_string.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # whatever text follows it.
            candidate, _ = decoder.raw_decode(input_string, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # Not a valid object here; try the next opening brace.
        start = input_string.find('{', start + 1)

    # Return empty dictionary if no valid JSON is found
    return {}

# Demo inputs: a plain JSON object and one wrapped in doubled braces,
# each surrounded by unrelated text.
input_string_1 = """
Some random text
{
    "issue_type": "other"
}
More text here
"""

input_string_2 = """
Some text with extra braces {{
    "issue_type": "error"
}}
End of text
"""

# Parse both samples in one pass and show the results as the cell output.
parsed_json_1, parsed_json_2 = (
    extract_json_from_string(sample)
    for sample in (input_string_1, input_string_2)
)

(parsed_json_1, parsed_json_2)

({'issue_type': 'other'}, {'issue_type': 'error'})

In [2]:
import os

# Directory of the local model checkpoint loaded by the cells below.
# The default is the Qwen2.5-14B-Instruct copy on this machine; set the
# QWEN_MODEL_DIR environment variable to use another checkpoint without
# editing the notebook.
BASE_MODEL_DIR = os.environ.get(
    "QWEN_MODEL_DIR",
    "/mnt/cc6bfa99-761f-4d73-a77a-b5dc047a5d3a/zhaoyu/VSCode/LLM/Qwen2.5-14B-Instruct",
)


In [None]:
import os

# Choose the GPUs before importing any CUDA-backed library, so the setting is
# already in place if one of those imports queries the available devices.
# Four devices are listed to match tensor_parallel_size=4 below.
os.environ["CUDA_VISIBLE_DEVICES"] = "7,4,5,6"

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_DIR)

# Decoding hyperparameters for this smoke test.
# max_tokens is the maximum number of generated tokens.
sampling_params = SamplingParams(temperature=0.5, top_p=1.0, repetition_penalty=1.05, max_tokens=512)

# Load the checkpoint at BASE_MODEL_DIR, sharded across the four visible GPUs.
model = LLM(model=BASE_MODEL_DIR, tensor_parallel_size=4)

# Prepare your prompts
prompt = "Tell me something about large language models."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# generate outputs
outputs = model.generate([text], sampling_params)

# Print the outputs. The rendered prompt is read from the result object so the
# `prompt` variable above keeps the plain user question.
for output in outputs:
    generated_text = output.outputs[0].text
    print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")


INFO 04-08 19:15:55 config.py:905] Defaulting to use mp for distributed inference
INFO 04-08 19:15:55 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='/mnt/cc6bfa99-761f-4d73-a77a-b5dc047a5d3a/zhaoyu/VSCode/LLM/Qwen2.5-14B-Instruct', speculative_config=None, tokenizer='/mnt/cc6bfa99-761f-4d73-a77a-b5dc047a5d3a/zhaoyu/VSCode/LLM/Qwen2.5-14B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forw

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=1992350)[0;0m INFO 04-08 19:17:50 model_runner.py:1067] Loading model weights took 6.9459 GB
INFO 04-08 19:17:50 model_runner.py:1067] Loading model weights took 6.9459 GB
[1;36m(VllmWorkerProcess pid=1992351)[0;0m INFO 04-08 19:17:50 model_runner.py:1067] Loading model weights took 6.9459 GB
[1;36m(VllmWorkerProcess pid=1992349)[0;0m INFO 04-08 19:17:50 model_runner.py:1067] Loading model weights took 6.9459 GB
INFO 04-08 19:18:14 distributed_gpu_executor.py:57] # GPU blocks: 15551, # CPU blocks: 5461
INFO 04-08 19:18:14 distributed_gpu_executor.py:61] Maximum concurrency for 32768 tokens per request: 7.59x
[1;36m(VllmWorkerProcess pid=1992350)[0;0m INFO 04-08 19:18:25 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
[1;36m(VllmWorkerProcess pid=1992350)[0;0m INFO 04-08 19:18:

Processed prompts: 100%|█████████████████| 1/1 [00:03<00:00,  3.59s/it, est. speed input: 10.31 toks/s, output: 61.59 toks/s]

Prompt: '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nTell me something about large language models.<|im_end|>\n<|im_start|>assistant\n', Generated text: "Large language models, like the one you're interacting with now, are sophisticated artificial intelligence systems designed to understand and generate human-like text based on the vast amount of text data they've been trained on. These models are characterized by their enormous size, typically measured in billions of parameters, which allows them to capture complex patterns and nuances in natural language.\n\nOne key aspect of these models is their ability to perform a wide range of tasks without being specifically programmed for each one. This versatility comes from their training process, where they learn from diverse datasets that include books, articles, websites, and more. As a result, they can generate text, answer questions, translate languages, write sto




[1;36m(VllmWorkerProcess pid=1992350)[0;0m [1;36m(VllmWorkerProcess pid=1992349)[0;0m [1;36m(VllmWorkerProcess pid=1992351)[0;0m INFO 04-09 01:19:52 multiproc_worker_utils.py:240] Worker exiting
INFO 04-09 01:19:52 multiproc_worker_utils.py:240] Worker exiting
INFO 04-09 01:19:52 multiproc_worker_utils.py:240] Worker exiting


In [4]:
def get_qwen_output(
    model,
    tokenizer,
    messages_list,
    max_input_length=4096,
    max_tokens=512,
):
    """
    Generate one completion per conversation in a single batched call.

    Args:
    model: Object exposing ``generate(prompts, sampling_params)``.
    tokenizer: Object exposing ``apply_chat_template``.
    messages_list (list): One chat-message list per conversation.
    max_input_length (int): Accepted but not used by this function.
    max_tokens (int): Maximum number of tokens to generate per conversation.

    Returns:
    list: The first generated text for each conversation, in input order.
    """
    # Render every conversation to a prompt string ending with the generation prompt.
    prompts = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
        for conversation in messages_list
    ]

    decoding = SamplingParams(
        temperature=0.3,
        top_p=1.0,
        repetition_penalty=1.05,
        max_tokens=max_tokens,
    )

    # Keep only the first candidate of each request.
    return [request.outputs[0].text for request in model.generate(prompts, decoding)]

# Smoke test: send a single two-message conversation through the batched helper.
system_turn = {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}
user_turn = {"role": "user", "content": "Give me a short introduction to large language model."}
messages = [system_turn, user_turn]

get_qwen_output(model, tokenizer, [messages])

Processed prompts:   0%|                           | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|█████████████████| 1/1 [00:02<00:00,  2.02s/it, est. speed input: 19.29 toks/s, output: 63.32 toks/s]


['A large language model is an artificial intelligence system designed to understand and generate human-like text based on the patterns it has learned from vast amounts of textual data. These models are typically trained using deep learning techniques on diverse datasets that can include books, articles, websites, and other written content. By processing this extensive corpus, they learn to predict the likelihood of certain words following others, which allows them to generate coherent responses to a wide variety of prompts or questions. Large language models are used in many applications such as chatbots, automated customer service, content creation, and more, providing capabilities that range from simple text completion to complex dialogue systems.']

In [5]:
data_path = "/mnt/cc6bfa99-761f-4d73-a77a-b5dc047a5d3a/zhaoyu/VSCode/LLM/matched_results_test_modify_other_update.json"

# Read the whole dataset, then release the file handle before inspecting it.
with open(data_path, encoding="utf-8") as fp:
    issue_data = json.load(fp)

# Preview the first four issues with the fields used for classification.
for data in issue_data[:4]:
    user, title, body, label, author_association = (
        data["user"]["login"],
        data["title"],
        data["body"],
        data["tag_labels"],
        data["author_association"],
    )
    # One (login, author_association, body) tuple per comment; empty list if none.
    comment_list = [
        (com["user"]["login"], com["author_association"], com["body"])
        for com in data["comments_list"]
    ]

    print(f"user: {user}")
    print(f"Title: {title}")
    print(f"Body: {body}")
    print(f"label: {label}")
    print(f"author_association: {author_association}")
    print(f"comment_list: {comment_list}")


uesr: RainBoltz
Title: No output displayed or it gets stuck - OpenCV issue
Body: my terminal isn't responding after executed the command below: `./build/examples/openpose/rtpose.bin --imagelevel 1 --netcaffeopenpose.sh` , which was provided by official
label: deployment
author_association: NONE
comment_list: [('bigmoumou', 'NONE', 'I have the same problem too\r\n\r\nIs there any solution ?\r\n\r\n![help](https://cloud.githubusercontent.com/assets/16252975/25697753/cddcfe9e-30ee-11e7-82a2-2af97eb8460b.png)\r\n\r\n'), ('Shawnroom', 'NONE', 'I have the same issue, and wonder if there is any solution.'), ('gineshidalgo99', 'MEMBER', 'Sorry to hear that, we are working on fixing that error. We think it is due to OpenCV compiled with Qt or different visualization support.\r\n\r\nMeanwhile, you can make it work by:\r\n1. Completely uninstalling your current OpenCV version.\r\n2. Installing the default OpenCV from the Ubuntu repository: `apt-get install libopencv-dev`, or alternatively compili

In [6]:
# System prompt for the issue classifier. It defines the five categories
# (error, performance, deployment, question, other), the required JSON output
# shape {"issue_type": ...}, and one worked example per category.
# It is a raw string, so backslashes are taken literally.
global_issue_prompt = r"""
### **Role**  
You are an expert in GitHub repository analysis. Your task is to classify a given GitHub Issue into one of the following categories: **error**, **performance**, **deployment**, **question**, or **other**.

Analyze the conversation context thoroughly and determine the correct classification. If the issue is unrelated to the repository’s functionality or purpose, it must be categorized as **other**.

### **Issue Categories**  

- **error**:  
  Problems directly caused by the repository’s code, configuration, or inherent incompatibilities within the repository. For example:
  - Runtime errors.
  - Exceptions.
  - Failures in repository code execution.  

- **performance**:  
  Issues where the repository’s code or configuration leads to:
  - Slow execution times.
  - Resource bottlenecks (e.g., CPU, GPU, memory, or storage).
  - Inefficient resource usage (e.g., excessive memory or storage consumption).

- **deployment**:  
  Issues arising during the installation or deployment process that are specifically caused by:
  - Defects in the repository code.
  - Incomplete or inadequate documentation.
  - Configuration problems stemming from the repository.  

  *Note*: If the issue is caused by user errors (e.g., outdated dependencies, incorrect tools) or hardware/environmental limitations, it should be categorized as **question**.

- **question**:  
  Issues originating from:
  - Misunderstandings or failure to follow documentation.
  - Incorrect usage of the repository’s features.
  - Local environment misconfigurations not caused by the repository code or documentation.  

  *Note*: This category also includes:
  - User questions about usage scenarios.
  - Discussions seeking clarification or additional guidance.

- **other**:  
  Issues outside the predefined categories, including:
  1. Topics unrelated to the repository’s functionality.
  2. Feature requests or discussions beyond the repository’s purpose.
  3. Suggestions for improving documentation, usability, or community processes.


### **Process**  
**The process must be strictly executed, and the output must adhere to the defined JSON format.**

1. **Analyze the Conversation**:  
   Review the entire conversation context and evaluate the issue based on the evidence provided.

2. **Determine the Final Classification**:  
   Assign the issue to one of the five categories:
   - **error**
   - **performance**
   - **deployment**
   - **question**
   - **other**

3. **Output**:  
   Generate a single JSON object as the result, formatted as follows:  
   ```json
   {"issue_type": "<error|performance|deployment|question|other>"}
   ```

### **Examples**  

#### Example 1: Error
*Conversation:*  
- "Running model.fit() raises a KeyError related to missing labels in the dataset."  
- "We’ll patch data_loader.py to handle missing labels."  
- "The fix is merged. Let us know if it resolves the problem."

*Final Output:*  
```json
{"issue_type": "error"}
```

#### Example 2: Question
*Conversation:*  
- "I tried running the code, but the output looks weird. Is this a bug?"  
- "Have you verified if the input data matches the format described in the README?"  
- "I missed the formatting instructions. After fixing the input, it works fine."

*Final Output:*  
```json
{"issue_type": "question"}
```

#### Example 3: Performance
*Conversation:*  
- "The code runs much slower than expected for large datasets. Is there a way to optimize?"  
- "We could optimize the data processing step using multi-threading."

*Final Output:*  
```json
{"issue_type": "performance"}
```  

#### Example 4: Deployment
*Conversation:*  
- "The installation instructions don't mention the CUDA version required. The code fails on my setup."  
- "We’ll update the documentation to specify the supported CUDA versions."

*Final Output:*  
```json
{"issue_type": "deployment"}
```  

#### Example 5: Other
*Conversation:*  
- "It would be great if this repository supported visualization tools for monitoring model training."  
- "Thanks for the suggestion. We’ll consider this for future updates."

*Final Output:*  
```json
{"issue_type": "other"}
```  
"""

# Template that uses only the opening post of the issue.
# Three positional slots: a speaker prefix, the title and the body.
question_prompt = """
````
*Conversation*:
- {}: ### Title: "{}" ### Body: "{}"
````
"""

# Variant used when comments are included.
# Two positional slots, joined by a space inside one quoted bullet.
question_comment_prompt = """
*Conversation*:
- "{} {}"
"""

# One follow-up entry rendered as its own quoted bullet (single positional slot).
comment_prompt = """
- "{}"
"""

# Display label per association value; presumably applied to the author of the
# opening post — verify against the code that uses it.
ROLE_MAP_BEGIN = dict(
    NONE="ISSUE RAISER",
    MEMBER="MEMBER",
    COLLABORATOR="COLLABORATOR",
    CONTRIBUTOR="CONTRIBUTOR",
    OWNER="OWNER",
)

# Same labels for comment authors; only the "NONE" entry differs.
ROLE_MAP_COMMENT = {**ROLE_MAP_BEGIN, "NONE": "Commenter"}

In [7]:
# Per-repository metadata keyed by "owner/name".
# Each entry holds a prose "description" and the repository "url".
REPO_DESC = {
    "CMU-Perceptual-Computing-Lab/openpose": {
        "description": "OpenPose is a real-time multi-person keypoint detection library developed by the Carnegie Mellon Perceptual Computing Lab. It estimates human body, face, hands, and foot keypoints from single images, providing 2D real-time multi-person keypoint detection, including 15, 18, or 25-keypoint body/foot keypoint estimation, 2x21-keypoint hand keypoint estimation, and 70-keypoint face keypoint estimation. Additionally, it offers 3D real-time single-person keypoint detection and a calibration toolbox. OpenPose is compatible with various operating systems, including Ubuntu, Windows, and macOS, and supports CUDA (Nvidia GPU), OpenCL (AMD GPU), and CPU-only versions.",
        "url": "https://github.com/CMU-Perceptual-Computing-Lab/openpose"
    },
    "CorentinJ/Real-Time-Voice-Cloning": {
        "description": "Real-Time Voice Cloning is a Python-based tool that enables the cloning of voices in real-time. It utilizes deep learning models to synthesize speech that mimics a target voice, requiring only a few seconds of audio from the desired speaker. The repository provides code and instructions to train and use the voice cloning system.",
        "url": "https://github.com/CorentinJ/Real-Time-Voice-Cloning"
    },
    "JaidedAI/EasyOCR": {
        "description": "EasyOCR is an open-source Optical Character Recognition (OCR) library that supports over 80 languages. It is designed to be easy to use and provides accurate text recognition from images. The library is built on PyTorch and offers pre-trained models for various languages and scripts.",
        "url": "https://github.com/JaidedAI/EasyOCR"
    },
    "deepfakes/faceswap": {
        "description": "Faceswap is a deep learning-based tool for face-swapping in images and videos. It allows users to train models to swap faces between different subjects, providing a platform for experimenting with deepfake technology. The repository includes code for training and using the face-swapping models.",
        "url": "https://github.com/deepfakes/faceswap"
    },
    "deezer/spleeter": {
        "description": "Spleeter is an open-source tool developed by Deezer for source separation in music tracks. It uses deep learning models to separate audio into stems, such as vocals and accompaniment, enabling applications like karaoke and remixing. The repository provides pre-trained models and code for audio source separation.",
        "url": "https://github.com/deezer/spleeter"
    },
    "dusty-nv/jetson-inference": {
        "description": "Jetson Inference is a collection of deep learning inference samples and models for NVIDIA Jetson devices. It includes code for image classification, object detection, and segmentation, optimized for Jetson hardware. The repository provides pre-trained models and examples to demonstrate the capabilities of Jetson devices in AI applications.",
        "url": "https://github.com/dusty-nv/jetson-inference"
    },
    "iperov/DeepFaceLab": {
        "description": "DeepFaceLab is a deep learning tool for creating deepfakes, focusing on face-swapping in videos. It provides a comprehensive set of tools for training and applying deep learning models to perform face-swapping tasks. The repository includes code for data preparation, model training, and face-swapping applications.",
        "url": "https://github.com/iperov/DeepFaceLab"
    },
    "junyanz/pytorch-CycleGAN-and-pix2pix": {
        "description": "This repository provides PyTorch implementations of CycleGAN and pix2pix, two popular models for image-to-image translation tasks. CycleGAN enables image translation without paired examples, while pix2pix requires paired images for training. The repository includes code and pre-trained models for various image translation tasks.",
        "url": "https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix"
    },
    "mozilla/TTS": {
        "description": "Mozilla TTS is an open-source Text-to-Speech (TTS) engine that aims to make speech synthesis more accessible. It provides implementations of state-of-the-art TTS models, including Tacotron and FastSpeech, and supports training on custom datasets. The repository includes code for training and using TTS models.",
        "url": "https://github.com/mozilla/TTS"
    },
    "streamlit/streamlit": {
        "description": "Streamlit is an open-source app framework for Machine Learning and Data Science projects. It allows users to create interactive web applications for data analysis and visualization with minimal code. The repository provides the core framework and examples for building Streamlit applications.",
        "url": "https://github.com/streamlit/streamlit"
    },
    # NOTE(review): the key uses the "microsoft" owner while the URL below points
    # at "recommenders-team"; presumably the key must match the owner found in
    # the dataset's issue URLs — verify before changing either.
    "microsoft/recommenders": {
        "description": "Recommenders is a project under the Linux Foundation of AI and Data. This repository contains examples and best practices for building recommendation systems, provided as Jupyter notebooks. The examples detail learnings on five key tasks: preparing and loading data for each recommendation algorithm, building models using various classical and deep learning recommendation algorithms such as Alternating Least Squares (ALS) or eXtreme Deep Factorization Machines (xDeepFM), evaluating algorithms with offline metrics, tuning and optimizing hyperparameters for recommendation models, and operationalizing models in a production environment on Azure. Several utilities are provided to support common tasks such as loading datasets in the format expected by different algorithms, evaluating model outputs, and splitting training/test data. Implementations of several state-of-the-art algorithms are included for self-study and customization in your own applications.",
        "url": "https://github.com/recommenders-team/recommenders"
    }
}

In [8]:
import pandas as pd
from sklearn.metrics import classification_report

from sklearn.metrics import classification_report, matthews_corrcoef

TIMES = 10  # number of repeated generation runs over the same prompts
experiments = []
# Report frames of all runs (each followed by a one-row spacer) for the Excel export.
all_reports = []

data_path = "/mnt/cc6bfa99-761f-4d73-a77a-b5dc047a5d3a/zhaoyu/VSCode/LLM/matched_results_test_modify_other_update.json"
result_path = "/mnt/cc6bfa99-761f-4d73-a77a-b5dc047a5d3a/zhaoyu/VSCode/LLM/qwen2.5_14b_prompt.xlsx"

# Class names in report order and their integer ids.
labels = ["error", "performance", "deployment", "question", "other"]
label_map = {
    "error": 0,
    "performance": 1,
    "deployment": 2,
    "question": 3,
    "other": 4
}
label_ids = [label_map[name] for name in labels]

# The dataset and the prompts are identical for every run, so they are loaded
# and built once instead of once per run.
with open(data_path, encoding="utf-8") as fp:
    issue_data = json.load(fp)
taged_data = issue_data
join_index = list(range(len(issue_data)))

origin_labels = []
all_messages = []
for data in issue_data:
    # Every issue must belong to a repository listed in REPO_DESC; fail with a
    # readable message instead of an AttributeError/KeyError.
    url = data["html_url"]
    match = re.search(r"github\.com/([^/]+/[^/]+)", url)
    if match is None or match.group(1) not in REPO_DESC:
        raise ValueError(f"Issue URL does not map to a repository in REPO_DESC: {url!r}")

    # System prompt, then the opening post, then one user message per comment body.
    messages = [
        {
            "role": "system",
            "content": global_issue_prompt
        },
        {
            "role": "user",
            "content": question_comment_prompt.format(data["title"], data["body"])
        }
    ]
    messages.extend(
        {"role": "user", "content": comment_prompt.format(comment["body"])}
        for comment in data["comments_list"]
    )

    all_messages.append(messages)
    origin_labels.append(data["tag_labels"].lower())

final_origin_labels_num = [label_map[name] for name in origin_labels]

for t in range(TIMES):
    responses = get_qwen_output(model, tokenizer, all_messages, max_tokens=4096)

    # Anything that is not one of the five known class names (missing key,
    # non-string value, unknown name) is counted as "other".
    pred_labels = []
    for response in responses:
        issue_type = extract_json_from_string(response).get("issue_type")
        issue_type = issue_type.lower() if isinstance(issue_type, str) else "other"
        pred_labels.append(issue_type if issue_type in label_map else "other")
    final_pred_labels_num = [label_map[name] for name in pred_labels]

    # Passing `labels` keeps the report aligned with `target_names` even when a
    # class does not occur in a run.
    report = classification_report(
        final_origin_labels_num, final_pred_labels_num, labels=label_ids, target_names=labels
    )
    mcc = matthews_corrcoef(final_origin_labels_num, final_pred_labels_num)
    print("MCC:", mcc)

    print(report)
    report_dict = classification_report(
        final_origin_labels_num,
        final_pred_labels_num,
        labels=label_ids,
        target_names=labels,
        output_dict=True,
    )
    # Convert the report dict to a DataFrame with one row per category/average.
    df_report = pd.DataFrame(report_dict).transpose()
    df_report["experiment_id"] = t + 1  # 1-based run number
    df_report.index.name = "category"
    df_report = df_report.reset_index()

    all_reports.append(df_report)
    all_reports.append(pd.DataFrame({"category": [""]}))  # blank spacer row between runs

final_df = pd.concat(all_reports, ignore_index=True)
final_df.to_excel(result_path, index=False)

Processed prompts:  42%|████▏     | 808/1933 [17:49<23:32,  1.26s/it, est. speed input: 1140.05 toks/s, output: 56.61 toks/s]



Processed prompts:  69%|██████▏  | 1336/1933 [27:24<10:10,  1.02s/it, est. speed input: 1233.76 toks/s, output: 59.95 toks/s]



Processed prompts:  76%|██████▉  | 1477/1933 [29:38<07:58,  1.05s/it, est. speed input: 1268.83 toks/s, output: 61.49 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:44<00:00,  1.11s/it, est. speed input: 1389.06 toks/s, output: 70.39 toks/s]


MCC: 0.4994122231327932
              precision    recall  f1-score   support

       error       0.47      0.88      0.61       251
 performance       0.32      0.65      0.43        60
  deployment       0.36      0.75      0.48       165
    question       0.78      0.57      0.66       921
       other       0.88      0.53      0.66       536

    accuracy                           0.62      1933
   macro avg       0.56      0.67      0.57      1933
weighted avg       0.72      0.62      0.63      1933



Processed prompts:  42%|████▏     | 814/1933 [17:44<21:19,  1.14s/it, est. speed input: 1145.50 toks/s, output: 56.19 toks/s]



Processed prompts:  69%|██████▏  | 1341/1933 [27:23<10:26,  1.06s/it, est. speed input: 1234.37 toks/s, output: 59.68 toks/s]



Processed prompts:  77%|██████▉  | 1479/1933 [29:36<07:03,  1.07it/s, est. speed input: 1270.59 toks/s, output: 60.66 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:47<00:00,  1.11s/it, est. speed input: 1387.35 toks/s, output: 68.84 toks/s]


MCC: 0.4977815817862815
              precision    recall  f1-score   support

       error       0.46      0.85      0.59       251
 performance       0.33      0.68      0.45        60
  deployment       0.35      0.73      0.48       165
    question       0.78      0.59      0.67       921
       other       0.90      0.51      0.66       536

    accuracy                           0.62      1933
   macro avg       0.57      0.67      0.57      1933
weighted avg       0.72      0.62      0.63      1933



Processed prompts:  42%|████▏     | 807/1933 [17:44<23:37,  1.26s/it, est. speed input: 1143.49 toks/s, output: 55.41 toks/s]



Processed prompts:  69%|██████▏  | 1342/1933 [27:18<08:51,  1.11it/s, est. speed input: 1236.63 toks/s, output: 60.25 toks/s]



Processed prompts:  77%|██████▉  | 1485/1933 [29:38<10:43,  1.44s/it, est. speed input: 1269.93 toks/s, output: 61.31 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:32<00:00,  1.10s/it, est. speed input: 1397.19 toks/s, output: 69.95 toks/s]


MCC: 0.5046034576890172
              precision    recall  f1-score   support

       error       0.47      0.88      0.61       251
 performance       0.33      0.67      0.44        60
  deployment       0.37      0.77      0.50       165
    question       0.78      0.59      0.67       921
       other       0.89      0.51      0.65       536

    accuracy                           0.62      1933
   macro avg       0.57      0.68      0.57      1933
weighted avg       0.72      0.62      0.64      1933



Processed prompts:   1%|▏          | 22/1933 [03:23<3:47:32,  7.14s/it, est. speed input: 245.06 toks/s, output: 1.29 toks/s]



Processed prompts:  42%|████▏     | 816/1933 [17:47<22:31,  1.21s/it, est. speed input: 1136.02 toks/s, output: 55.63 toks/s]



Processed prompts:  69%|██████▏  | 1340/1933 [27:28<09:53,  1.00s/it, est. speed input: 1229.50 toks/s, output: 59.98 toks/s]



Processed prompts:  77%|██████▉  | 1484/1933 [29:47<06:13,  1.20it/s, est. speed input: 1262.83 toks/s, output: 61.38 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [36:02<00:00,  1.12s/it, est. speed input: 1377.81 toks/s, output: 69.60 toks/s]


MCC: 0.5020944275879793
              precision    recall  f1-score   support

       error       0.47      0.88      0.61       251
 performance       0.33      0.65      0.44        60
  deployment       0.37      0.73      0.49       165
    question       0.78      0.60      0.68       921
       other       0.88      0.51      0.64       536

    accuracy                           0.62      1933
   macro avg       0.56      0.67      0.57      1933
weighted avg       0.72      0.62      0.64      1933



Processed prompts:  42%|███▎    | 807/1933 [17:35<1:14:16,  3.96s/it, est. speed input: 1153.04 toks/s, output: 56.63 toks/s]



Processed prompts:  70%|██████▎  | 1347/1933 [27:10<09:31,  1.02it/s, est. speed input: 1245.79 toks/s, output: 61.12 toks/s]



Processed prompts:  77%|██████▉  | 1487/1933 [29:26<07:39,  1.03s/it, est. speed input: 1277.67 toks/s, output: 62.30 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:19<00:00,  1.10s/it, est. speed input: 1405.20 toks/s, output: 70.33 toks/s]


MCC: 0.5014125225316147
              precision    recall  f1-score   support

       error       0.47      0.87      0.61       251
 performance       0.33      0.63      0.43        60
  deployment       0.37      0.76      0.49       165
    question       0.78      0.59      0.67       921
       other       0.89      0.51      0.65       536

    accuracy                           0.62      1933
   macro avg       0.57      0.67      0.57      1933
weighted avg       0.72      0.62      0.64      1933



Processed prompts:  42%|████▏     | 804/1933 [17:38<30:09,  1.60s/it, est. speed input: 1146.65 toks/s, output: 54.92 toks/s]



Processed prompts:  69%|██████▏  | 1339/1933 [27:24<10:29,  1.06s/it, est. speed input: 1232.98 toks/s, output: 60.15 toks/s]



Processed prompts:  77%|██████▉  | 1480/1933 [29:39<07:24,  1.02it/s, est. speed input: 1267.10 toks/s, output: 61.36 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:25<00:00,  1.10s/it, est. speed input: 1401.28 toks/s, output: 69.40 toks/s]


MCC: 0.49392614559774495
              precision    recall  f1-score   support

       error       0.46      0.86      0.60       251
 performance       0.34      0.70      0.46        60
  deployment       0.36      0.73      0.48       165
    question       0.77      0.58      0.66       921
       other       0.89      0.51      0.65       536

    accuracy                           0.62      1933
   macro avg       0.56      0.68      0.57      1933
weighted avg       0.71      0.62      0.63      1933



Processed prompts:  42%|███▎    | 806/1933 [17:40<1:03:45,  3.39s/it, est. speed input: 1147.50 toks/s, output: 55.99 toks/s]



Processed prompts:  70%|██████▎  | 1344/1933 [27:15<08:55,  1.10it/s, est. speed input: 1240.77 toks/s, output: 60.25 toks/s]



Processed prompts:  77%|██████▉  | 1481/1933 [29:28<11:32,  1.53s/it, est. speed input: 1275.58 toks/s, output: 61.57 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:32<00:00,  1.10s/it, est. speed input: 1396.60 toks/s, output: 70.29 toks/s]


MCC: 0.49991211463176727
              precision    recall  f1-score   support

       error       0.47      0.87      0.61       251
 performance       0.34      0.67      0.45        60
  deployment       0.36      0.73      0.48       165
    question       0.77      0.59      0.67       921
       other       0.89      0.51      0.65       536

    accuracy                           0.62      1933
   macro avg       0.57      0.68      0.57      1933
weighted avg       0.72      0.62      0.63      1933



Processed prompts:  42%|████▏     | 804/1933 [17:45<24:50,  1.32s/it, est. speed input: 1138.99 toks/s, output: 55.29 toks/s]



Processed prompts:  69%|██████▏  | 1337/1933 [27:22<09:54,  1.00it/s, est. speed input: 1236.51 toks/s, output: 59.91 toks/s]



Processed prompts:  76%|██████▉  | 1478/1933 [29:39<07:59,  1.05s/it, est. speed input: 1269.12 toks/s, output: 61.29 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:41<00:00,  1.11s/it, est. speed input: 1390.74 toks/s, output: 69.83 toks/s]


MCC: 0.4950562575698578
              precision    recall  f1-score   support

       error       0.48      0.88      0.62       251
 performance       0.31      0.67      0.43        60
  deployment       0.36      0.75      0.48       165
    question       0.77      0.57      0.66       921
       other       0.88      0.51      0.65       536

    accuracy                           0.61      1933
   macro avg       0.56      0.68      0.57      1933
weighted avg       0.71      0.61      0.63      1933



Processed prompts:  42%|████▏     | 816/1933 [17:39<22:49,  1.23s/it, est. speed input: 1144.30 toks/s, output: 56.19 toks/s]



Processed prompts:  70%|██████▎  | 1344/1933 [27:23<09:37,  1.02it/s, est. speed input: 1235.57 toks/s, output: 60.31 toks/s]



Processed prompts:  77%|██████▉  | 1481/1933 [29:32<06:54,  1.09it/s, est. speed input: 1271.03 toks/s, output: 61.39 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:55<00:00,  1.11s/it, est. speed input: 1382.28 toks/s, output: 70.18 toks/s]


MCC: 0.5051055305455062
              precision    recall  f1-score   support

       error       0.48      0.87      0.62       251
 performance       0.33      0.67      0.44        60
  deployment       0.36      0.75      0.49       165
    question       0.78      0.59      0.67       921
       other       0.89      0.52      0.66       536

    accuracy                           0.62      1933
   macro avg       0.57      0.68      0.58      1933
weighted avg       0.72      0.62      0.64      1933



Processed prompts:  42%|████▏     | 805/1933 [17:46<25:22,  1.35s/it, est. speed input: 1137.85 toks/s, output: 55.47 toks/s]



Processed prompts:  69%|██████▏  | 1336/1933 [27:25<12:25,  1.25s/it, est. speed input: 1233.53 toks/s, output: 60.08 toks/s]



Processed prompts:  77%|██████▉  | 1479/1933 [29:40<07:08,  1.06it/s, est. speed input: 1267.97 toks/s, output: 61.15 toks/s]



Processed prompts: 100%|█████████| 1933/1933 [35:37<00:00,  1.11s/it, est. speed input: 1393.37 toks/s, output: 69.54 toks/s]


MCC: 0.4991873051465983
              precision    recall  f1-score   support

       error       0.47      0.88      0.61       251
 performance       0.33      0.68      0.44        60
  deployment       0.36      0.73      0.48       165
    question       0.77      0.58      0.66       921
       other       0.90      0.51      0.65       536

    accuracy                           0.62      1933
   macro avg       0.57      0.68      0.57      1933
weighted avg       0.72      0.62      0.63      1933



In [9]:
# Stack the objects collected in `all_reports` into one table with a fresh
# 0..n-1 index, then write it to the spreadsheet at `result_path` without
# emitting the index column.
final_df = pd.concat(objs=all_reports, ignore_index=True)
final_df.to_excel(excel_writer=result_path, index=False)

In [10]:
# Echo the output path so the notebook records where the spreadsheet was written.
result_path

'/mnt/cc6bfa99-761f-4d73-a77a-b5dc047a5d3a/zhaoyu/VSCode/LLM/qwen2.5_14b_prompt.xlsx'

In [None]:
import gc
import sys

import torch

# Drop the notebook's reference to the vLLM model object (expected to be bound
# to `model`). A missing name is tolerated so this cell can be re-run after an
# earlier cleanup without raising NameError.
try:
    del model
except NameError:
    pass

# Remove the top-level `vllm` entry from the import cache. `pop` with a default
# avoids a KeyError when the module is not (or is no longer) loaded. Only this
# one key is removed; any `vllm.*` submodule entries are left untouched.
sys.modules.pop("vllm", None)

# Run the garbage collector first so unreachable objects are destroyed, then
# ask PyTorch to release the cached GPU memory that is no longer in use.
gc.collect()
torch.cuda.empty_cache()

INFO 04-09 01:19:52 multiproc_worker_utils.py:133] Terminating local vLLM worker processes


0

: 

In [27]:
# Evaluation after the "all question" category labelling run
# (translated from the original Chinese note).
from collections import Counter

from sklearn.metrics import classification_report

# The class order fixes both the integer encoding and the row order of the report.
labels = ["error", "performance", "deployment", "question", "other"]
label_map = {name: idx for idx, name in enumerate(labels)}

# Show the raw prediction distribution before normalisation so that
# out-of-vocabulary model outputs remain visible in the cell output.
print(Counter(pred_labels))

origin_labels = [x.lower() for x in origin_labels]
# Lower-case the predictions, then fold anything outside the known classes
# into 'other'.
pred_labels = [x.lower() for x in pred_labels]
pred_labels = [x if x in label_map else 'other' for x in pred_labels]

# Ground-truth labels are expected to belong to the known classes; an unknown
# one raises KeyError here rather than being silently remapped.
final_origin_labels_num = [label_map[l] for l in origin_labels]
final_pred_labels_num = [label_map[l] for l in pred_labels]

# Pass `labels` explicitly so the report always has one row per class, in the
# declared order. Without it, a class missing from both the ground truth and
# the predictions makes `target_names` mismatch the inferred classes and the
# call fails.
report = classification_report(
    final_origin_labels_num,
    final_pred_labels_num,
    labels=list(label_map.values()),
    target_names=labels,
)
print(report)

Counter({'question': 1030, 'error': 322, 'other': 298, 'deployment': 242, 'performance': 37, 'feature request': 2, 'feature_request': 2})
              precision    recall  f1-score   support

       error       0.53      0.68      0.59       251
 performance       0.43      0.27      0.33        60
  deployment       0.42      0.62      0.50       165
    question       0.69      0.77      0.73       921
       other       0.79      0.44      0.57       536

    accuracy                           0.64      1933
   macro avg       0.57      0.55      0.54      1933
weighted avg       0.66      0.64      0.63      1933

