# 1. 前期准备

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import trange

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM


device = "cuda"  # the device to load the model onto
model_path = "../.cache/modelscope/hub/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path, device_map="auto", trust_remote_code=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
CUDA extension not installed.
CUDA extension not installed.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
model.dtype

torch.float16

In [4]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    inference_mode=False,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=32,  # Lora alaph
    lora_dropout=0.1,  # Dropout 比例
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'gate_proj', 'k_proj', 'q_proj', 'o_proj', 'down_proj', 'v_proj', 'up_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

In [5]:
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 19,988,480 || all params: 1,264,914,432 || trainable%: 1.580223886638365


# 2. 导入数据

In [6]:
test_dataset = pd.read_csv("./data/test_dataset.csv")
test_dataset.head()

Unnamed: 0,joke,label,len_joke,标签,pinyin
0,车上遇见一对双胞胎姑娘。穿的那个清凉啊，笑的那个甜啊……忍不住要去搭讪。 为不表现的轻浮和唐...,2,83,强幽默,che1 shang4 yu4 jian4 yi1 dui4 shuang1 bao1 ta...
1,图书馆自习室每个桌面上都摆满了占座的书，很是让其他人气愤。 某生不占座没有固定的座位，一日去...,0,192,弱幽默,tu2 shu1 guan3 zi4 xi2 shi4 mei3 ge4 zhuo1 mia...
2,一个报童在大街上高声叫卖：骇人听闻的诈骗案，受害者多达82人！ 某行人连忙上前买一份。可是，...,2,102,强幽默,yi1 ge4 bao4 tong2 zai4 da4 jie1 shang4 gao1 s...
3,法官对被告说：你不但偷钱，还拿了表，戒指和珍珠。 被告说：是的，法官先生，人们不是常说‘光有...,2,56,强幽默,fa3 guan1 dui4 bei4 gao4 shuo1 ： ni3 bu4 dan4 ...
4,大人，原告在法庭上说，这个人同我一起生活了几天，答应同我结婚，可是后来他同别的女人结了婚。他...,1,140,一般幽默,da4 ren2 ， yuan2 gao4 zai4 fa3 ting2 shang4 sh...


In [7]:
content_list = test_dataset["joke"].to_list()

# 3. 生成回答

In [8]:
def generate_response(system_content, content):
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": content},
    ]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=256)
    generated_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

In [9]:
responses = []
max_retries = 3
system_content = "你是一位语言学家，请你判断以下文本的幽默程度，输出分类只能是'弱幽默','一般幽默','强幽默'三者之一，不能包含其它符号。"

In [10]:
def detect_humor_level(text):
    humor_levels = ["弱幽默", "一般幽默", "强幽默"]
    detected_levels = []

    for level in humor_levels:
        if level in text:
            detected_levels.append(level)

    if len(detected_levels) == 1:
        return detected_levels[0]
    elif len(detected_levels) > 1:
        return "警告：同时拥有两种或以上幽默水平"
    else:
        return "警告：未检测到幽默水平"


# 示例用法
text = "这个笑话真的很弱幽默，但又有一些一般幽默的味道。"
print(detect_humor_level(text))

警告：同时拥有两种或以上幽默水平


In [None]:
for i in trange(len(content_list)):
    retries = 0
    while retries < max_retries:
        response = generate_response(system_content, content_list[i])
        if response in ["弱幽默", "一般幽默", "强幽默"]:
            responses.append(response)
            break
        else:
            response_tmp = detect_humor_level(response)
            if response_tmp in ["弱幽默", "一般幽默", "强幽默"]:
                responses.append(response_tmp)
                break
            else:
                retries += 1
    else:
        print(f"回答{i}生成错误次数超过上限！")

In [12]:
test_dataset["预测标签"] = responses
test_dataset["预测标签"].value_counts()

预测标签
强幽默     2324
弱幽默      732
一般幽默      74
Name: count, dtype: int64

In [13]:
test_dataset["pred_label"] = test_dataset["预测标签"].replace(
    {"弱幽默": 0, "一般幽默": 1, "强幽默": 2}
)

  test_dataset["pred_label"] = test_dataset["预测标签"].replace(


In [14]:
from sklearn import metrics


def evaluate(targets, outputs):
    conf = metrics.confusion_matrix(targets, outputs)
    data = pd.DataFrame(
        conf,
        columns=["weak humor", "general humor", "strong humor"],
        index=["weak humor", "general humor", "strong humor"],
    )
    plt.figure(figsize=(6, 5))
    sns.heatmap(data, cmap=plt.cm.Blues, annot=True, fmt="d")
    plt.xlabel("Predict label")
    plt.ylabel("True label")
    plt.title("Confusion Matrix", fontsize=16)
    plt.show()
    print("Classification Report")
    print(metrics.classification_report(targets, outputs))

In [None]:
evaluate(test_dataset["label"], test_dataset["pred_label"])

# 4. 保存回答

In [17]:
data = pd.read_csv("./data/prediction.csv")
data["zero-shot"] = test_dataset["pred_label"]
data.to_csv("./data/prediction.csv", index=False)

In [90]:
test_dataset[["label", "预测标签", "pred_label"]].to_csv(
    "./data/test_prediction_qwen_zeroshot.csv", index=False
)