In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import os


def show_info(data):
    """Print the maximum, minimum and mean length of the reviews in ``data``.

    Args:
        data: DataFrame with a ``review`` column whose values support ``len``.
            The column is looked up by name, so the order and number of the
            other columns do not matter.

    Returns:
        None. The statistics are printed, followed by a separator line. When
        ``data`` has no rows a short notice is printed instead of raising.
    """
    # Select the column by name instead of unpacking each row positionally:
    # positional unpacking silently measured the wrong column whenever the
    # frame was not laid out exactly as (label, review).
    review_sizes = [len(review) for review in data['review']]

    # max()/min() and the mean are undefined for an empty sequence.
    if not review_sizes:
        print('数据为空, 无法统计评论长度')
        print('-' * 50)
        return

    print('最大长度:', max(review_sizes))
    print('最小长度:', min(review_sizes))
    print('平均长度:', int(sum(review_sizes) / len(review_sizes)))
    print('-' * 50)


def demo():
    """Build the training and test pickles used by the rest of the notebook.

    Steps:
        1. Read the Weibo sentiment CSV and map the numeric label to
           '好评' (label == 1) or '差评' (anything else).
        2. Keep only reviews whose length is strictly between 10 and 300.
        3. Split 80/20, stratified by label, with a fixed random state.
        4. Sample up to 4000 training and 1000 test rows.
        5. Write both subsets as lists of record dicts to
           ``./weibo_senti_100k/``.

    Returns:
        None. Progress information is printed and two pickle files are written.
    """
    # Alternative dataset:
    # data = pd.read_csv('ChnSentiCorp_htl_8k/ChnSentiCorp_htl_8k.csv')
    data = pd.read_csv('hf://datasets/dirtycomputer/weibo_senti_100k/weibo_senti_100k.csv')
    data['label'] = np.where(data['label'] == 1, '好评', '差评')

    print('数据标签分布:', Counter(data['label']))
    print('-' * 50)

    # Drop reviews that are too short or too long. Non-string cells (for
    # example NaN from an empty CSV field) are dropped as well instead of
    # raising TypeError inside len().
    data = data[data['review'].apply(lambda x: isinstance(x, str) and 10 < len(x) < 300)]
    show_info(data)

    # Stratified split of the filtered data.
    train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

    print('原始训练集数量:', train_data.shape)
    print('原始测试集数量:', test_data.shape)
    print('-' * 50)

    # Down-sample both splits. The request is capped at the number of
    # available rows because DataFrame.sample raises when asked for more rows
    # than exist (without replacement).
    sample_num = 5000
    train_size = min(int(sample_num * 0.8), len(train_data))
    test_size = min(int(sample_num * 0.2), len(test_data))
    train_data = train_data.sample(train_size, random_state=42)
    test_data = test_data.sample(test_size, random_state=52)

    print('最终训练集数量:', train_data.shape)
    print('最终测试集数量:', test_data.shape)

    # Convert each split to a list of {column: value} dicts.
    train_data = train_data.to_dict(orient='records')
    test_data = test_data.to_dict(orient='records')

    # Create the output directory if it doesn't exist.
    os.makedirs('./weibo_senti_100k', exist_ok=True)

    # Persist locally; the context managers guarantee the files are flushed
    # and closed even if pickling fails.
    with open('./weibo_senti_100k/01-训练集.pkl', 'wb') as train_file:
        pickle.dump(train_data, train_file)
    with open('./weibo_senti_100k/02-测试集.pkl', 'wb') as test_file:
        pickle.dump(test_data, test_file)


# Entry point: run the data-preparation step when executed as the main module.
if __name__ == '__main__':
    demo()

In [None]:
import torch
import time
import pickle
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    Qwen2Tokenizer,
    Qwen2ForCausalLM,
)
# Ask PyTorch to release cached CUDA memory before the model is loaded.
torch.cuda.empty_cache()

# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_dataset(tokenizer):
    """Load the pickled training records and render each as a chat sample.

    Every record becomes a three-turn conversation (system instruction, the
    review as the user turn, the label as the assistant turn) which is passed
    through ``tokenizer.apply_chat_template`` with ``tokenize=True``.

    Args:
        tokenizer: Object exposing ``apply_chat_template(message,
            add_generation_prompt=..., tokenize=...)``.

    Returns:
        list: One ``apply_chat_template`` result per training record, in file
        order.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # that were produced locally by the data-preparation step.
    # The context manager closes the file handle deterministically.
    with open('./weibo_senti_100k/01-训练集.pkl', 'rb') as train_file:
        comm_data = pickle.load(train_file)

    # The system instruction is identical for every sample, so build it once.
    system_prompt = (
        '你是一个专业的情感分类助手。你的任务是对输入的文本进行情感分析，'
        '判断其情感倾向并输出 "好评" 或 "差评" 两个词之一，不要输出任何其他额外的信息或解释。'
    )

    result_data = []
    for data in comm_data:
        message = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': data['review']},
            {'role': 'assistant', 'content': data['label']}
        ]
        # No generation prompt: the assistant answer is already part of the
        # conversation, so the template must not open a new assistant turn.
        inputs = tokenizer.apply_chat_template(
            message,
            add_generation_prompt=False,
            tokenize=True
        )
        result_data.append(inputs)
    return result_data

def demo():
    """Fine-tune Qwen2.5-0.5B-Instruct on the sentiment data and report cost.

    Loads the model and tokenizer, builds the tokenized training set with
    ``get_dataset``, trains with the Hugging Face ``Trainer`` and finally
    prints the wall-clock training time and, on CUDA, the peak allocated GPU
    memory.

    Returns:
        None. Checkpoints are written under ``Qwen2.5-0.5B-Instruct-SFT``.
    """
    # 1. Load the model and the tokenizer.
    estimator: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained(
        'Qwen/Qwen2.5-0.5B-Instruct'
    ).to(device)
    tokenizer: Qwen2Tokenizer = AutoTokenizer.from_pretrained(
        'Qwen/Qwen2.5-0.5B-Instruct'
    )

    # fp16 mixed precision needs a CUDA device. This cell falls back to the
    # CPU when CUDA is missing, so the flag must follow the same check instead
    # of being hard-coded to True.
    use_fp16 = torch.cuda.is_available()

    # 2. Build the TrainingArguments: log every 100 steps.
    arguments = TrainingArguments(
        output_dir='Qwen2.5-0.5B-Instruct-SFT',
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=5,
        learning_rate=2e-5,
        optim='adamw_torch',
        eval_strategy='no',
        save_strategy='epoch',
        save_total_limit=5,
        load_best_model_at_end=False,
        fp16=use_fp16,

        # Logging: record once every 100 steps.
        logging_strategy='steps',
        logging_steps=100,
        logging_dir='./logs',
    )

    # 3. Prepare the data and the Trainer. mlm=False selects the causal
    #    language-modelling collator.
    train_data = get_dataset(tokenizer)
    trainer = Trainer(
        model=estimator,
        train_dataset=train_data,
        args=arguments,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )

    # 4. Reset the peak-memory counter and start the timer.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats(device)
    start_time = time.time()

    # 5. Train.
    trainer.train()

    # 6. Report elapsed time and peak GPU memory (converted from bytes to GiB).
    elapsed = time.time() - start_time
    print(f"训练总时长: {elapsed:.2f} 秒")
    if torch.cuda.is_available():
        peak_mem = torch.cuda.max_memory_allocated(device) / (1024 ** 3)
        print(f"显存峰值使用: {peak_mem:.2f} GB")

# Entry point: clear cached CUDA memory, then run the fine-tuning step.
if __name__ == '__main__':
    torch.cuda.empty_cache()
    demo()


In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Qwen2Tokenizer
from transformers import Qwen2ForCausalLM

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def demo():
    """Interactively classify comments typed by the user.

    Loads the fine-tuned checkpoint, then repeatedly reads a comment from
    standard input, asks the model for a label and prints it. The loop ends
    cleanly on EOF or on a keyboard interrupt; blank input is skipped.

    Returns:
        None.
    """
    model_path = 'Qwen2.5-0.5B-Instruct-SFT/checkpoint-2500'
    estimator: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    tokenizer: Qwen2Tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')

    # NOTE(review): this wording differs from the system prompt used when the
    # fine-tuning samples are built elsewhere in this notebook; confirm the
    # mismatch is intentional, since the model was tuned on the other wording.
    system = '你是一个专业的情感分类专家，请对以下文本进行情感分类，并输出 "好评" 或 "差评" 两个词之一。'

    while True:
        # Without this guard the only way out of the loop is an unhandled
        # exception with a traceback.
        try:
            comment = input('请输入评论内容:')
        except (EOFError, KeyboardInterrupt):
            break

        # Nothing to classify.
        if not comment.strip():
            continue

        message = [{'role': 'system', 'content': system}, {'role': 'user', 'content': comment}]
        inputs = tokenizer.apply_chat_template(message,
                                               add_generation_prompt=True,
                                               tokenize=True,
                                               return_tensors='pt',
                                               return_dict=True).to(device)
        inputs_length = len(inputs['input_ids'][0])
        with torch.no_grad():
            # max_new_tokens bounds only the answer, so a long comment can no
            # longer use up the whole budget the way max_length (prompt +
            # answer) could. do_sample=False forces deterministic decoding so
            # the same comment always gets the same label, regardless of the
            # checkpoint's generation defaults.
            outputs = estimator.generate(**inputs, max_new_tokens=16, do_sample=False)
        output = outputs[0]
        # Decode only the tokens generated after the prompt.
        y_pred = tokenizer.decode(output[inputs_length:], skip_special_tokens=True).strip()
        print('预测标签:', y_pred)
        print('-' * 50)


# Entry point: start the interactive classification loop.
if __name__ == '__main__':
    demo()

In [None]:
import numpy as np
import pickle
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Qwen2Tokenizer
from transformers import Qwen2ForCausalLM

# Evaluate on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Ask PyTorch to release cached CUDA memory before the models are loaded.
torch.cuda.empty_cache()

def evaluate(model_path):
    """Measure label accuracy of a causal LM on the pickled test set.

    Each test review is wrapped in a chat prompt, the model generates a short
    answer, and the decoded answer is compared with the gold label.

    Args:
        model_path: Name or path passed to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        Fraction of correct predictions among the outputs that are exactly
        '好评' or '差评'. Outputs in any other form are counted in the progress
        bar description and excluded from the denominator. Returns 0.0 when no
        output is well-formed.

    NOTE(review): because malformed outputs are excluded, a model that often
    answers in the wrong format is scored on fewer samples; confirm this is
    the intended metric when comparing models.
    """
    # Load the model and the tokenizer. Left padding keeps every prompt
    # right-aligned so generated tokens directly follow the prompt.
    estimator: Qwen2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    tokenizer: Qwen2Tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct', padding_side='left')

    # Load the test set; the context manager closes the file handle.
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # that were produced locally by the data-preparation step.
    with open('weibo_senti_100k/02-测试集.pkl', 'rb') as test_file:
        test_data = pickle.load(test_file)

    # NOTE(review): this wording differs from the system prompt used when the
    # fine-tuning samples are built elsewhere in this notebook; confirm the
    # mismatch is intentional.
    system = '你是一个专业的情感分类专家，请对以下文本进行情感分类，并输出 "好评" 或 "差评" 两个词之一。'

    def collate_fn(batch_data):
        """Turn a list of records into padded model inputs plus gold labels."""
        inputs, labels = [], []
        for data in batch_data:
            message = [{'role': 'system', 'content': system}, {'role': 'user', 'content': data['review']}]
            inputs.append(message)
            labels.append(data['label'])

        inputs = tokenizer.apply_chat_template(inputs,
                                               add_generation_prompt=True,
                                               tokenize=True,
                                               return_tensors='pt',
                                               padding=True,
                                               return_dict=True)

        inputs = {k: v.to(device) for k, v in inputs.items()}
        return inputs, labels

    # Shuffling has no benefit for evaluation and changes batch composition
    # (and therefore padding) between runs, so keep the order fixed.
    dataloader = DataLoader(test_data, batch_size=8, shuffle=False, collate_fn=collate_fn)

    # Predict and score.
    true_labels, pred_labels, wrong = [], [], 0
    description = '评估-输出错误: %d'
    progress = tqdm(dataloader, desc=description % wrong)
    for inputs, labels in progress:
        with torch.no_grad():
            # max_new_tokens bounds only the answer (max_length would also
            # count the prompt). do_sample=False forces deterministic decoding
            # so repeated evaluations of the same model give the same score,
            # regardless of the checkpoint's generation defaults.
            outputs = estimator.generate(**inputs, max_new_tokens=16, do_sample=False)

        # All prompts in a batch share one padded length; everything after it
        # is newly generated text.
        prompt_length = inputs['input_ids'].shape[1]
        for output, y_true in zip(outputs, labels):
            y_pred = tokenizer.decode(output[prompt_length:], skip_special_tokens=True).strip()
            if y_pred not in ['好评', '差评']:
                wrong += 1
                progress.set_description(description % wrong)
                continue

            pred_labels.append(y_pred)
            true_labels.append(y_true)

    # Avoid a 0/0 division when no output was a valid label.
    if not true_labels:
        return 0.0

    return np.sum(np.array(true_labels) == np.array(pred_labels)) / len(true_labels)


def demo():
    """Print test accuracy for the base model, then for the fine-tuned one.

    Each model is evaluated and its result printed before the next model is
    loaded.
    """
    # (report template, model path) pairs, in the order they are evaluated.
    runs = (
        ('模型微调前: %.3f', 'Qwen/Qwen2.5-0.5B-Instruct'),
        ('模型微调后: %.3f', 'Qwen2.5-0.5B-Instruct-SFT/checkpoint-2500'),
    )
    for template, checkpoint in runs:
        print(template % evaluate(checkpoint))


# Entry point: compare accuracy before and after fine-tuning.
if __name__ == '__main__':
    demo()