# 简单二分+tfidf，中文模型

In [1]:
# 0. 导入库
import json
import os
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


# 1. 读取数据集路径
human_path = "face2_zh_json/human/zh_unicode"
llm_path = "face2_zh_json/generated/zh_qwen2"

human_files = [os.path.join(human_path, f) for f in os.listdir(human_path)]
llm_files = [os.path.join(llm_path, f) for f in os.listdir(llm_path)]

print("✅ 人工文件:", human_files[:3])
print("✅ 千问文件:", llm_files[:3])

✅ 人工文件: ['face2_zh_json/human/zh_unicode\\news-zh.json', 'face2_zh_json/human/zh_unicode\\webnovel.json', 'face2_zh_json/human/zh_unicode\\wiki-zh.json']
✅ 千问文件: ['face2_zh_json/generated/zh_qwen2\\news-zh.qwen2-72b-base.json', 'face2_zh_json/generated/zh_qwen2\\webnovel.qwen2-72b-base.json', 'face2_zh_json/generated/zh_qwen2\\wiki-zh.qwen2-72b-base.json']


In [2]:
# 2. 加载人工数据
def load_human_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    df["label"] = 0  # 人工标记为0
    return df[["output", "label"]]

human_df = pd.concat([load_human_data(f) for f in human_files], ignore_index=True)
print("👀 人工数据样例：")
print(human_df.head(3))

👀 人工数据样例：
                                              output  label
0  补贴后，变成了以积分、抽奖等形式为主的“暗补”。一组公开的数据显示，停补后的“滴滴打车”日均...      0
1  培训、投融资等方面有着巨大的合作空间，为积极推动我省与丹麦的友好交流，促进双边经贸投资合作，...      0
2                               环结你喜欢哪种呢？觉得不错，请点赞↓↓↓      0


In [3]:
# 3. 加载千问数据
def load_llm_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    outputs = [{"output": v, "label": 1} for k, v in data["output"].items()]
    return pd.DataFrame(outputs)

llm_df = pd.concat([load_llm_data(f) for f in llm_files], ignore_index=True)
print("🤖 千问生成数据样例：")
print(llm_df.head(3))

🤖 千问生成数据样例：
                                              output  label
0  补贴主要针对司机端。\n记者昨日从快的打车获悉，针对司机端的补贴将在今天正式实施：早上7点至...      1
1  合作以及教育交流等方面有着广阔的合作空间，5月20日，省委统战部、省环保厅和省外事办在贵阳联...      1
2  环绕希望对大家有用！\n这9个基本功，99%的家长不会教孩子！\n这9个基本功，99%的家长...      1


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# 1. 载入数据
all_df = pd.concat([human_df, llm_df], ignore_index=True)
dataset = Dataset.from_pandas(all_df.rename(columns={"output": "text"}))

In [7]:
model_name = "bert-base-chinese"
#model_name = "hfl/chinese-bert-wwm-ext"         # 比较大
# 2. 数据预处理：分词与向量化
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# 使用map方法进行批处理分词
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [8]:
# 3. 切分训练集和验证集
train_dataset, eval_dataset = dataset.train_test_split(test_size=0.1).values()

# 4. 设置模型和训练参数
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# 5. 设置 Trainer 参数
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # 修改这里
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    do_train=True,
    do_eval=True,
    report_to="none",  # 关闭日志报告（如 TensorBoard）
    dataloader_num_workers=4,  # 多线程加载数据，提高数据加载效率
    fp16=False,  # CPU 禁用混合精度
    save_total_limit=2,  # 最多保存 2 个 checkpoint
    load_best_model_at_end=True,  # 加载最优模型
    metric_for_best_model="eval_loss",  # 根据评估 loss 判断最优模型
    greater_is_better=False
)
# 6. Trainer 训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [11]:
# 训练模型
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5027,0.544023
2,0.4059,1.017501
3,0.2172,1.438685




TrainOutput(global_step=10125, training_loss=0.37735411377895023, metrics={'train_runtime': 27680.4464, 'train_samples_per_second': 2.926, 'train_steps_per_second': 0.366, 'total_flos': 5334839758479360.0, 'train_loss': 0.37735411377895023, 'epoch': 3.0})

In [12]:
# ✅ 保存最终模型和 tokenizer
save_path = "./saved_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ 模型已保存到 {save_path}")

# 9. 评估模型
results = trainer.evaluate()
print("评估结果：")
print(results)

# 10. 预测并输出分类报告
predictions = trainer.predict(eval_dataset)
pred_labels = predictions.predictions.argmax(axis=-1)
from sklearn.metrics import classification_report, roc_auc_score
print("分类报告：")
print(classification_report(eval_dataset["label"], pred_labels))

# 计算 AUC
roc_auc = roc_auc_score(eval_dataset["label"], predictions.predictions[:, 1])
print(f"AUC 分数: {roc_auc:.4f}")

✅ 模型已保存到 ./saved_model




评估结果：
{'eval_loss': 0.5440232157707214, 'eval_runtime': 255.0984, 'eval_samples_per_second': 11.76, 'eval_steps_per_second': 1.47, 'epoch': 3.0}
分类报告：
              precision    recall  f1-score   support

           0       0.88      0.50      0.64      1534
           1       0.64      0.93      0.76      1466

    accuracy                           0.71      3000
   macro avg       0.76      0.71      0.70      3000
weighted avg       0.76      0.71      0.70      3000

AUC 分数: 0.8447


In [1]:
import os
import pandas as pd
import random
from sklearn.metrics import classification_report, roc_auc_score
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
from datasets import Dataset
import torch

# 2. Load all .txt files
def load_texts_from_folder(folder_path):
    texts = []
    file_list = [f for f in os.listdir(folder_path) if f.endswith(".txt")]  # Only keep .txt files
    file_list = sorted(file_list, key=lambda x: int(x.split(".")[0]))  # Sort by the numeric part of filename
    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            texts.append(f.read())
    return texts

human_texts = load_texts_from_folder("ghostbuster-data/essay/human")
gpt_texts = load_texts_from_folder("ghostbuster-data/essay/gpt")

# Show a few samples
print("Sample human text:\n", human_texts[999][:500])
print("\nSample GPT text:\n", gpt_texts[0][:500])

Sample human text:
 Supervision is a process of knowledge exchange, social experience, and psychological support received by trainees in work, career, and professional development. It includes informal communication, usually between two people, over a long period, between an employee who has a large amount of relevant knowledge, wisdom, or experience, and an employee or student who has these qualities to a lesser extent. In this regard, the supervisors must have particular traits and specific demeanor to succeed in

Sample GPT text:
 Introduction:
The film "12 Years a Slave" serves as a poignant depiction of the harrowing realities of slavery, shedding light on themes of collectivism and individualism. Through its exploration of these themes, the movie effectively portrays slavery as a widespread issue with far-reaching consequences. Moreover, it vividly portrays instances of prejudice, generalizations, stereotyping, and discrimination against black people, showcasing their profound im

In [2]:
# 2. 构建 DataFrame
texts = human_texts + gpt_texts
labels = [0] * len(human_texts) + [1] * len(gpt_texts)  # 0: Human, 1: GPT
combined = list(zip(texts, labels))
random.shuffle(combined)
texts, labels = zip(*combined)
df = pd.DataFrame({"text": texts, "label": labels})

In [3]:
# 3. 转换为 HuggingFace Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [13]:

# 4. 加载 tokenizer 和模型
model_name = "bert-base-uncased"  # 可替换为更强的如 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/27000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [5]:
# 5. 加载模型
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# 6. 设置 TrainingArguments（自动保存模型）
training_args = TrainingArguments(
    output_dir="./bert-eng-results",
    eval_strategy="epoch",
    save_strategy="epoch",  # 自动保存最佳模型
    save_total_limit=2,     # 最多保留2个模型快照
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./bert-eng-logs",
    logging_steps=50,
    do_train=True,
    do_eval=True,
    report_to="none",
    dataloader_num_workers=4,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # 根据最小loss保存最佳模型
    greater_is_better=False,
)

# 7. 定义 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [7]:
# 8. 训练模型
trainer.train()

# 9. 评估模型
results = trainer.evaluate()
print("评估结果：")
print(results)

# 10. 预测与分类报告
predictions = trainer.predict(eval_dataset)
pred_labels = predictions.predictions.argmax(axis=-1)
print("分类报告：")
print(classification_report(eval_dataset["label"], pred_labels))
roc_auc = roc_auc_score(eval_dataset["label"], predictions.predictions[:, 1])
print(f"AUC 分数: {roc_auc:.4f}")



Epoch,Training Loss,Validation Loss
1,0.0911,0.013253
2,0.0006,0.054682
3,0.0003,0.037735




评估结果：
{'eval_loss': 0.013252943754196167, 'eval_runtime': 162.4564, 'eval_samples_per_second': 2.462, 'eval_steps_per_second': 0.308, 'epoch': 3.0}
分类报告：
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      0.99      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

AUC 分数: 1.0000
