# 简单二分+tfidf，中文模型

In [39]:
# 0. 导入库
import json
import os
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# 1. 读取数据集路径
human_path = "face2_zh_json/human/zh_unicode"
llm_path = "face2_zh_json/generated/zh_qwen2"

human_files = [os.path.join(human_path, f) for f in os.listdir(human_path)]
llm_files = [os.path.join(llm_path, f) for f in os.listdir(llm_path)]

print("✅ 人工文件:", human_files[:3])
print("✅ 千问文件:", llm_files[:3])

✅ 人工文件: ['face2_zh_json/human/zh_unicode\\news-zh.json', 'face2_zh_json/human/zh_unicode\\webnovel.json', 'face2_zh_json/human/zh_unicode\\wiki-zh.json']
✅ 千问文件: ['face2_zh_json/generated/zh_qwen2\\news-zh.qwen2-72b-base.json', 'face2_zh_json/generated/zh_qwen2\\webnovel.qwen2-72b-base.json', 'face2_zh_json/generated/zh_qwen2\\wiki-zh.qwen2-72b-base.json']


In [40]:
# 2. 加载人工数据
def load_human_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    df["label"] = 0  # 人工标记为0
    return df[["output", "label"]]

human_df = pd.concat([load_human_data(f) for f in human_files], ignore_index=True)
print("👀 人工数据样例：")
print(human_df.head(3))

👀 人工数据样例：
                                              output  label
0  补贴后，变成了以积分、抽奖等形式为主的“暗补”。一组公开的数据显示，停补后的“滴滴打车”日均...      0
1  培训、投融资等方面有着巨大的合作空间，为积极推动我省与丹麦的友好交流，促进双边经贸投资合作，...      0
2                               环结你喜欢哪种呢？觉得不错，请点赞↓↓↓      0


In [41]:
# 3. 加载千问数据
def load_llm_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    outputs = [{"output": v, "label": 1} for k, v in data["output"].items()]
    return pd.DataFrame(outputs)

llm_df = pd.concat([load_llm_data(f) for f in llm_files], ignore_index=True)
print("🤖 千问生成数据样例：")
print(llm_df.head(3))

🤖 千问生成数据样例：
                                              output  label
0  补贴主要针对司机端。\n记者昨日从快的打车获悉，针对司机端的补贴将在今天正式实施：早上7点至...      1
1  合作以及教育交流等方面有着广阔的合作空间，5月20日，省委统战部、省环保厅和省外事办在贵阳联...      1
2  环绕希望对大家有用！\n这9个基本功，99%的家长不会教孩子！\n这9个基本功，99%的家长...      1


In [42]:
# 4. 合并 & 打乱数据
all_data = pd.concat([human_df, llm_df], ignore_index=True).sample(frac=1, random_state=42)
print("📦 合并后数据样例：")
print(all_data.head(3))

📦 合并后数据样例：
                                                  output  label
2308   加盟店肯定能为您带来丰盛的财富。三只松鼠可以开实体店吗？ 三只松鼠零食店加盟品牌总部经心致力...      0
22404  牙雕、花板窗格、古玩字画八百万……\n将五千年的华夏文明，装进自己的博物馆！\n一个香港少年...      1
23397  内敛，心机深沉的大学生。是命运的安排，还是巧合，让他与前世的爱人擦肩而过，与神秘的女人有了剪...      1


In [43]:
# 5. 中文分词（用jieba）+ 合并成新文本列
def jieba_cut(text):
    return " ".join(jieba.cut(text))

all_data["cut_output"] = all_data["output"].apply(jieba_cut)
print("✂️ 分词后样例：")
print(all_data["cut_output"].head(3))

✂️ 分词后样例：
2308     加盟店 肯定 能 为 您 带来 丰盛 的 财富 。 三只 松鼠 可以 开 实体店 吗 ？  ...
22404    牙雕 、 花板 窗格 、 古玩 字画 八百万 … … \n 将 五千年 的 华夏 文明 ， ...
23397    内敛 ， 心机 深沉 的 大学生 。 是 命运 的 安排 ， 还是 巧合 ， 让 他 与 前...
Name: cut_output, dtype: object


In [44]:
# 6. 特征提取（TF-IDF）
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(all_data["cut_output"])
y = all_data["label"]

print("🎯 向量维度：", X.shape)

🎯 向量维度： (30000, 5000)


In [45]:
# 7. 训练逻辑回归模型
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [46]:
# 8. 模型评估
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

print("📊 分类报告：")
print(classification_report(y_test, y_pred))

print("📈 AUC: ", roc_auc_score(y_test, y_prob))

📊 分类报告：
              precision    recall  f1-score   support

           0       0.68      0.61      0.64      3043
           1       0.64      0.71      0.67      2957

    accuracy                           0.66      6000
   macro avg       0.66      0.66      0.65      6000
weighted avg       0.66      0.66      0.65      6000

📈 AUC:  0.7678437492324811


In [47]:
# 9. 保存 TF-IDF 和 模型
import joblib

model_dir = "saved_model"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(vectorizer, os.path.join(model_dir, "tfidf_vectorizer1.joblib"))
joblib.dump(clf, os.path.join(model_dir, "logistic_model1.joblib"))

print("✅ 模型和向量器已保存！")

✅ 模型和向量器已保存！


In [48]:
# 10. 加载模型并预测新文本是否为 AI 生成
def predict_text_is_ai(text):
    # 加载模型和向量器
    vectorizer = joblib.load(os.path.join(model_dir, "tfidf_vectorizer1.joblib"))
    clf = joblib.load(os.path.join(model_dir, "logistic_model1.joblib"))

    # 分词 + 向量化
    cut = " ".join(jieba.cut(text))
    X_new = vectorizer.transform([cut])

    # 预测
    prob = clf.predict_proba(X_new)[0][1]
    label = clf.predict(X_new)[0]
    
    result = "🤖 AI生成（label=1）" if label == 1 else "🧑 人工写作（label=0）"
    print(f"🔍 预测结果：{result}（置信度={prob:.4f}）")

# 示例文本
test_text = "韦里安纳大学获得一个通讯和新闻系学士学位，并于哈佛大学约翰·F·甘迺迪政府学院获得公共管理系硕士学位。她在那时候亦是一位梅森学者。\n\n卡瓦列罗于16岁时已经开始其记者生涯。当时，她开始为波哥大报纸《共和国报》（La República）工作。卡瓦列罗指出，她作为一个在「大男子主义」社会中工作的女记者，起初她经常获分配较难的工作。但是，她指出自己渐渐懂得享受记者的工作，并开始变成一个工作狂。\n\n之后，她转往哥伦比亚最大的报纸《时间》（El Tiempo）担任调查编辑一职，并于新闻杂志《改变》（Cambio）兼职。从1998年到2001年，她转往周刊《一周》（Semana）担任调查主任一职。 她写的文章亦曾出现于《纽约时报》、《新闻周刊》、CNN、《波士顿环球报》、《迈阿密先驱报》、以及《国际先驱论坛报》。《哥伦比亚新闻评论》和《尼曼报告》亦曾刊登其文章。卡瓦列罗指出至少有一打哥伦比亚政客因其报道而入狱。\n\n于1997年，她获允许访问哥伦比亚右派准军事部队领袖之一卡洛斯·卡斯塔诺·吉尔。卡瓦列罗乘坐了8小时的马方能到达卡斯塔诺的藏身之地。在采访中，卡斯塔诺第一次表示自己有意进行和谈。 \n\n于1999年，卡瓦列罗多次收到死亡威胁。而且，当时亦有一位持枪的保安人员在其居所外等候她。在哈佛大学约翰·F·甘迺迪政府学院院长约瑟夫·奈尔的邀请下，她离开了哥伦比亚，并移民至美国麻萨诸塞州剑桥。\n\n于1993年，卡瓦列罗获《时代》华盛顿特区分部承认为阿尔弗雷德友好出版社资深会员。于1997年，她获哈佛大学承认为尼曼资深会员。于1991年，她因报道政府腐败而获颁哥伦比亚西蒙玻利瓦尔国家新闻奖。于1998年，她因访问卡斯塔诺而再次获颁此奖项。\n\n于1990年，美洲报业协会颁予卡瓦列罗人权奖。于1999年11月，她获保护记者委员会颁发国际新闻自由奖。\n"
predict_text_is_ai(test_text)

🔍 预测结果：🧑 人工写作（label=0）（置信度=0.3283）


# English

In [57]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import random

# 2. Load all .txt files
def load_texts_from_folder(folder_path):
    texts = []
    file_list = [f for f in os.listdir(folder_path) if f.endswith(".txt")]  # Only keep .txt files
    file_list = sorted(file_list, key=lambda x: int(x.split(".")[0]))  # Sort by the numeric part of filename
    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            texts.append(f.read())
    return texts

human_texts = load_texts_from_folder("ghostbuster-data/essay/human")
gpt_texts = load_texts_from_folder("ghostbuster-data/essay/gpt")

# Show a few samples
print("Sample human text:\n", human_texts[999][:500])
print("\nSample GPT text:\n", gpt_texts[0][:500])

Sample human text:
 Supervision is a process of knowledge exchange, social experience, and psychological support received by trainees in work, career, and professional development. It includes informal communication, usually between two people, over a long period, between an employee who has a large amount of relevant knowledge, wisdom, or experience, and an employee or student who has these qualities to a lesser extent. In this regard, the supervisors must have particular traits and specific demeanor to succeed in

Sample GPT text:
 Introduction:
The film "12 Years a Slave" serves as a poignant depiction of the harrowing realities of slavery, shedding light on themes of collectivism and individualism. Through its exploration of these themes, the movie effectively portrays slavery as a widespread issue with far-reaching consequences. Moreover, it vividly portrays instances of prejudice, generalizations, stereotyping, and discrimination against black people, showcasing their profound im

In [58]:
# 3. Build Dataset
texts = human_texts + gpt_texts
labels = ["Human"] * len(human_texts) + ["GPT"] * len(gpt_texts)
combined = list(zip(texts, labels))
random.shuffle(combined)
texts, labels = zip(*combined)


df = pd.DataFrame({"text": texts, "label": labels})
print("\nDataFrame sample:")
print(df.sample(5))


DataFrame sample:
                                                   text  label
1297  Introduction\nEuthanasia, the intentional act ...    GPT
765   Defining a historical context of juvenile deli...  Human
243   The field of nursing has evolved significantly...    GPT
656   Harlem Renaissance was a period in America bet...  Human
1971  Introduction\nThe design of a Sterile Processi...    GPT


In [59]:
# 4. Split Train/Test (Stratified & Shuffled)
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# 5. TF-IDF Vectorization (fit only on training)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,1))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [60]:
# 6. Train Classifier
clf = LogisticRegression(max_iter=1000, solver='liblinear')
clf.fit(X_train_tfidf, y_train)

# 7. Evaluate
y_pred = clf.predict(X_test_tfidf)
print("Train accuracy:", clf.score(X_train_tfidf, y_train))  # 如果远高于测试集，说明过拟合
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Human", "GPT"]))

Train accuracy: 0.995625

Classification Report:
              precision    recall  f1-score   support

       Human       0.96      0.96      0.96       200
         GPT       0.96      0.96      0.96       200

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400



In [62]:
import joblib
import os

model_dir = "saved_model_en"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(vectorizer, os.path.join(model_dir, "tfidf_vectorizer_en.joblib"))
joblib.dump(clf, os.path.join(model_dir, "logistic_model_en.joblib"))

print("✅ Model and vectorizer saved!")

✅ Model and vectorizer saved!


In [64]:
def predict_text_is_ai_en(text):
    import joblib
    import os

    # Load model and vectorizer
    model_dir = "saved_model_en"
    vectorizer = joblib.load(os.path.join(model_dir, "tfidf_vectorizer_en.joblib"))
    clf = joblib.load(os.path.join(model_dir, "logistic_model_en.joblib"))

    # Preprocess (lowercase and simple cleanup)
    text_clean = text.lower()

    # Vectorize
    X_new = vectorizer.transform([text_clean])

    # Predict
    prob = clf.predict_proba(X_new)[0][1]
    label = clf.predict(X_new)[0]

    result = "🤖 AI-Generated (label=1)" if label == "GPT" else "🧑 Human-Written (label=0)"
    print(f"🔍 Prediction: {result} (Confidence={prob:.4f})")
    
test_text = """
The world has been shattered by Russia’s attack on Ukraine in the past few days. Ukraine’s forces have put up remarkable resistance to this attack, but the war is entering an even bloodier phase. Europe could be witnessing the first major military invasion in years. Many people have been displaced, and many Ukrainian citizens have fled neighboring countries for peace. Gunshots and artillery fires have been raining down on residential areas in Ukraine, with several media houses and newspapers reporting the events and sharing the news with the rest of the world.
Various newspapers have reported the news differently, with the difference seen in headlines, videos and images posted. The two newspapers covering the events in Ukraine are the New York Times and Le Monde, a French newspaper. While New York Times and Le Monde report the same event, the two use very different terms to describe similar activities. In the recent takeover of the city of Kharkiv by the Russian forces, for example, the NYT headline read “Russian Troops Take Over Kharkiv,” Le Monde , on the other hand, read “La Russie A Envahi Kharkiv” to mean “Russia Has Invaded Kharkiv.” Recently a video of Ukrainian citizens making Molotov cocktail Petrol bombs was shared. The headline in NYT read, “Ukraine Prepare Molotov Cocktail In Kyiv.” Le Monde ‘s headline read “Ukrainian Resistance Using Molotov Bombs.” Because headlines are the deciding factor for people to read an article, newspapers make them enticing. The New York Times uses headlines that suggest this is another world war, while Le Monde downplays the illegality of what Russia is doing. The differences seen in the headlines of the two newspapers are all for the attraction of their audiences; each magazine writes eye-catching unique topics.
"""

predict_text_is_ai_en(test_text)


🔍 Prediction: 🧑 Human-Written (label=0) (Confidence=0.7602)
