In [10]:
#Imports
import os  # operating-system / filesystem path helpers
import re  # regular expressions
from functools import lru_cache  # memoisation of per-word stemming

import nltk  # natural language toolkit
import pandas as pd  # data handling
from joblib import dump  # model persistence
from nltk.stem.porter import PorterStemmer  # word stemmer
from sklearn.feature_extraction.text import TfidfVectorizer  # text vectoriser
from sklearn.metrics import accuracy_score, classification_report  # accuracy, classification report
from sklearn.model_selection import train_test_split  # train/test splitter
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier

## 数据读取和了解

In [11]:
# Locate the dataset at <working directory>/../data/news.csv
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', 'data')
target_file_path = os.path.join(data_dir, 'news.csv')

# Load the CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer=target_file_path)

In [12]:
# Report the dataset dimensions as (number of rows, number of columns)
print((len(dataset.index), len(dataset.columns)))

(6335, 4)


In [13]:
# Preview the first five rows (the default for head) to get a feel for the columns
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [14]:
# Count the missing values in each column
dataset.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

## 数据和文本处理

In [15]:
# Encode the label column as integers: FAKE -> 0, REAL -> 1.
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on already-encoded labels would wipe the whole column.
# Mapping only while the column is still non-numeric keeps the cell safe to re-run.
label_to_id = {'FAKE': 0, 'REAL': 1}
if not pd.api.types.is_numeric_dtype(dataset['label']):
    dataset['label'] = dataset['label'].map(label_to_id)

# Fail here, with a clear message, rather than later inside the split or the model fit.
if dataset['label'].isna().any():
    raise ValueError("label column contains values other than 'FAKE' / 'REAL'")

In [16]:
# Join each headline and its article body, separated by a single space
dataset['combined_text'] = dataset['title'].str.cat(dataset['text'], sep=' ')

In [17]:
# Shared Porter stemmer instance used by stemming()
stemmer = PorterStemmer()

# Matches every character that is not an ASCII letter; compiled once instead of
# being looked up again on every call.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')

# Memoised version of stemmer.stem. The same words recur many times across the
# documents, so each distinct word is stemmed once and then served from the cache.
_stem_word = lru_cache(maxsize=None)(stemmer.stem)


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    Steps: replace every non-letter character with a space, lower-case the
    result, split it on whitespace, stem each word with the Porter stemmer and
    join the stems back together with single spaces.

    Args:
        content: The text to normalise. Must be a ``str``; any other type makes
            the regular-expression substitution raise ``TypeError``.

    Returns:
        The stemmed text as a single string (empty if ``content`` contains no
        ASCII letters).
    """
    words = _NON_LETTER_RE.sub(' ', content).lower().split()
    return ' '.join(_stem_word(word) for word in words)

# Run stemming() over every document and store the result back in the same column
dataset['combined_text'] = dataset['combined_text'].map(stemming)

In [18]:
# 使用 TF-IDF 将文本转换为向量
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7) # 去除停用词，忽略出现频率高于 0.7 的词
x = tfidf.fit_transform(dataset['combined_text']) # 将文本转换为向量保存作为特征
y = dataset['label'] # 将标签保存作为目标

In [19]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

## 训练模型并进行评估

In [20]:
# Create a multinomial naive Bayes classifier and fit it on the training split
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [21]:
# Predict labels for the held-out test split
y_pred = model.predict(X=X_test)

In [22]:
# Score the predictions against the true test labels, then print both metrics
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("准确率:", test_accuracy)
print("\n分类报告:\n", test_report)

准确率: 0.8413575374901342

分类报告:
               precision    recall  f1-score   support

           0       0.97      0.70      0.82       633
           1       0.77      0.98      0.86       634

    accuracy                           0.84      1267
   macro avg       0.87      0.84      0.84      1267
weighted avg       0.87      0.84      0.84      1267



In [23]:
# Show the hyperparameters of the classifier that was actually trained, rather than
# those of a freshly constructed default instance, so this stays accurate if
# arguments are ever passed to MultinomialNB(...) in the training cell.
model.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': True}

In [24]:
# Persist the trained classifier together with the fitted TF-IDF vectoriser.
# The classifier was trained on the vectoriser's output, so it cannot score new
# raw text on its own: whoever loads the model also needs the same fitted
# vectoriser to turn text into matching feature columns.
current_dir = os.getcwd()
models_dir = os.path.join(current_dir, '..', 'models')
os.makedirs(models_dir, exist_ok=True)  # dump() does not create missing directories

vectorizer_filename = os.path.join(models_dir, 'TfidfVectorizer.joblib')
dump(tfidf, vectorizer_filename)

# Saved last so the cell output is still the list of classifier file names
filename = os.path.join(models_dir, 'MultinomialNB.joblib')
dump(model, filename)

['d:\\OneDrive\\Workspace\\News-Judgement\\notebooks\\..\\models\\MultinomialNB.joblib']