In [20]:
#Imports
import pandas as pd # Pandas数据处理
from sklearn.model_selection import train_test_split, GridSearchCV # 训练集和测试集划分器，网格搜索交叉验证参数优化器
from sklearn.feature_extraction.text import TfidfVectorizer # 文本向量化器
from sklearn.naive_bayes import MultinomialNB # 多项式朴素贝叶斯分类器
from sklearn.pipeline import Pipeline # 管道
from sklearn.metrics import accuracy_score, classification_report  # 准确率，分类报告生成器
import nltk # 自然语言处理工具包
from nltk.stem.porter import PorterStemmer # 词干提取器
import re # 正则表达式
import pickle # 模型保存

## 数据读取和了解

In [2]:
# Load the news dataset from the CSV file in the working directory
dataset = pd.read_csv(filepath_or_buffer="news.csv")

In [3]:
# Show the dataset dimensions as (number of rows, number of columns)
print(f"({len(dataset)}, {len(dataset.columns)})")

(6335, 4)


In [4]:
# Preview the first five rows to get a feel for each column
dataset.iloc[:5]

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
# Count the missing values in every column
dataset.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

## 数据和文本处理

In [6]:
# Encode the string labels as integers: FAKE -> 0, REAL -> 1.
label_mapping = {'FAKE': 0, 'REAL': 1}

# Series.map() turns every value missing from the mapping into NaN, so running
# a bare map() a second time (or on data containing an unexpected label) would
# silently wipe out the target column. Skip the mapping when the column is
# already encoded, and fail loudly on anything that cannot be mapped.
if not dataset['label'].isin(list(label_mapping.values())).all():
    encoded_label = dataset['label'].map(label_mapping)
    if encoded_label.isna().any():
        unknown_labels = dataset.loc[encoded_label.isna(), 'label'].unique().tolist()
        raise ValueError(f"Unexpected values in 'label' column: {unknown_labels}")
    dataset['label'] = encoded_label.astype('int64')

In [7]:
# Merge headline and body into one text field, separated by a single space
dataset['combined_text'] = dataset['title'].str.cat(dataset['text'], sep=' ')

In [8]:
# Create the PorterStemmer instance that stemming() uses to reduce words to their stems
stemmer = PorterStemmer()

# Text normalisation + stemming helper
def stemming(content):
    """Normalise a raw text string and reduce every word to its stem.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, each token is passed through the
    module-level ``stemmer``, and the results are joined with single spaces.

    Args:
        content: The text to process (a ``str``).

    Returns:
        A single string of space-separated stemmed tokens.
    """
    # Digits, punctuation and non-ASCII letters all become separators
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens)

# Run the stemming helper over every document in the combined text column
dataset['combined_text'] = dataset['combined_text'].map(stemming)

In [10]:
# Two-step pipeline: TF-IDF vectorisation followed by a multinomial Naive Bayes classifier
pipeline = Pipeline(steps=[
    # max_df=0.7 is only a starting value: the grid below supplies tfidf__max_df for every candidate
    ('tfidf', TfidfVectorizer(max_df=0.7, stop_words='english')),
    ('clf', MultinomialNB()),
])

# Hyper-parameter grid; keys follow the <step name>__<parameter name> convention
parameters = dict(
    tfidf__max_df=(0.5, 0.75, 1.0),
    tfidf__max_features=(None, 5000, 10000, 50000),
    tfidf__ngram_range=((1, 1), (1, 2)),  # unigrams only, or unigrams plus bigrams
    clf__alpha=(0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10),
)

# Cross-validated search over every combination in the grid, run in parallel (n_jobs=-1)
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1, verbose=1)

## 训练模型并进行评估

In [13]:
# Hold out 20% of the samples as a test set; stratify on the label so both
# splits keep the same class balance, and fix the seed for reproducibility
X = dataset['combined_text']
y = dataset['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [14]:
# Fit the cross-validated grid search on the training split only
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 168 candidates, totalling 840 fits


In [16]:
# Keep the best estimator found by the search for prediction and saving
best_model = grid_search.best_estimator_

# Report the winning parameter combination and its cross-validation score
print("最佳参数:", grid_search.best_params_)
print("最佳分数 (交叉验证):", grid_search.best_score_)

最佳参数: {'clf__alpha': 0.0001, 'tfidf__max_df': 0.75, 'tfidf__max_features': 50000, 'tfidf__ngram_range': (1, 2)}
最佳分数 (交叉验证): 0.9216643204417523


In [17]:
# Use the best model to predict labels for the held-out test set
y_pred = best_model.predict(X_test)

In [18]:
# Score the test-set predictions: overall accuracy first, then the per-class report
print("准确率:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\n分类报告:\n", classification_report(y_true=y_test, y_pred=y_pred))

准确率: 0.9226519337016574

分类报告:
               precision    recall  f1-score   support

           0       0.94      0.91      0.92       633
           1       0.91      0.94      0.92       634

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [22]:
# Persist the tuned pipeline to disk.
# NOTE: the pickled pipeline contains only the TF-IDF and classifier steps; the
# stemming() preprocessing is applied outside it, so new text must be passed
# through stemming() before calling predict() on the loaded model.
# Only load this file from a trusted location: unpickling can execute arbitrary code.
filename = 'best_model.pkl'
# Use a context manager so the file handle is flushed and closed deterministically
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)