In [3]:
# 安装必要的库
!pip install scikit-learn

# 导入必要的库
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fetch the 20 Newsgroups corpus (subset='all'), shuffled with a fixed seed.
# NOTE(review): no `remove=` argument is passed, so posts keep their headers,
# footers and quoted replies; scikit-learn's dataset guide recommends stripping
# them for a more realistic evaluation -- confirm whether keeping them is intended.
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
X = newsgroups.data      # raw post texts
y = newsgroups.target    # integer class label for each post

# Hold out 20% of the posts as a test set; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two feature extractors, both configured with the built-in English stop-word list.
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Each vectorizer is fitted on the training texts only and then applied to the
# test texts. Tuple elements are evaluated left to right, so fit_transform()
# runs before the matching transform().
X_train_count, X_test_count = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

# Classifiers to compare, keyed by the name used in the printed reports.
classifiers = {
    # Seeded with the same value as the dataset shuffle and the train/test split:
    # LinearSVC exposes `random_state` because its solver can use a random number
    # generator, so pinning it keeps repeated runs reproducible.
    'LinearSVC': LinearSVC(random_state=42),
    # Explicit iteration cap of 1000 for the solver.
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

# Evaluate every (vectorizer, classifier) pairing on the held-out test split.
# Feature sets are keyed by the vectorizer name shown in the report header.
feature_sets = {
    'CountVectorizer': (X_train_count, X_test_count),
    'TfidfVectorizer': (X_train_tfidf, X_test_tfidf),
}

for vec_name, (X_train_vec, X_test_vec) in feature_sets.items():
    for clf_name, clf in classifiers.items():
        # fit() returns the estimator itself, so training and prediction chain.
        # The same estimator object is re-fitted for each feature set.
        y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)

        report = classification_report(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"Results for {vec_name} with {clf_name}:")
        print(report)
        print(f"Accuracy: {accuracy}\n")






Results for CountVectorizer with LinearSVC:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89       151
           1       0.80      0.81      0.80       202
           2       0.86      0.85      0.85       195
           3       0.70      0.73      0.72       183
           4       0.83      0.83      0.83       205
           5       0.88      0.88      0.88       215
           6       0.84      0.82      0.83       193
           7       0.91      0.94      0.92       196
           8       0.95      0.96      0.95       168
           9       0.96      0.95      0.95       211
          10       0.97      0.98      0.97       198
          11       0.98      0.96      0.97       201
          12       0.85      0.80      0.82       202
          13       0.90      0.96      0.93       194
          14       0.95      0.96      0.96       189
          15       0.92      0.98      0.95       202
          16       0.93      0.94    