In [39]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd

In [48]:
data_df = pd.read_csv('articles/test.txt', sep='\t', header=None, names=['label', 'text'], encoding='utf-8')

In [46]:
labels = data_df['label'].tolist()
texts = data_df['text'].tolist()

In [49]:
def jieba_tokenizer(text):
    return ' '.join(jieba.cut(text))

texts_tokenized = [jieba_tokenizer(text) for text in texts]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ROSSAN~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.742 seconds.
Prefix dict has been built successfully.


In [50]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts_tokenized).toarray()
y = labels

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

In [54]:
print("GaussianNB Model")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gnb))
print("Classification Report:\n", classification_report(y_test, y_pred_gnb))
print("Accuracy:", accuracy_score(y_test, y_pred_gnb))

GaussianNB Model
Confusion Matrix:
 [[3 0]
 [0 1]]
Classification Report:
               precision    recall  f1-score   support

          体育       1.00      1.00      1.00         3
          娱乐       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

Accuracy: 1.0


In [55]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [56]:
print("\nLogistic Regression Model")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Model
Confusion Matrix:
 [[0 0 3]
 [0 0 1]
 [0 0 0]]
Classification Report:
               precision    recall  f1-score   support

          体育       0.00      0.00      0.00       3.0
          娱乐       0.00      0.00      0.00       1.0
          财经       0.00      0.00      0.00       0.0

    accuracy                           0.00       4.0
   macro avg       0.00      0.00      0.00       4.0
weighted avg       0.00      0.00      0.00       4.0

Accuracy: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
from collections import Counter

label_counts = Counter(y)
print("各类别样本数量:", label_counts)

各类别样本数量: Counter({'体育': 6, '娱乐': 6, '财经': 6})


In [60]:
min_class_count = min(label_counts.values())
cv_folds = min(5, min_class_count)

In [61]:
from sklearn.model_selection import cross_val_score

gnb_cv_scores = cross_val_score(gnb, X, y, cv=cv_folds, scoring='accuracy')
lr_cv_scores = cross_val_score(lr, X, y, cv=cv_folds, scoring='accuracy')


print(f"\nGaussianNB {cv_folds}-fold Cross Validation Accuracy:", gnb_cv_scores.mean())
print(f"Logistic Regression {cv_folds}-fold Cross Validation Accuracy:", lr_cv_scores.mean())

if gnb_cv_scores.mean() > lr_cv_scores.mean():
    print("GaussianNB 算法性能更优")
else:
    print("逻辑回归算法性能更优")


GaussianNB 5-fold Cross Validation Accuracy: 0.95
Logistic Regression 5-fold Cross Validation Accuracy: 0.6833333333333333
GaussianNB 算法性能更优
