In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# 加载黄果山好评数据集
df = pd.read_csv('huangguo_mountain.csv', encoding='UTF-8')
df = df.dropna()

contents = df['content']
labels = df['label']

#将文本中的词语转换为词频矩阵 矩阵元素a[i][j]表示j词在i类文本下的词频
vectorizer = CountVectorizer(min_df=5)

#该类会统计每个词语的tf-idf权值
transformer = TfidfTransformer()

#第一个fit_transform是计算tf-idf 第二个fit_transform是将文本转为词频矩阵
tfidf = transformer.fit_transform(vectorizer.fit_transform(contents))

# 获取词袋模型中的所有词语  
word = vectorizer.get_feature_names_out()
print("词汇数量:", len(word))

#将tf-idf矩阵抽取出来，元素w[i][j]表示j词在i类文本中的tf-idf权重
X = coo_matrix(tfidf, dtype=np.float32).toarray()           #稀疏矩阵 注意float

# 随机划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=1)

# 创建MLP
classifier = MLPClassifier()

# 训练
classifier.fit(X_train, y_train)

print('模型的准确度: ', classifier.score(X_test, y_test))

y_preds = classifier.predict(X_test)
print(classification_report(y_test, y_preds))

词汇数量: 4923
模型的准确度:  0.9890230515916575
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1527
           1       0.94      1.00      0.97       295

    accuracy                           0.99      1822
   macro avg       0.97      0.99      0.98      1822
weighted avg       0.99      0.99      0.99      1822

