In [1]:
from datetime import datetime
import os

from data_process.data_scraping import get_news_base_path


def get_latest_news_dir(base_path=None):
    """
    Return the name of the most recent snapshot directory.

    Entries of the base directory are expected to be named with the
    ``%m-%d-%H-%M`` pattern (for example ``12-03-12-10``). Entries that do
    not match the pattern are ignored instead of aborting the scan.

    :param base_path: directory to scan; defaults to ``get_news_base_path()``
    :return: the entry name (exactly as it appears on disk) with the latest
        month-day-hour-minute stamp
    :raises ValueError: if no entry matches the naming pattern
    """
    date_format = "%m-%d-%H-%M"
    if base_path is None:
        base_path = get_news_base_path()

    # The names carry no year. Parsing them without one defaults to 1900,
    # which is not a leap year, so "02-29-..." would be rejected. Prefixing a
    # fixed leap year keeps every calendar day parseable.
    # NOTE: because no year is stored, snapshots that span a year boundary
    # cannot be ordered correctly from the name alone.
    leap_year = "2000"
    dated = []
    for name in os.listdir(base_path):
        try:
            stamp = datetime.strptime(f"{leap_year}-{name}", f"%Y-{date_format}")
        except ValueError:
            # Not a snapshot entry (hidden file, checkpoint folder, ...).
            continue
        dated.append((stamp, name))

    if not dated:
        raise ValueError(
            f"no entry matching {date_format!r} found in {base_path!r}"
        )
    # Return the on-disk name rather than a re-formatted timestamp so that a
    # non-zero-padded name still resolves to an existing directory.
    return max(dated)[1]


def get_latest_news_path():
    """Return the path of the latest snapshot directory under the news base path."""
    base_dir = get_news_base_path()
    latest_name = get_latest_news_dir()
    return os.path.join(base_dir, latest_name)


# Sanity check: show which snapshot directory the following cells will read.
print(get_latest_news_path())

C:\Users\24747\AllProjects\Pycharm\pythonProject\text-classification\src\utils\..\..\data\news\12-03-12-10


In [2]:
import pandas as pd

# Each line of the category file is one news item. Read the lines directly
# instead of using pd.read_csv(sep='\n'): recent pandas versions reject '\n'
# as a separator, and CSV quote handling could merge lines containing '"'.
with open(os.path.join(get_latest_news_path(), '体育.txt'), 'r', encoding='utf-8') as f:
    # Blank lines are skipped, matching read_csv's default behaviour.
    df = pd.DataFrame({'content': [line.rstrip('\n') for line in f if line.strip()]})
df['category'] = '体育'
print(df.head())

                                             content category
0  前中国体操队长创造历史！打封闭夺世锦赛首金，弥补弱势项目空白_商春松_决赛_高低杠跑酷这项极...       体育
1  青岛市滑板队参加2024年山东省滑板锦标赛获12金12银6铜_金牌_团体_乙组11月17日，...       体育
2  历经45年，“六姐妹”实现满堂红，中国体操创造世锦赛大满贯伟业_项目_世界_奥运商春松问鼎2...       体育
3  潍坊市潍城区芙蓉小学举行队列队形暨广播体操比赛_智则国_都为一规一矩有章法，一言一行好习惯。...       体育
4  “长江潮”华东青少年赛艇公开赛在南京举办_泱波_比赛_七里河中新网江苏新闻11月18日电 (...       体育


In [3]:
import re
from src.utils.data_input import get_path
import pandas as pd
import jieba


def read_news_data(news_path=None):
    """
    Load every category file of a news snapshot into one DataFrame.

    Each regular file in the snapshot directory is treated as one category:
    every non-blank line becomes a row, and the file name without its
    ``.txt`` suffix becomes the category. The file name and its row count are
    printed as a progress report.

    :param news_path: snapshot directory to read; defaults to
        ``get_latest_news_path()``
    :return: DataFrame with columns=['content', 'category']
    """
    if news_path is None:
        news_path = get_latest_news_path()

    suffix = '.txt'
    records = []
    # Sorted so that row order (and therefore any label encoding derived from
    # it) does not depend on the platform's directory listing order.
    for category_file_name in sorted(os.listdir(news_path)):
        category_path = os.path.join(news_path, category_file_name)
        if not os.path.isfile(category_path):
            continue
        # Plain line reading: pd.read_csv(sep='\n') is rejected by recent
        # pandas versions and applies CSV quoting rules to free text.
        with open(category_path, 'r', encoding='utf-8') as f:
            lines = [line.rstrip('\r\n') for line in f]
        lines = [line for line in lines if line.strip()]
        print(f'{category_file_name}\t{len(lines)}')
        # str.rstrip('.txt') strips any trailing '.', 't' or 'x' characters,
        # not the suffix, so cut the suffix explicitly.
        if category_file_name.endswith(suffix):
            category = category_file_name[:-len(suffix)]
        else:
            category = category_file_name
        records.extend((line, category) for line in lines)
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=['content', 'category'])


def cut_remove_stopwords(text, stop_words: set):
    """
    Tokenize a text with jieba and drop stop words and one-character tokens.

    :param text: raw text
    :param stop_words: tokens to discard
    :return: the remaining tokens joined by single spaces
    """
    # Keep only characters in the range U+4E00-U+9FA5 (Chinese characters);
    # everything else, including digits, latin letters and punctuation, is
    # discarded before segmentation.
    chinese_runs = re.findall('[\u4e00-\u9fa5]+', text, re.S)
    tokens = jieba.lcut("".join(chinese_runs))

    kept = []
    for token in tokens:
        if len(token) <= 1 or token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)


def news_data_process(data: pd.DataFrame, stopwords_path=os.path.join(get_path(), 'stopwords.txt')):
    """
    Prepare news data for modelling.

    1. Encode ``category`` as an integer ``label`` column.
    2. Tokenize ``content``.
    3. Remove stop words from ``content``.

    The input frame is left untouched; the work is done on a copy so that
    re-running the calling cell does not process already-processed text.

    :param data: news data with ``content`` and ``category`` columns
    :param stopwords_path: path of the stop-word file, one word per line
    :return: a new DataFrame with processed ``content`` and an added ``label``
    """
    processed = data.copy()
    # Integer codes are assigned in order of first appearance of a category.
    processed['label'] = pd.factorize(processed['category'])[0]
    # Load stop words into a set for fast membership tests.
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}
    processed['content'] = processed['content'].apply(lambda x: cut_remove_stopwords(x, stop_words))
    return processed


def write_news_data(data: pd.DataFrame):
    """
    Save the data as ``news.csv`` in a folder named after the latest snapshot.

    The folder is ``get_path()`` joined with ``get_latest_news_dir()`` and is
    created if it does not exist yet.

    :param data: DataFrame to write (the index is not written)
    """
    folder_path = os.path.join(get_path(), get_latest_news_dir())
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder_path, exist_ok=True)
    data.to_csv(os.path.join(folder_path, 'news.csv'), index=False)


# Run the pipeline: load the latest snapshot, encode and tokenize it, show a
# preview plus the final shape, then save the result as news.csv.
# `dt` is reused by the model-training cell below.
dt = read_news_data()
dt = news_data_process(dt)
print(dt.head())
print(dt.shape)
write_news_data(dt)

体育.txt	1966
健康.txt	1599
公益.txt	1571
军事.txt	2262
历史.txt	1232
房产.txt	2109
政务.txt	1720
教育.txt	1045
旅游.txt	2575
游戏.txt	2225
科技.txt	1678


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\24747\AppData\Local\Temp\jieba.cache


美食.txt	1156
财经.txt	2379


Loading model cost 0.534 seconds.
Prefix dict has been built successfully.


                                             content category  label
0  中国 体操 队长 创造 历史 封闭 世锦赛 首金 弥补 弱势 项目 空白 商春松 决赛 高低...       体育      0
1  青岛市 滑板 参加 山东省 滑板 锦标赛 金银铜 金牌 团体 乙组 中国 体育彩票 山东省 ...       体育      0
2  历经 姐妹 满堂红 中国 体操 创造 世锦赛 大满贯 伟业 项目 世界 奥运 商春松 问鼎 ...       体育      0
3  潍坊市 潍城区 芙蓉 小学 队列 队形 广播体操 比赛 智则国 一规 一矩 章法 一言一行 ...       体育      0
4  长江 华东 青少年 赛艇 公开赛 南京 举办 泱波 比赛 七里河 中新网 江苏 新闻 日电 ...       体育      0
(23517, 3)


In [4]:
from src.utils.data_input import get_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

X = dt['content']  # features: space-separated tokens
y = dt['label']  # targets: integer category codes

# Split the data set: 20% is held out for testing (test_size=0.2); the random
# state is fixed at 42 so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Disabled alternative: X_train, y_train, X_test, y_test = get_dataset()
# Build a pipeline of CountVectorizer, TfidfTransformer and MultinomialNB
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

# Fit the model on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确率: {accuracy:.2f}")
from sklearn.metrics import precision_score, recall_score, f1_score

# y_test holds the true labels, y_pred the labels predicted by the model
# Compute the precision
precision = precision_score(y_test, y_pred, average='macro')  # 'macro': unweighted mean over all classes
# Compute the recall
recall = recall_score(y_test, y_pred, average='macro')  # 'macro': unweighted mean over all classes
# Compute the F1 score
f1 = f1_score(y_test, y_pred, average='macro')  # 'macro': unweighted mean over all classes

print(f"精确率(Precision): {precision:.2f}")
print(f"召回率(Recall): {recall:.2f}")
print(f"F1值: {f1:.2f}")

模型准确率: 0.82
精确率(Precision): 0.85
召回率(Recall): 0.80
F1值: 0.81


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Toy example showing what the two feature-extraction steps of the pipeline
# produce. Prepare the text data.
documents = [
    "Kimi is a chatbot developed by Moonshot AI.",
    "Moonshot AI is a company that creates AI assistants.",
    "Kimi can process text and assist with various tasks."
]

# Count term frequencies with CountVectorizer
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(documents)
print("词汇表:", vectorizer.vocabulary_)
print("词频矩阵:\n", X_counts.toarray())

# Convert the counts to TF-IDF features with TfidfTransformer
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_counts)
print("TF-IDF 矩阵:\n", X_tfidf.toarray())


词汇表: {'kimi': 11, 'is': 10, 'chatbot': 6, 'developed': 9, 'by': 4, 'moonshot': 12, 'ai': 0, 'company': 7, 'that': 16, 'creates': 8, 'assistants': 3, 'can': 5, 'process': 13, 'text': 15, 'and': 1, 'assist': 2, 'with': 18, 'various': 17, 'tasks': 14}
词频矩阵:
 [[1 0 0 0 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0]
 [2 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 1 0 0]
 [0 1 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 1 1]]
TF-IDF 矩阵:
 [[0.32992832 0.         0.         0.         0.43381609 0.
  0.43381609 0.         0.         0.43381609 0.32992832 0.32992832
  0.32992832 0.         0.         0.         0.         0.
  0.        ]
 [0.55650888 0.         0.         0.36587115 0.         0.
  0.         0.36587115 0.36587115 0.         0.27825444 0.
  0.27825444 0.         0.         0.         0.36587115 0.
  0.        ]
 [0.         0.34142622 0.34142622 0.         0.         0.34142622
  0.         0.         0.         0.         0.         0.25966344
  0.         0.34142622 0.34142622 0.34142622 0.         0.34142622
  0.3414