In [None]:
# !pip install jieba
# !pip install xgboost

In [1]:
import os
import time

import jieba
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [3]:
def read_data(data_path):
    """
    Read a raw data file, tokenize every title, and return (titles, labels).

    Every non-blank line must split into exactly five fields on the literal
    marker '_!_'. The third field is taken as the label and the fourth as the
    title; the other fields are ignored. Each title is segmented with
    ``jieba.cut`` and its tokens are re-joined with single spaces.

    Args:
        data_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A tuple ``(titles, labels)`` of two lists with one entry per
        non-blank line: the space-joined tokens of the title, and the label
        string.

    Raises:
        ValueError: If a non-blank line does not contain exactly five
            '_!_'-separated fields. The message names the file and the
            1-based line number so the offending record can be located.
    """
    titles, labels = [], []
    with open(data_path, 'r', encoding='utf-8') as f:
        print('current file:', data_path)
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue  # skip blank lines
            fields = line.split('_!_')
            if len(fields) != 5:
                # Fail with a locatable message instead of an opaque
                # tuple-unpacking error.
                raise ValueError(
                    "{}: line {} has {} '_!_'-separated fields, expected 5".format(
                        data_path, line_no, len(fields)))
            label, title = fields[2], fields[3]
            titles.append(' '.join(jieba.cut(title)))  # word segmentation
            labels.append(label)
        print(data_path, 'finish')
    return titles, labels

In [18]:
def get_features(feature_num=10000, data_dir=None):
    """Load the train/dev/test splits and turn their titles into TF-IDF features.

    The three files ``toutiao_cat_data.{train,dev,test}.txt`` are read with
    ``read_data``. A ``TfidfVectorizer`` is fitted on the training titles only
    and then used to transform all three splits.

    Args:
        feature_num: Passed to ``TfidfVectorizer`` as ``max_features``, i.e.
            the upper bound on the vocabulary size.
        data_dir: Directory that holds the three data files. Defaults to the
            ``data`` sub-directory of the current working directory.

    Returns:
        A 6-tuple ``(train_X, train_labels, dev_X, dev_labels, test_X,
        test_labels)`` where each ``*_X`` is the vectorizer's output for that
        split and each ``*_labels`` is the label list from ``read_data``.
    """
    # Build paths component by component so the separator is correct on every
    # OS (a hard-coded backslash only resolves on Windows).
    if data_dir is None:
        data_dir = os.path.join(os.getcwd(), "data")
    train_path = os.path.join(data_dir, "toutiao_cat_data.train.txt")
    dev_path = os.path.join(data_dir, "toutiao_cat_data.dev.txt")
    test_path = os.path.join(data_dir, "toutiao_cat_data.test.txt")

    # Read the train, dev and test data (in that order).
    train_titles, train_labels = read_data(train_path)
    dev_titles, dev_labels = read_data(dev_path)
    test_titles, test_labels = read_data(test_path)

    # Fit the vectorizer on the training split only, so the vocabulary and
    # IDF weights are not derived from dev/test data.
    vectorizer = TfidfVectorizer(max_features=feature_num)
    vectorizer.fit(train_titles)

    # Print the first and last 100 vocabulary entries as a sanity check.
    feature_names = vectorizer.get_feature_names_out()
    print('head 100 words:', ' '.join(feature_names[:100]))
    print('tail 100 words:', ' '.join(feature_names[-100:]))

    # Convert the tokenized titles of every split into TF-IDF matrices.
    train_X = vectorizer.transform(train_titles)
    dev_X = vectorizer.transform(dev_titles)
    test_X = vectorizer.transform(test_titles)
    print('shape:', train_X.shape, dev_X.shape, test_X.shape)

    return train_X, train_labels, dev_X, dev_labels, test_X, test_labels

In [7]:
def train_random_forest():
    """Fit a random-forest classifier on the TF-IDF features and print its accuracy.

    Features come from ``get_features(feature_num=10000)``. The function
    prints the wall-clock training time followed by the accuracy on the
    train, dev and test splits. It returns nothing.
    """
    train_X, train_labels, dev_X, dev_labels, test_X, test_labels = get_features(feature_num=10000)

    # Hyper-parameters of the forest, collected in one place.
    forest_params = dict(
        n_estimators=100,         # number of trees in the forest
        criterion='gini',         # split quality is measured with Gini impurity
        max_depth=200,            # cap on tree depth (None would mean unlimited)
        max_features='sqrt',      # features considered per split: sqrt(n_features)
        bootstrap=True,           # each tree is trained on a bootstrap sample
        class_weight='balanced',  # re-weight classes according to their frequency
        random_state=0,           # fixed seed for reproducible runs
        n_jobs=-1,                # train in parallel on all available CPUs
    )
    clf = RandomForestClassifier(**forest_params)

    started_at = time.time()
    clf.fit(train_X, train_labels)
    elapsed = time.time() - started_at
    print('train rf finish, cost time: {}s'.format(elapsed))

    # Score every split first, then report, in train -> dev -> test order.
    splits = (
        ('train acc:', train_X, train_labels),
        ('dev acc:', dev_X, dev_labels),
        ('test acc:', test_X, test_labels),
    )
    accuracies = [(caption, clf.score(X, y)) for caption, X, y in splits]
    for caption, accuracy in accuracies:
        print(caption, accuracy)

In [17]:
def train_gbdt():
    """Fit a gradient-boosted decision tree classifier and print its accuracy.

    Features come from ``get_features(feature_num=10000)``. The function
    prints the wall-clock training time followed by the accuracy on the
    train, dev and test splits. It returns nothing.
    """
    train_X, train_labels, dev_X, dev_labels, test_X, test_labels = get_features(feature_num=10000)

    # Configure and train the GBDT model.
    clf = GradientBoostingClassifier(
        n_estimators=100,  # number of boosting stages
        learning_rate=0.1,  # shrinkage applied to each stage's contribution
        loss='log_loss',  # log loss (this loss was previously spelled 'deviance' in scikit-learn)
        subsample=1.0,  # fraction of samples used per stage; 1.0 means no sub-sampling
        # NOTE(review): a depth limit of 100 is very deep for boosting and makes
        # training slow; confirm this value is intended.
        max_depth=100,  # maximum depth of each individual tree
        random_state=0,  # fixed seed, matching the other trainers in this notebook
    )
    init_time = time.time()
    clf.fit(train_X, train_labels)
    print('train gbdt finish, cost time: {}s'.format(time.time() - init_time))

    # Evaluate accuracy on the train, dev and test splits.
    train_acc = clf.score(train_X, train_labels)
    dev_acc = clf.score(dev_X, dev_labels)
    test_acc = clf.score(test_X, test_labels)
    print('train acc:', train_acc)
    print('dev acc:', dev_acc)
    print('test acc:', test_acc)

In [13]:
def train_xgboost():
    """Fit an XGBoost classifier on the TF-IDF features and print its accuracy.

    Features come from ``get_features(feature_num=10000)``. The string labels
    are mapped to integer class ids before training, because current XGBoost
    releases reject class labels that are not the integers 0..n_classes-1.
    The function prints the wall-clock training time followed by the accuracy
    on the train, dev and test splits. It returns nothing.

    NOTE(review): a later cell in this notebook defines ``train_xgboost``
    again; after "Run All" the later definition is the one in effect.

    Raises:
        ValueError: From ``LabelEncoder.transform`` if the dev or test split
            contains a label that never occurs in the training split.
    """
    train_X, train_labels, dev_X, dev_labels, test_X, test_labels = get_features(feature_num=10000)

    # Encode labels as 0..n_classes-1. The encoder is fitted on the training
    # labels only and then reused for dev/test so ids stay consistent.
    label_encoder = LabelEncoder()
    train_y = label_encoder.fit_transform(train_labels)
    dev_y = label_encoder.transform(dev_labels)
    test_y = label_encoder.transform(test_labels)

    clf = XGBClassifier(
        n_estimators=100,  # number of boosted trees
        learning_rate=0.1,  # shrinkage applied to each boosting step
        booster='gbtree',  # tree-based base learner
        objective='multi:softmax',  # multi-class objective
        max_depth=100,  # maximum depth of each tree
        subsample=1.0,  # fraction of samples used per tree; 1.0 means no sub-sampling
        reg_lambda=1,  # L2 regularization weight; larger means stronger regularization
        random_state=0,  # fixed seed for reproducible runs
        n_jobs=-1,  # train in parallel on all available CPUs
    )
    init_time = time.time()
    clf.fit(train_X, train_y)
    print('train xgboost finish, cost time: {}s'.format(time.time() - init_time))

    # Evaluate accuracy on the train, dev and test splits (against encoded labels).
    train_acc = clf.score(train_X, train_y)
    dev_acc = clf.score(dev_X, dev_y)
    test_acc = clf.score(test_X, test_y)
    print('train acc:', train_acc)
    print('dev acc:', dev_acc)
    print('test acc:', test_acc)

In [12]:
def train_xgboost():
    """Fit an XGBoost classifier on the TF-IDF features and print its accuracy.

    Features come from ``get_features(feature_num=10000)``. The string labels
    are mapped to integer class ids before training, because current XGBoost
    releases reject class labels that are not the integers 0..n_classes-1.
    The function prints the wall-clock training time followed by the accuracy
    on the train, dev and test splits. It returns nothing.

    NOTE(review): this cell re-defines ``train_xgboost``, which an earlier
    cell of this notebook already defines; one of the two should be deleted.

    Raises:
        ValueError: From ``LabelEncoder.transform`` if the dev or test split
            contains a label that never occurs in the training split.
    """
    train_X, train_labels, dev_X, dev_labels, test_X, test_labels = get_features(feature_num=10000)

    # Encode labels as 0..n_classes-1. The encoder is fitted on the training
    # labels only and then reused for dev/test so ids stay consistent.
    label_encoder = LabelEncoder()
    train_y = label_encoder.fit_transform(train_labels)
    dev_y = label_encoder.transform(dev_labels)
    test_y = label_encoder.transform(test_labels)

    clf = XGBClassifier(
        n_estimators=100,  # number of boosted trees
        learning_rate=0.1,  # shrinkage applied to each boosting step
        booster='gbtree',  # tree-based base learner
        objective='multi:softmax',  # multi-class objective
        max_depth=100,  # maximum depth of each tree
        subsample=1.0,  # fraction of samples used per tree; 1.0 means no sub-sampling
        reg_lambda=1,  # L2 regularization weight; larger means stronger regularization
        random_state=0,  # fixed seed for reproducible runs
        n_jobs=-1,  # train in parallel on all available CPUs
    )
    init_time = time.time()
    clf.fit(train_X, train_y)
    print('train xgboost finish, cost time: {}s'.format(time.time() - init_time))

    # Evaluate accuracy on the train, dev and test splits (against encoded labels).
    train_acc = clf.score(train_X, train_y)
    dev_acc = clf.score(dev_X, dev_y)
    test_acc = clf.score(test_X, test_y)
    print('train acc:', train_acc)
    print('dev acc:', dev_acc)
    print('test acc:', test_acc)

In [None]:
# Train the random-forest model; prints the training time and the train/dev/test accuracy.
train_random_forest()

In [11]:
# Train the GBDT model; prints the training time and the train/dev/test accuracy.
# NOTE(review): the output saved with this cell ends in a traceback raised inside
# clf.fit instead of accuracy figures, so this run did not complete.
train_gbdt()

current file: data/toutiao_cat_data.train.txt
data/toutiao_cat_data.train.txt finish
current file: data/toutiao_cat_data.dev.txt
data/toutiao_cat_data.dev.txt finish
current file: data/toutiao_cat_data.test.txt
data/toutiao_cat_data.test.txt finish
head 100 words: 00 01 02 03 04 05 052d 055 06 07 08 09 10 100 1000 10000 101 102 103 104 108 11 110 112 12 120 1200 125 128 13 130 1300 14 140 15 150 1500 16 160 1600 1688 17 170 18 180 180508 180509 180512 19 1945 20 200 2000 2008 2013 2014 2015 2016 2017 2018 2019 2020 2022 2024 2025 21 211 22 23 24 240 25 250 2500 26 27 28 29 30 300 3000 31 318 32 3200 33 34 35 350 3500 36 360 365 37 38 39 3d 40 400 4000
tail 100 words: 高盛 高科技 高空 高端 高管 高级 高考 高考状元 高能 高薪 高调 高质量 高贵 高超音速 高跟鞋 高达 高送 高通 高速 高速公路 高配 高铁 高颜值 高龄 鬼子 鬼才 魅力 魅族 魔兽 魔咒 魔王 魔鬼 鱼雷 鲁班 鲁能 鲁迅 鲜为人知 鲜肉 鲜花 鲨鱼 鸟巢 鸡蛋 鸦鹊 鸿雁 鸿鹄 鹈鹕 鹿岛 鹿晗 鹿角 麒麟 麦凯恩 麦迪 麻将 麻烦 麻辣 黄圣 黄圣依 黄子 黄家驹 黄山 黄晓明 黄景 黄河 黄海 黄渤 黄瓜 黄磊 黄紫昌 黄花 黄金 黎巴嫩 黎明 黑人 黑名单 黑客 黑帮 黑幕 黑暗 黑白 黑色 黑马 黑龙江 黑龙江省 默克尔 默默 鼓励 鼓掌 鼠标 鼻子 鼻祖 齐发 齐名 齐聚 齐达内 龙头 龙头股 龙岩 龙湖 龙虎榜 龙

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "D:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-951859149f11>", line 1, in <module>
    train_gbdt()
  File "<ipython-input-8-207848546f24>", line 14, in train_gbdt
    clf.fit(train_X, train_labels)
  File "D:\Anaconda\lib\site-packages\sklearn\ensemble\_gb.py", line 668, in fit
    n_stages = self._fit_stages(
  File "D:\Anaconda\lib\site-packages\sklearn\ensemble\_gb.py", line 745, in _fit_stages
    raw_predictions = self._fit_stage(
  File "D:\Anaconda\lib\site-packages\sklearn\ensemble\_gb.py", line 247, in _fit_stage
    tree.fit(X, residual, sample_weight=sample_weight, check_input=False)
  File "D:\Anaconda\lib\site-packages\sklearn\tree\_classes.py", line 1342, in fit
    super().fit(
  File "D:\Anaconda\lib\site-packages\sklearn\tree\_classes.py", line 458, in fit
    builder.build(self.tree_, X, y, sample_weight)


TypeError: object of type 'NoneType' has no len()