In [1]:
import os

import jieba
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
from torch import nn
import torch.optim as optim


MAX_LEN = 10
UNK, PAD = "UNK", "PAD"

In [2]:
##定义读取训练集和测试集的函数
def load_text_data(path):
    text_data = []
    text_label = []
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            line.strip()
            text_data.append(line.split('\t')[0])
            text_label.append(line[-2])
    return np.array(text_data), np.array(text_label)


##读取训练集、验证集和测试集
train_path = "data/THUCNews/train.txt"
train_text, train_label = load_text_data(train_path)
dev_path = "data/THUCNews/dev.txt"
dev_text, dev_label = load_text_data(train_path)
test_path = "data/THUCNews/test.txt"
test_text, test_label = load_text_data(test_path)

180000it [00:00, 1944432.99it/s]
180000it [00:00, 1818848.04it/s]
10000it [00:00, 1773115.20it/s]


In [3]:
import string
import re


##对文本进行预处理
def text_preprocess(text_data):
    text_pre = []
    for text1 in text_data:
        ##去除指定的字符“<br /><br />”
        text1 = re.sub("<br /><br />", "", text1)
        text1 = re.sub(" ", "", text1)
        text1 = re.sub("[^0-9a-zA-Z\u4e00-\u9fa5]+", "", text1)
        ##转化为小写，去除数字，去除标点符号，去除空格
        text1 = text1.lower()
        text1 = re.sub("\d+", "", text1)
        text1 = text1.translate(
            str.maketrans("", "", string.punctuation.replace("'", "")))
        text1 = text1.strip()
        text_pre.append(text1)
    return np.array(text_pre)


train_text_pre = text_preprocess(train_text)
dev_text_pre = text_preprocess(dev_text)
test_text_pre = text_preprocess(test_text)

In [4]:
##中文分词
##文本去符号化,去除停用词
stopwords = []
with open("data/Chinese_stopwords.txt", "r", encoding="utf-8") as f:
    for word in f:
        stopwords.append(word.strip())

def cut_text(texts):
    text_cuts = []
    for text in texts:
        text = str(text)
        text = jieba.lcut(text, cut_all=False)
        text_stopeds = []
        for word in text:
            if word not in stopwords:
                text_stopeds.append(word)
        text_cuts.append(text_stopeds)
    return np.array(text_cuts, dtype=object)

train_text_pre2 = cut_text(train_text_pre)
dev_text_pre2 = cut_text(dev_text_pre)
test_text_pre2 = cut_text(test_text_pre)
print(train_text_pre[0:10])
print("=" * 100)
print(train_text_pre2[0:10])
# print(cut_text(test_text))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.367 seconds.
Prefix dict has been built successfully.


['中华女子学院本科层次仅专业招男生' '两天价网站背后重重迷雾做个网站究竟要多少钱' '东环海棠公社平居准现房折优惠'
 '卡佩罗告诉你德国脚生猛的原因不希望英德战踢点球' '岁老太为学生做饭扫地年获授港大荣誉院士' '记者回访地震中可乐男孩将受邀赴美国参观'
 '冯德伦徐若隔空传情默认其是女友' '传郭晶晶欲落户香港战伦敦奥运装修别墅当婚房' '赤壁ol攻城战诸侯战硝烟又起' '手机钱包亮相科博会']
[list(['中华', '女子', '学院', '本科', '层次', '专业', '招', '男生'])
 list(['两天', '价', '网站', '背后', '重重', '迷雾', '做个', '网站', '钱'])
 list(['东环', '海棠', '公社', '平居', '准现房', '折', '优惠'])
 list(['卡佩罗', '告诉', '德国', '脚', '生猛', '原因', '希望', '英德', '战', '踢', '点球'])
 list(['岁', '老太', '学生', '做饭', '扫地', '年', '获授', '港大', '荣誉', '院士'])
 list(['记者', '回访', '地震', '中', '可乐', '男孩', '受邀', '赴美国', '参观'])
 list(['冯德伦', '徐若', '隔空', '传情', '默认', '其是', '女友'])
 list(['郭晶晶', '欲', '落户', '香港', '战', '伦敦', '奥运', '装修', '别墅', '当婚', '房'])
 list(['赤壁', 'ol', '攻城战', '诸侯', '战', '硝烟'])
 list(['手机', '钱包', '亮相', '科博会'])]


In [5]:
##将处理好的文本保存到CSV文件中
texts = [" ".join(words) for words in train_text_pre2]
traindatasave = pd.DataFrame({"text": texts,
                              "label": train_label})
texts = [" ".join(words) for words in dev_text_pre2]
devdatasave = pd.DataFrame({"text": texts,
                            "label": dev_label})
texts = [" ".join(words) for words in test_text_pre2]
testdatasave = pd.DataFrame({"text": texts,
                             "label": test_label})
traindatasave.to_csv("data\\news_train.csv", index=False)
traindatasave.to_csv("data\\news_dev.csv", index=False)
testdatasave.to_csv("data\\news_test.csv", index=False)

In [6]:
# from matplotlib import pyplot as plt
# 
# ##将预处理好的文本数据转化为数据表
# traindata = pd.DataFrame({"train_text": train_text, "train_word": train_text_pre2, "train_label": train_label})
# ##计算每个影评使用的词数量
# train_word_num = [len(text) for text in train_text_pre2]
# traindata["train_word_num"] = train_word_num
# ##可视化每个影评词语长度的分布
# plt.figure(figsize=(8, 5))
# _ = plt.hist(train_word_num, bins=100)
# plt.xlabel("word number")
# plt.ylabel("Freq")
# plt.show()

In [7]:
# from wordcloud import WordCloud
# 
# # import os
# # FILE=os.path.dirname("C:\ProgramData\\anaconda3\Lib\site-packages\wordcloud")
# # FONT_PATH=os.environ.get('FONT_PATH',os.path.join(FILE,'DroidSansMono.ttf'))
# # w=WordCloud(width=1000,height=700,font_path="msyh.ttc",background_color="white")
# 
# ##使用词云可视化两种情感的词频差异
# plt.figure(figsize=(16, 10))
# for ii in np.unique(train_label):
#     ##准备每种情感的所有词语
#     text = np.array(traindata.train_word[traindata.train_label == ii])
#     text = " ".join(np.concatenate(text))
#     plt.subplot(1, 10, int(ii) + 1)
#     ##生成词云
#     wordcod = WordCloud(margin=5, width=1800, height=1000, max_words=500,
#                         min_font_size=5, background_color='white', max_font_size=250)
#     wordcod.generate_from_text(text)
#     plt.imshow(wordcod)
#     plt.axis("off")
#     plt.subplots_adjust(wspace=0.05)
# plt.show()

In [8]:
import tqdm

##限制词表的最大长度
MAX_VOCAB_SIZE = 50000
##创建词表，按词频降序排列
vocab_dic = {}
with open("./data/news_train.csv", "r", encoding="utf-8") as f:
    f.readline()
    for line in f:
        content = f.readline().split(",")[0]
        tokenizes = lambda x: content.split(" ")
        for word in tokenizes(content):
            vocab_dic[word] = vocab_dic.get(word, 0) + 1
vocab_list = sorted([_ for _ in vocab_dic.items() if len(_) > 1], key=lambda x: x[1], reverse=True)[:MAX_VOCAB_SIZE]
vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}  ##enumerate
##向词表中添加未知词，和填充
vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
print(len(vocab_dic))
##保存词表
print(vocab_dic)

50002
{'图': 0, '年': 1, '月': 2, '基金': 3, '组图': 4, '日': 5, '称': 6, '中国': 7, '新': 8, '男子': 9, '美国': 10, '北京': 11, '万': 12, '均价': 13, '高考': 14, '中': 15, '游戏': 16, '市场': 17, '开盘': 18, '元': 19, '遭': 20, '公布': 21, '岁': 22, '考研': 23, '投资': 24, '公司': 25, '沪': 26, '国际': 27, '名': 28, 'ol': 29, '成': 30, '考生': 31, '上海': 32, '精装': 33, '日本': 34, '招生': 35, '亿': 36, '网游': 37, '前': 38, '期货': 39, '股': 40, '死亡': 41, '别墅': 42, '世界': 43, '曝光': 44, '手机': 45, '计划': 46, '平': 47, '起居': 48, '女子': 49, '专家': 50, '高': 51, '获': 52, '图文': 53, '香港': 54, 'a': 55, '万元': 56, '录取': 57, '反弹': 58, '上市': 59, '活动': 60, '学生': 61, '曝': 62, '英国': 63, '考试': 64, '银行': 65, '平米': 66, '美': 67, '现房': 68, '全球': 69, '在售': 70, '指': 71, '震荡': 72, '预计': 73, '做': 74, '送': 75, '推出': 76, '高校': 77, '欲': 78, '调查': 79, '商品': 80, '推': 81, '韩国': 82, '推荐': 83, '发布': 84, '大学': 85, '索尼': 86, 'g': 87, '经济': 88, '有望': 89, '快讯': 90, '总统': 91, '专业': 92, '网络': 93, '女': 94, '媒体': 95, '视频': 96, '网上': 97, '上涨': 98, '大盘': 99, 'd': 100, '网友': 101, '亿美元': 102, 

In [9]:
##定义数据加载器，将数据表达成定长的数字向量
##有点脱裤子放屁可以简化
def dataiter(path):
    train_iter = []
    with open(path, "r", encoding="utf-8") as f:
        f.readline()
        for line in f:
            line = line.strip()
            content = line.split(",")[0]
            l_words = content.split(" ")  ##
            l_rep = [int(line.split(",")[1])]  ##获取label
            if len(l_words) < MAX_LEN:
                l_words.extend([PAD] * (MAX_LEN - len(l_words)))
            else:
                l_words = l_words[:MAX_LEN]
            ##将文本转换成数值
            for word in l_words:
                if word in vocab_dic.keys():
                    l_rep.append(vocab_dic.get(word))
                else:
                    l_rep.append(vocab_dic.get(UNK))
            train_iter.append(l_rep)
    return train_iter


train = dataiter("./data/news_train.csv")
dev = dataiter("./data/news_dev.csv")
test = dataiter("./data/news_test.csv")

In [10]:
print(train[:10])

[[3, 4385, 49, 953, 436, 18145, 92, 1014, 1520, 50001, 50001], [4, 4241, 1931, 184, 652, 6037, 9782, 14192, 184, 653, 50001], [1, 5498, 3860, 5109, 396, 958, 108, 123, 50001, 50001, 50001], [7, 2597, 7116, 296, 5490, 22102, 813, 234, 12719, 506, 1269], [5, 22, 1247, 61, 14576, 44234, 1, 50000, 8948, 1893, 6017], [5, 329, 18561, 256, 15, 8492, 602, 5280, 22103, 5281, 50001], [9, 9288, 9516, 8514, 8518, 6439, 32973, 254, 50001, 50001, 50001], [1, 2769, 78, 2847, 54, 506, 1065, 814, 5282, 42, 38698], [8, 1150, 29, 25871, 5478, 506, 8231, 50001, 50001, 50001, 50001], [4, 45, 7117, 131, 38699, 50001, 50001, 50001, 50001, 50001, 50001]]


In [11]:
##生成batch
##读取数据并构建Dataset子类
from torch.utils.data import Dataset, DataLoader

##创建模型数据集类
class textCNN_dataSet(Dataset):
    def __init__(self, datas):
        self.datas = datas

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, idx):
        data = self.datas[idx]
        cla = data[0]
        sentence = np.array(data[1:])
        return cla, sentence


##定义数据读取方法，生成batch
def textCNN_dataLoader(batch_size, datas):
    dataset = textCNN_dataSet(datas)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [12]:
dataset = textCNN_dataSet(train)
cla, sen = dataset.__getitem__(3)
print(cla)
print(sen)

7
[ 2597  7116   296  5490 22102   813   234 12719   506  1269]


In [13]:
## 定义网络模型

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, class_nums, dropout):
        super().__init__()
        ##参数依次为词表长度，词向量维度，填充字符在词表中的索引
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=-1)
        ##输出通道数是卷积核个数
        self.convs = nn.ModuleList(
            [nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim)) for fs in filter_sizes])
        ##对卷积结果进行拼接操作len(filters_sizes)卷积后的输出维度
        self.fc = nn.Linear(n_filters*len(filter_sizes), class_nums)
        ##dropout是损失率
        self.dropout = nn.Dropout(dropout)

    ##前向传播
    def forward(self, text):
        embedded = self.embedding(text)  ##词嵌入操作得shape[batch_size,sen_len,embedding_dim]
        embedded = embedded.unsqueeze(1)  ##插入一个维度[batch_size,1,sen_len,embedding_dim]
        ##squeeze(3)：池化需要三维数据
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        ##squeeze(2)：拼接需要二位数据，dim代表拼接在一起的维度
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)



In [14]:
VOCAB_SIZE = len(vocab_dic)  ##词表长度
EMBEDDING_DIM = 100  ##词向量维度
N_FILTERS = 20  ##每次卷积卷积核的个数
FILTER_SIZES = [3, 4, 5]  ##卷积核的尺寸
CLASS_NUMS = 10  ##输出维度
DROPOUT = 0.5  ##损失率
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextCNN(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, CLASS_NUMS, DROPOUT).to(device)

In [21]:
print(model)

TextCNN(
  (embedding): Embedding(50002, 100, padding_idx=50001)
  (convs): ModuleList(
    (0): Conv2d(1, 20, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 20, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 20, kernel_size=(5, 100), stride=(1, 1))
  )
  (fc): Linear(in_features=60, out_features=10, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [15]:
##创建训练数据集
BATCH_SIZE = 128
trainIter = textCNN_dataLoader(BATCH_SIZE, train)
devIter = textCNN_dataLoader(BATCH_SIZE, dev)

In [16]:
##Adam优化
optimizer = optim.Adam(model.parameters(),0.001)

In [17]:
##定义一个对数据集训练一轮的函数
def train_epoch(model, iterator, optimizer):
    epoch_loss = 0
    epoch_acc = 0
    train_corrects = 0
    train_num = 0
    model.train()
    for batch in iterator:
        batch[1]=batch[1].to(device)
        batch[0]=batch[0].to(device)
        optimizer.zero_grad()
        pre = model(batch[1])
        loss = nn.functional.cross_entropy(pre, batch[0])
        pre_lab=torch.argmax(pre,-1)
        train_corrects+=int(torch.sum(pre_lab==batch[0]))
        train_num+=len(batch[1])
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        
    ##所有样本的平均损失精度
    epoch_loss = epoch_loss / len(iterator)
    epoch_acc = train_corrects / train_num
    return epoch_loss, epoch_acc

##定义数据集验证一轮的函数
def evaluate(model, iterator):
    
    epoch_loss = 0
    epoch_acc = 0
    train_num = 0
    model.eval()
    right_num=0
    with torch.no_grad():
        for batch in iterator:
            batch[1]=batch[1].to(device)
            batch[0]=batch[0].to(device)
            pre = model(batch[1])##batch_size*class_num
            loss = nn.functional.cross_entropy(pre, batch[0])
            pre_lab = torch.argmax(pre,-1)
            right_num+=int(torch.sum(pre_lab==batch[0]))
            train_num+=len(batch[1])
            epoch_loss += loss.item()
        ##所有样本的平均损失精度
        epoch_loss = epoch_loss / len(iterator)
        epoch_acc = right_num / train_num
    return epoch_loss, epoch_acc

In [18]:
import time
import copy

##使用训练集训练模型，使用验证集测试模型
EPOCHS = 10
best_val_loss = float("inf")
best_acc = float(0)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss,train_acc= train_epoch(model, trainIter, optimizer)
    val_loss, val_acc = evaluate(model, devIter)
    end_time = time.time()
    print(f"Epoch:{epoch + 1}|Epoch Time:{end_time - start_time:.2f}s")
    print(f"Train loss:{train_loss:.3f}|train_acc:{train_acc*100:.2f}%")
    print(f"Val loss:{val_loss:.3f}|train_acc:{ val_acc*100:.2f}%")
    ##保存效果较好的模型
    if (val_loss < best_val_loss) & (val_acc > best_acc):
        best_model_wts = copy.deepcopy(model.state_dict())
        best_val_loss = val_loss
        best_acc = val_acc
##将最好的模型参重新赋值给model
model.load_state_dict(best_model_wts)

Epoch:1|Epoch Time:12.11s
Train loss:1.541|train_acc:47.45%
Val loss:0.828|train_acc:75.19%
Epoch:2|Epoch Time:5.66s
Train loss:0.844|train_acc:73.33%
Val loss:0.510|train_acc:85.02%
Epoch:3|Epoch Time:5.65s
Train loss:0.614|train_acc:81.39%
Val loss:0.383|train_acc:88.84%
Epoch:4|Epoch Time:5.56s
Train loss:0.499|train_acc:85.18%
Val loss:0.310|train_acc:90.98%
Epoch:5|Epoch Time:5.51s
Train loss:0.426|train_acc:87.38%
Val loss:0.260|train_acc:92.48%
Epoch:6|Epoch Time:5.50s
Train loss:0.374|train_acc:88.89%
Val loss:0.223|train_acc:93.55%
Epoch:7|Epoch Time:5.48s
Train loss:0.333|train_acc:90.07%
Val loss:0.197|train_acc:94.33%
Epoch:8|Epoch Time:5.53s
Train loss:0.304|train_acc:90.99%
Val loss:0.174|train_acc:95.02%
Epoch:9|Epoch Time:5.79s
Train loss:0.275|train_acc:91.85%
Val loss:0.152|train_acc:95.58%
Epoch:10|Epoch Time:5.49s
Train loss:0.253|train_acc:92.43%
Val loss:0.138|train_acc:96.10%


<All keys matched successfully>

In [19]:
testIter = textCNN_dataLoader(BATCH_SIZE, test)
test_loss, test_acc = evaluate(model, testIter)
print("在测试集上的精度为：", test_acc)


在测试集上的精度为： 0.8865


In [20]:
##保存整个模型
torch.save(model,'T1.pt')
##直接调用训练好的模型
T1=torch.load("T1.pt")
test_loss, test_acc = evaluate(T1, testIter)
print("在测试集上的精度为：", test_acc)
##保存模型参数
torch.save(model.state_dict(),'T2.pt')
##调用
T2=torch.load("T2.pt")
test_loss, test_acc = evaluate(T2, testIter)
print("在测试集上的精度为：", test_acc)


在测试集上的精度为： 0.8865


AttributeError: 'collections.OrderedDict' object has no attribute 'eval'