In [1]:
import numpy as np
import os
import tensorflow.contrib.keras as kr

import torch
from torch import nn
import torch.nn.functional as F


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  import pandas.util.testing as tm


In [2]:
# 读取词汇表
def read_vocab(vocab_dir):
    with open(vocab_dir, 'r', encoding='utf-8', errors='ignore') as fp:
        words = [_.strip() for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
 
# 读取分类目录，固定
def read_category():
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    categories = [x for x in categories]
    cat_to_id = dict(zip(categories, range(len(categories)))) 
    return categories, cat_to_id
 
# 将文件转换为id表示
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    contents, labels = [], []
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(content))
                    labels.append(label)
            except:
                pass
    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])#将每句话id化
        label_id.append(cat_to_id[labels[i]])#每句话对应的类别的id
    
    # # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    x_pad = torch.LongTensor(x_pad)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # 将标签转换为one-hot表示
    
    return x_pad, y_pad

In [3]:
# 获取文本的类别及其对应id的字典
categories, cat_to_id = read_category()
print(categories)
# 获取训练文本中所有出现过的字及其所对应的id
words, word_to_id = read_vocab('./dataset/cnews.vocab.txt')
#print(words)
#print(word_to_id)

#获取字数
vocab_size = len(words)


['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']


In [5]:
# 数据加载及分批
# 获取训练数据每个字的id和对应标签的one-hot形式
x_train, y_train = process_file('./dataset/cnews.train.txt', word_to_id, cat_to_id, 600)
# print('x_train=', x_train)
x_val, y_val = process_file('./dataset/cnews.val.txt', word_to_id, cat_to_id, 600)

In [8]:
# print(x_train.shape)
# print(len(words))
torch.max(x_train)

tensor(4999)

In [3]:
# TextRNN Model

 
# 文本分类，RNN模型
class TextRNN(nn.Module):   
    def __init__(self):
        super(TextRNN, self).__init__()
        # 三个待输入的数据
        self.embedding = nn.Embedding(5000, 64)  # 进行词嵌入
#         self.rnn = nn.LSTM(input_size=64, hidden_size=128, num_layers=2, bidirectional=True, batch_first=True)
        self.rnn = nn.GRU(input_size=64, hidden_size=128, num_layers=2, bidirectional=True, dropout=0.5, batch_first=True)
#         self.f1 = nn.Sequential(nn.Linear(256,128),
#                                 nn.Dropout(0.8),
#                                 nn.ReLU())
        self.fc = nn.Sequential(nn.Linear(256,10),
                                nn.Softmax())
 
    def forward(self, x):
        x = self.embedding(x)
        x,_ = self.rnn(x)
        x = F.dropout(x,p=0.8)
        x = self.fc(x[:,-1,:])
        return x

In [4]:
from torch.utils.data import DataLoader, Dataset
class textData(Dataset):
    """
        下载数据、初始化数据，都可以在这里完成
    """
    def __init__(self, train=False, val=False):
        categories, cat_to_id = read_category()
        words, word_to_id = read_vocab('./dataset/cnews.vocab.txt')
        if train:
            # 数据加载及分批
            # 获取训练数据每个字的id和对应标签的one-hot形式
            self.data, self.label = process_file('./dataset/cnews.train.txt', word_to_id, cat_to_id, 600)
        elif val:
            self.data, self.label = process_file('./dataset/cnews.val.txt', word_to_id, cat_to_id, 600)
        else:
            self.data, self.label = process_file('./dataset/cnews.test.txt', word_to_id, cat_to_id, 600)
    
    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return self.data.shape[0]


In [5]:
import torch.optim as optim
import time
from sklearn import metrics

def evaluate(model, data_loader, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    for i, data in enumerate(val_loader):
        x, y = data
        texts, labels = x.to(device), y.to(device)
        with torch.no_grad():
            outputs = model(texts)
#             loss = F.cross_entropy(outputs, labels)
#             loss_total += loss
            labels = torch.argmax(labels, 1).cpu().numpy()
            predic = torch.argmax(outputs, 1).cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
#     if test:
# #         report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
#         confusion = metrics.confusion_matrix(labels_all, predict_all)
#         return acc, loss_total / len(data_loader.dataset), report, confusion
#     return acc, loss_total / len(data_loader.dataset)
    return acc
# def evaluate(model, loader):
#     model.eval()
#     correct = 0
#     total = len(loader.dataset)
#     for x, y in loader:
#         x, y = x.to(device), y.to(device)
#         with torch.no_grad():
#             logits = model(x)
#             pred = torch.max(F.softmax(logits), 1)[1]
#         correct += torch.eq(pred, y).sum().float().item()
#     return correct / total


EPOCH = 20
batch_size = 32
train_data = textData(train=True)
val_data = textData(val=True)
test_data = textData()
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
model = TextRNN()

#损失函数:这里用交叉熵
criterion = nn.MultiLabelSoftMarginLoss()
# criterion = nn.CrossEntropyLoss()
#优化器 这里用SGD
optimizer = optim.Adam(model.parameters(), lr=1e-3)

#device : GPU or CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = "cpu"
model.to(device)

best_acc, best_epoch = 0, 0
file_name = 'best_model.pt'

# 训练
for epoch in range(EPOCH):
    start_time = time.time()
    for i, data in enumerate(train_loader):
        model.train()
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        
        # 前向传播
        outputs = model(inputs)
        # 清空上一轮梯度
        optimizer.zero_grad()
        # 计算损失函数
        loss = criterion(outputs, labels)
        # 反向传播
        loss.backward()
        # 参数更新
        optimizer.step()
        accuracy = np.mean((torch.argmax(outputs, 1) == torch.argmax(labels, 1)).cpu().numpy())
    print('epoch{} loss:{:.4f} acc:{:.4f} time:{:.4f}'.format(epoch+1, loss.item(), accuracy, time.time()-start_time))
#     for step,(x_batch,y_batch) in enumerate(val_loader):
#         x = x_batch.cuda()
#         y = y_batch.cuda()
#         out = model(x)
#         #计算准确率
#         val_accuracy = np.mean((torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
#         if val_accuracy > best_acc:
#             torch.save(model.state_dict(),'model_params.pkl')
#             best_acc = val_accuracy
#     print(val_accuracy)
    if epoch % 2 == 0:
        val_acc = evaluate(model, val_loader)
        print('epoch{}  val_acc:{:.4f}'.format(epoch+1, val_acc))
        if val_acc > best_acc:
            best_epoch = epoch
            best_acc = val_acc
            torch.save(model.state_dict(), file_name)
        print('best acc:', best_acc, 'best epoch:', best_epoch)





  input = module(input)


epoch1 loss:0.6674 acc:0.8750 time:205.4436
epoch1  val_acc:0.7212
best acc: 0.7212 best epoch: 0
epoch2 loss:0.6689 acc:0.8750 time:205.3844
epoch3 loss:0.6785 acc:0.7500 time:205.4206
epoch3  val_acc:0.7326
best acc: 0.7326 best epoch: 2
epoch4 loss:0.6615 acc:0.9375 time:205.1645
epoch5 loss:0.6617 acc:0.9375 time:205.0819
epoch5  val_acc:0.8356
best acc: 0.8356 best epoch: 4
epoch6 loss:0.6677 acc:0.8750 time:204.8641
epoch7 loss:0.6563 acc:1.0000 time:204.9196
epoch7  val_acc:0.8580
best acc: 0.858 best epoch: 6
epoch8 loss:0.6552 acc:1.0000 time:204.9671
epoch9 loss:0.6648 acc:0.8750 time:205.0412
epoch9  val_acc:0.8806
best acc: 0.8806 best epoch: 8
epoch10 loss:0.6552 acc:1.0000 time:204.5357
epoch11 loss:0.6599 acc:0.9375 time:204.5681
epoch11  val_acc:0.8928
best acc: 0.8928 best epoch: 10
epoch12 loss:0.6552 acc:1.0000 time:204.5983
epoch13 loss:0.6552 acc:1.0000 time:204.7678
epoch13  val_acc:0.8948
best acc: 0.8948 best epoch: 12
epoch14 loss:0.6552 acc:1.0000 time:204.463

In [15]:
# 加载模型
test_data = textData()
test_loader =  DataLoader(test_data, batch_size=batch_size, shuffle=False)

model = TextRNN()
model.to(device)
model.load_state_dict(torch.load(file_name))
model.eval()

correct, total = 0, 0

for data in test_loader:
    text_data, label = data
    text_data, label = text_data.to(device), label.to(device)
    # 前向传播
    out = model(text_data)
    _, predicted = torch.max(out.data, 1)
    total += label.size(0)
    correct += (predicted == torch.argmax(label, 1)).sum().item()

#输出识别准确率
print('测试文本分类 准确率:{:.4f}%'.format(100.0 * correct / total)) 

测试文本分类 准确率:95.2900%


In [17]:
class RnnModel:
    def __init__(self):
        self.categories, self.cat_to_id = read_category()
        self.words, self.word_to_id = read_vocab('./dataset/cnews.vocab.txt')
        self.model = TextRNN()
        self.model.load_state_dict(torch.load('best_model.pt'))
 
    def predict(self, message):
        content = message
        data = [self.word_to_id[x] for x in content if x in self.word_to_id]
        data = kr.preprocessing.sequence.pad_sequences([data], 600)
        data = torch.LongTensor(data)
        y_pred_cls = self.model(data)
        class_index = torch.argmax(y_pred_cls[0]).item()
        return self.categories[class_index]
    
if __name__ == '__main__':
    model = RnnModel()
    test_demo = ['《时光重返四十二难》恶搞唐增取经一款时下最热门的动画人物：猪猪侠，加上创新的故事背景，震撼的操作快感，成就了这部恶搞新作，现正恶搞上市，玩家们抢先赶快体验快感吧。游戏简介：被时光隧道传送到208年的猪猪侠，必须经历六七四十二难的考验，才能借助柯伊诺尔大钻石的力量，开启时光隧道，重返2008年。在迷糊老师、菲菲公主的帮助下，猪猪侠接受了挑战，开始了这段充满了关心和情谊的旅程。    更多精彩震撼感觉，立即下载该款游戏尽情体验吧。玩家交流才是王道，讯易游戏玩家交流中心 QQ群：6306852-----------------生活要有激情，游戏要玩多彩(多彩游戏)。Colourfulgame (多彩游戏)，让你看看快乐游戏的颜色！精品推荐：1：《钟馗传》大战无头关羽，悲壮的剧情伴随各朝英灵反攻地府！2：《中华群英》将和赵云，项羽，岳飞等猛将作战，穿越各朝代抗击日寇。良品推荐：1：《赌王争霸之斗地主》易飞会在四角恋中会选择谁？是否最终成赌神呢？2：勇者后裔和魔王紧缠一起，前代恩怨《圣火伏魔录》将为您揭示一切。  3：颠覆传统概念，恶搞+非主流？！誓必弄死搞残为止《爆笑飞行棋》。4：《中国象棋残局大师》快棋和人机模式让畅快对弈！一切“多彩游戏”资讯，点击Colourfulgame官网http://www.colourfulgame.com一切“多彩游戏”感言，交流Colourfulgame论坛http://121.33.203.124/forum/【客服邮箱】：xunyiwangluo@126.com">xunyiwangluo@126.com">xunyiwangluo@126.com【客服热线】：020-87588437']
                 
    for i in test_demo:
      print(i,":",model.predict(i))

《时光重返四十二难》恶搞唐增取经一款时下最热门的动画人物：猪猪侠，加上创新的故事背景，震撼的操作快感，成就了这部恶搞新作，现正恶搞上市，玩家们抢先赶快体验快感吧。游戏简介：被时光隧道传送到208年的猪猪侠，必须经历六七四十二难的考验，才能借助柯伊诺尔大钻石的力量，开启时光隧道，重返2008年。在迷糊老师、菲菲公主的帮助下，猪猪侠接受了挑战，开始了这段充满了关心和情谊的旅程。    更多精彩震撼感觉，立即下载该款游戏尽情体验吧。玩家交流才是王道，讯易游戏玩家交流中心 QQ群：6306852-----------------生活要有激情，游戏要玩多彩(多彩游戏)。Colourfulgame (多彩游戏)，让你看看快乐游戏的颜色！精品推荐：1：《钟馗传》大战无头关羽，悲壮的剧情伴随各朝英灵反攻地府！2：《中华群英》将和赵云，项羽，岳飞等猛将作战，穿越各朝代抗击日寇。良品推荐：1：《赌王争霸之斗地主》易飞会在四角恋中会选择谁？是否最终成赌神呢？2：勇者后裔和魔王紧缠一起，前代恩怨《圣火伏魔录》将为您揭示一切。  3：颠覆传统概念，恶搞+非主流？！誓必弄死搞残为止《爆笑飞行棋》。4：《中国象棋残局大师》快棋和人机模式让畅快对弈！一切“多彩游戏”资讯，点击Colourfulgame官网http://www.colourfulgame.com一切“多彩游戏”感言，交流Colourfulgame论坛http://121.33.203.124/forum/【客服邮箱】：xunyiwangluo@126.com">xunyiwangluo@126.com">xunyiwangluo@126.com【客服热线】：020-87588437 : 游戏
