## 文本分类

### 导入库

In [1]:
import torchtext
from torchtext.datasets import AG_NEWS
import torch
import torch.nn as nn  
import matplotlib.pyplot as plt  
from tqdm import tqdm  
import numpy as np   

  from .autonotebook import tqdm as notebook_tqdm


### 下载数据集以及统计数据集的形式  

In [17]:
#下载数据集
train_data = AG_NEWS(split='train')
test_data = AG_NEWS(split='test')
for i in train_data:#看一下训练数据的结构
    print(i)
    break
#这里可以看出结构就是(class,text)
classes=[i[0] for i in train_data]
classes=list(set(classes))
print(classes)
class_num=len(classes)
#这里看出一共是四个类
num=0
text=""
for i in train_data:
    num+=1  
    text+=i[1]
average_length=len(text)/num  
print(average_length)
#这里可以得到平均长度(这里是字符串长度)

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")
[1, 2, 3, 4]
236.477525
120000


### 利用torchtext内置的工具进行文本预处理  

- 先定义一些文本处理工具

In [3]:
from torchtext.data.utils import get_tokenizer  
from torchtext.vocab import build_vocab_from_iterator  
#拿到分词器    
tokenizer = get_tokenizer('basic_english')  
#这里计算一下text的单词平均长度
length=0  
num=0  
for i in train_data:
    num+=1
    length+=len(tokenizer(i[1]))
print(length//num)
avg_len=length//num  
#创建词汇表  
vocabulary=build_vocab_from_iterator(map(lambda x:tokenizer(x[1]),train_data),specials=["|NONE|"])
vocabulary.set_default_index(vocabulary["|NONE|"])#设置未出现单词的默认索引  

43


- 定义loader
> 这里还需要进行预处理，即把对应的文本转换为embedding

In [4]:
import torch.utils
import torch.utils.data
from torch.nn.utils.rnn import pad_sequence
def preprocess(batch):#这里按批次处理数据  
    labels,texts=zip(*batch)
    labels=torch.tensor(labels)
    tokens=[]
    #这里要统一text长度，我是按照平均长度  
    for text in texts:
        #获取text长度
        l=len(tokenizer(text))
        add_l=avg_len-l  
        tmp=vocabulary(tokenizer(text))[:avg_len]  
        tmp+=[vocabulary["|NONE|"] for i in range(0,add_l)]
        tokens.append(tmp)
    tokens=torch.Tensor(tokens)
    return labels,tokens    
train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, collate_fn=preprocess)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=32, collate_fn=preprocess)

### 定义评测函数  

In [26]:
def evaluate(model,loader):
    model.eval()
    right_num=0
    total_num=0
    for label,data in loader:
        outputs=model(data)
        outputs=outputs.reshape(-1,).detach().numpy().tolist()
        labels=label.reshape(-1,).detach().numpy().tolist()
        for i in range(len(labels)):
            total_num+=1
            if(labels[i]==outputs[i]):
                right_num+=1
    return right_num/total_num  

### 定义训练流程

In [35]:
def train(title,model,train_loader,test_loader,criterion,optimizer,epoch_num):
    loss_list=[]
    acc_list=[]
    print("Train {}".format(title))
    for epoch in range(epoch_num):
        model.train()
        tmp_loss=[]
        num=0
        for labels, texts in tqdm(train_loader):
            num+=1
            if(num>=50):
                break
            optimizer.zero_grad()
            outputs = model(texts)
            labels-=1#这里有坑因为给的标签是从1开始的
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            tmp_loss.append(float(loss))
        #获取这一轮的平均损失
        loss_list.append(sum(tmp_loss)/len(tmp_loss))
        #获取这一轮的在验证集上正确率
        acc_list.append(evaluate(model,test_loader))
    plt.plot(np.linspace(0,len(loss_list)+1,len(loss_list)),loss_list,c='r')
    plt.title(title)
    plt.show()
    plt.plot(np.linspace(0,len(acc_list)+1,len(acc_list)),acc_list,c='g')  
    plt.title(title)
    plt.show()  

### 基于全连接的模型

In [None]:
class FCModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, class_num):
        super(FCModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, class_num)
    def forward(self, x):
        x=x.long()
        x = self.embedding(x)
        x = x.mean(dim=1)  #使用平均池化
        x = self.fc(x)
        return x
fc_model = FCModel(len(vocabulary), 100, class_num)
criterion=nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(fc_model.parameters())
train("fc model",fc_model,train_loader,test_loader,criterion,optimizer,10)

### 基于CNN的模型

In [36]:
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, class_num):
        super(CNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv2d(1, 100, (3, embed_dim))
        self.fc = nn.Linear(100, class_num)
    def forward(self, x):
        x=x.long()
        x = self.embedding(x).unsqueeze(1)#添加通道维度
        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = nn.functional.max_pool1d(x.squeeze(3), x.size(3)).squeeze(2)#池化
        x = self.fc(x)
        return x
cnn_model = CNNModel(len(vocabulary), 100, class_num)
criterion=nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn_model.parameters())
train("fc model",cnn_model,train_loader,test_loader,criterion,optimizer,10)

Train fc model


0it [00:00, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (3200x41 and 100x4)