数据阶段
---
包括数据的下载，数据的预处理

In [1]:
import re
import os
import random
import tarfile
import urllib
from torchtext import data
import argparse
import datetime
import torch
import torchtext.data as data
import torchtext.datasets as datasets
import sys

In [2]:
class TarDataset(data.Dataset):
    @classmethod
    def download_or_unzip(cls, root):
        path = os.path.join(root, cls.dirname)
        if not os.path.isdir(path):
            tpath = os.path.join(root, cls.filename)
            if not os.path.isfile(tpath):
                print('downloading')
                urllib.request.urlretrieve(cls.url, tpath)
            with tarfile.open(tpath, 'r') as tfile:
                print('extracting')
                tfile.extractall(root)
        return os.path.join(path, '')


class MR(TarDataset):

    url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
    filename = 'rt-polaritydata.tar'
    dirname = 'rt-polaritydata'

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        def clean_str(string):
            """
            数据预处理，使用正则化进行处理数据
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        
        # 确定数据的格式
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
                examples += [data.Example.fromlist([line, 'negative'], fields) for line in f]
            with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
                examples += [data.Example.fromlist([line, 'positive'], fields) for line in f]
        super(MR, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs):
        path = cls.download_or_unzip(root)
        examples = cls(text_field, label_field, path=path, **kwargs).examples
        
        # 重新分布数据
        if shuffle: 
            random.shuffle(examples)
        dev_index = -1 * int(dev_ratio*len(examples))

        return (cls(text_field, label_field, examples=examples[:dev_index]),
                cls(text_field, label_field, examples=examples[dev_index:]))

In [3]:
# 导入配置文件

parser = argparse.ArgumentParser(description='CNN text classificer')
# learning
parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]')
parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]')
parser.add_argument('-batch-size', type=int, default=64, help='batch size for training [default: 64]')
parser.add_argument('-log-interval',  type=int, default=1,   help='how many steps to wait before logging training status [default: 1]')
parser.add_argument('-test-interval', type=int, default=100, help='how many steps to wait before testing [default: 100]')
parser.add_argument('-save-interval', type=int, default=500, help='how many steps to wait before saving [default:500]')
parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot')
parser.add_argument('-early-stop', type=int, default=1000, help='iteration numbers to stop without performance increasing')
parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance')
# data 
parser.add_argument('-shuffle', action='store_true', default=False, help='shuffle the data every epoch')
# model
parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]')
parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]')
parser.add_argument('-embed-dim', type=int, default=128, help='number of embedding dimension [default: 128]')
parser.add_argument('-kernel-num', type=int, default=100, help='number of each kind of kernel')
parser.add_argument('-kernel-sizes', type=str, default='3,4,5', help='comma-separated kernel size to use for convolution')
parser.add_argument('-static', action='store_true', default=False, help='fix the embedding')
# device
parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]')
# option
parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]')
parser.add_argument('-predict', type=str, default=None, help='predict the sentence given')
parser.add_argument('-test', action='store_true', default=False, help='train or test')


_StoreTrueAction(option_strings=['-test'], dest='test', nargs=0, const=True, default=False, type=None, choices=None, help='train or test', metavar=None)

In [4]:
# load MR dataset
def mr(text_field, label_field, **kargs):
    train_data, dev_data =MR.splits(text_field, label_field)
    text_field.build_vocab(train_data, dev_data)
    label_field.build_vocab(train_data, dev_data)
    train_iter, dev_iter = data.Iterator.splits(
                                (train_data, dev_data), 
                                batch_sizes=(args.batch_size, len(dev_data)),
                                **kargs)
    return train_iter, dev_iter

In [5]:
# 在jupyter notebook 中，
args = parser.parse_args(args=[])

In [6]:
print("Loading dataset....")
text_field=data.Field(lower=True)
label_field=data.Field(sequential=False)

train_iter,dev_iter=mr(text_field,label_field,device=-1,repeat=False)

Loading dataset....


The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [9]:
# 查看数据
print(list(train_iter)[0])
# 查看词汇数量
len(text_field.vocab)


[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.LongTensor of size 47x64]
	[.label]:[torch.LongTensor of size 64]


21108

In [10]:
# 更新预设的超参数
args.embed_num=len(text_field.vocab)
args.class_num=len(label_field.vocab)-1

# 设置kernel_size
args.kernel_sizes=[int(k) for k in args.kernel_sizes.split(',')]
args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

# 打印出模型设计的超参数
print("\nParameters:")
for attr, value in sorted(args.__dict__.items()):
    print("\t{}={}".format(attr.upper(), value))


Parameters:
	BATCH_SIZE=64
	CLASS_NUM=2
	DEVICE=-1
	DROPOUT=0.5
	EARLY_STOP=1000
	EMBED_DIM=128
	EMBED_NUM=21108
	EPOCHS=256
	KERNEL_NUM=100
	KERNEL_SIZES=[3, 4, 5]
	LOG_INTERVAL=1
	LR=0.001
	MAX_NORM=3.0
	PREDICT=None
	SAVE_BEST=True
	SAVE_DIR=snapshot\2019-07-09_17-21-42
	SAVE_INTERVAL=500
	SHUFFLE=False
	SNAPSHOT=None
	STATIC=False
	TEST=False
	TEST_INTERVAL=100


模型设计
----
关于text-cnn模型的设计

In [11]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [12]:
# 模型设计
class CNN_Text(nn.Module):
    def __init__(self, args):
        super(CNN_Text, self).__init__()
        self.args = args

        embed_num = args.embed_num
        embed_dim = args.embed_dim
        class_num = args.class_num
        input_channels = 1
        kernel_num = args.kernel_num
        kernel_sizes = args.kernel_sizes
        # 初始化embedding
        self.embed = nn.Embedding(embed_num, embed_dim)
        # 类似于append()
        self.convs1 = nn.ModuleList([nn.Conv2d(input_channels , kernel_num, (K, embed_dim)) for K in kernel_sizes])
        # 设置dropout层
        self.dropout = nn.Dropout(args.dropout)
        # 最后的全连接层
        self.fc1 = nn.Linear(len(kernel_sizes)*kernel_num, class_num)

    def conv_and_pool(self, x, conv):
        # 激活函数
        x = F.relu(conv(x)).squeeze(3)
        # 最大池化
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)
        if self.args.static:
            x = Variable(x)
        x = x.unsqueeze(1)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        logit = self.fc1(x)
        return logit


模型训练以及初始化
----


In [13]:
cnn=CNN_Text(args)
if args.snapshot is not None:
    print("Loading model from {} .....".format(args.snapshot))
    cnn.load_state_dict(torch.load(args.snapshot))

In [24]:
def save(model, save_dir, save_prefix, steps):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_prefix = os.path.join(save_dir, save_prefix)
    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
    torch.save(model.state_dict(), save_path)

In [34]:
# 模型训练四步法
def train(train_iter, dev_iter, model, args):
    """
    train的四个步骤要遵守
    """
    # 确定优化器
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, args.epochs+1):
        for batch in train_iter:
            feature, target = batch.text, batch.label
            feature=feature.data.t()
            target=target.data.sub(1)  # batch first, index align

            optimizer.zero_grad()
            logit = model(feature)
            # 确定损失函数为交叉熵损失函数
            loss = F.cross_entropy(logit, target)
            loss.backward()
            optimizer.step()

            steps += 1
            # 输出日志
            if steps % args.log_interval == 0:
                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
                accuracy = 100.0 * corrects/batch.batch_size
                sys.stdout.write(
                    '\rBatch[{}] - loss: {}  acc: {}%({}/{})'.format(steps, 
                                                                             loss.item,
                                                                             accuracy,
                                                                             corrects,
                                                                             batch.batch_size))
            if steps % args.test_interval == 0:
                dev_acc = eval(dev_iter, model, args)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    if args.save_best:
                        save(model, args.save_dir, 'best', steps)
                else:
                    if steps - last_step >= args.early_stop:
                        print('early stop by {} steps.'.format(args.early_stop))
            elif steps % args.save_interval == 0:
                save(model, args.save_dir, 'snapshot', steps)


In [None]:
def eval(data_iter, model, args):
    """
    验证
    """
    model.eval()
    corrects, avg_loss = 0, 0
    for batch in data_iter:
        feature, target = batch.text, batch.label
        feature=feature.data.t()
        target=target.data.sub(1)  # batch first, index align

        logit = model(feature)
        loss = F.cross_entropy(logit, target, size_average=False)

        avg_loss += loss.item()
        corrects += (torch.max(logit, 1)
                     [1].view(target.size()).data == target.data).sum()

    size = len(data_iter.dataset)
    avg_loss /= size
    accuracy = 100.0 * corrects/size
    print('\nEvaluation - loss: {}  acc: {}%({}/{}) \n'.format(avg_loss, 
                                                                       accuracy, 
                                                                       corrects, 
                                                                       size))
    return accuracy


def predict(text, model, text_field, label_feild, cuda_flag):
    """
    预测
    """
    assert isinstance(text, str)
    model.eval()
    # text = text_field.tokenize(text)
    text = text_field.preprocess(text)
    text = [[text_field.vocab.stoi[x] for x in text]]
    x = torch.tensor(text)
    x = autograd.Variable(x)
    if cuda_flag:
        x = x.cuda()
    print(x)
    output = model(x)
    _, predicted = torch.max(output, 1)
    #return label_feild.vocab.itos[predicted.data[0][0]+1]
    return label_feild.vocab.itos[predicted.data[0]+1]

In [None]:
# 开始训练
train(train_iter,dev_iter,cnn,args)

Batch[100] - loss: <built-in method item of Tensor object at 0x0000013E92FE6948>  acc: 64%(41/64)




Evaluation - loss: 0.599169  acc: 67.0000%(720/1066) 

Batch[200] - loss: <built-in method item of Tensor object at 0x0000013E92FE9AB0>  acc: 95%(61/64))
Evaluation - loss: 0.591676  acc: 70.0000%(751/1066) 

Batch[300] - loss: <built-in method item of Tensor object at 0x0000013E92FF4048>  acc: 93%(56/60)
Evaluation - loss: 0.574262  acc: 70.0000%(751/1066) 

Batch[400] - loss: <built-in method item of Tensor object at 0x0000013E92FE9AB0>  acc: 100%(64/64)
Evaluation - loss: 0.608738  acc: 71.0000%(766/1066) 

Batch[500] - loss: <built-in method item of Tensor object at 0x0000013E92FE6900>  acc: 100%(64/64)
Evaluation - loss: 0.635403  acc: 72.0000%(776/1066) 

Batch[600] - loss: <built-in method item of Tensor object at 0x0000013E92FF4828>  acc: 100%(60/60)
Evaluation - loss: 0.665352  acc: 71.0000%(762/1066) 

Batch[700] - loss: <built-in method item of Tensor object at 0x0000013E92FF4168>  acc: 100%(64/64)
Evaluation - loss: 0.699952  acc: 72.0000%(773/1066) 

Batch[800] - loss: <b

early stop by 1000 steps.
Batch[4900] - loss: <built-in method item of Tensor object at 0x0000013E92FE6CF0>  acc: 100%(64/64)
Evaluation - loss: 1.059559  acc: 72.0000%(777/1066) 

early stop by 1000 steps.
Batch[5000] - loss: <built-in method item of Tensor object at 0x0000013E92FE9AB0>  acc: 100%(64/64)
Evaluation - loss: 1.061943  acc: 72.0000%(778/1066) 

early stop by 1000 steps.
Batch[5062] - loss: <built-in method item of Tensor object at 0x0000013E92FE9E10>  acc: 100%(64/64)