In [3]:
# Dataset files: build the paths to the source/target text files under ./data
import texar.torch as tx
import os
# Directory holding the corpus, and the two text files joined onto it.
root = './data'
source_file, target_file = (
    os.path.join(root, filename)
    for filename in ('sources.txt', 'targets.txt')
)

In [29]:
# Set hyperparameters and configuration
import torch
import texar.torch as tx

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Training schedule settings.
num_epochs, display = 4, 50

# Vocabulary files for the source and target sides.
source_vocab_file = './data/vocab.sources.txt'
target_vocab_file = './data/vocab.targets.txt'
# Hyperparameters for the paired (source/target) training dataset:
# batch size plus the text file and vocabulary file of each side.
train = {'batch_size': 32}
train['source_dataset'] = {
    'files': source_file,
    'vocab_file': source_vocab_file,
}
train['target_dataset'] = {
    'files': target_file,
    'vocab_file': target_vocab_file,
}
# One width shared by the embeddings, RNN cells and attention layers.
num_units = 256

# Word-embedding hyperparameters.
embedder = {'dim': num_units}

# Decoder hyperparameters: RNN cell, attention settings, and the cap on
# the number of decoding steps at inference time.
decoders = {'rnn_cell': {'kwargs': {'num_units': num_units}}}
decoders['attention'] = {
    'kwargs': {'num_units': num_units},
    'attention_layer_size': num_units,
}
decoders['max_decoding_length_infer'] = 60

# Encoder hyperparameters: only the forward cell is configured here.
encoders = {'rnn_cell_fw': {'kwargs': {'num_units': num_units}}}

In [30]:
# Process the data: build the paired text dataset from the `train`
# hyperparameters and place it on the selected device.
train_data = tx.data.PairedTextData(
    hparams=train,
    device=device,
)


In [31]:
# Quick inspection of the loaded dataset. Only the value of the final
# expression is echoed as the cell output, so the type() result on the
# first line is evaluated but not displayed.
type(train_data)
# Size of the source-side vocabulary.
train_data.source_vocab.size

24

In [40]:
import torch
import torch.nn as nn
import texar.torch as tx


class Seq2SeqAttn(nn.Module):
    """Attentional sequence-to-sequence model assembled from texar modules.

    Source and target word embedders feed a bidirectional RNN encoder and an
    attention RNN decoder. Module hyperparameters are read from the
    notebook-level dicts ``embedder``, ``encoders`` and ``decoders``.

    Args:
        train_data: Dataset object exposing ``source_vocab`` and
            ``target_vocab``. Their ``size`` attributes and the target
            vocabulary's ``bos_token_id`` / ``eos_token_id`` are read here.
    """

    def __init__(self, train_data):
        super().__init__()
        self.source_vocab_size = train_data.source_vocab.size
        self.target_vocab_size = train_data.target_vocab.size

        # Stored for later use; not referenced by the training path below.
        self.bos_token_id = train_data.target_vocab.bos_token_id
        self.eos_token_id = train_data.target_vocab.eos_token_id

        self.source_embedder = tx.modules.WordEmbedder(
            vocab_size=self.source_vocab_size, hparams=embedder)
        self.target_embedder = tx.modules.WordEmbedder(
            vocab_size=self.target_vocab_size, hparams=embedder)

        # The notebook defines the encoder hyperparameters as `encoders`;
        # the previous reference to `encoder` was an undefined name.
        self.encoder = tx.modules.BidirectionalRNNEncoder(
            input_size=self.source_embedder.dim,
            hparams=encoders)

        # The decoder attends over the concatenated outputs of both encoder
        # directions, hence the summed hidden sizes.
        self.decoder = tx.modules.AttentionRNNDecoder(
            token_embedder=self.target_embedder,
            encoder_output_size=(self.encoder.cell_fw.hidden_size +
                                 self.encoder.cell_bw.hidden_size),
            input_size=self.target_embedder.dim,
            vocab_size=self.target_vocab_size,
            hparams=decoders)

    def forward(self, batch, mode):
        """Compute the training loss for one batch.

        Args:
            batch: Mapping providing ``source_text_ids``, ``source_length``,
                ``target_text_ids`` and ``target_length``.
            mode: Only ``"train"`` is implemented.

        Returns:
            The loss returned by
            ``tx.losses.sequence_sparse_softmax_cross_entropy`` for the
            decoder logits against the shifted target ids.

        Raises:
            ValueError: If ``mode`` is anything other than ``"train"``
                (previously this silently returned ``None``).
        """
        if mode != "train":
            raise ValueError(
                "Unsupported mode {!r}; only 'train' is implemented.".format(mode))

        enc_outputs, _ = self.encoder(
            inputs=self.source_embedder(batch['source_text_ids']),
            sequence_length=batch['source_length'],
        )
        # Join the encoder outputs along dim 2 to form the attention memory.
        memory = torch.cat(enc_outputs, dim=2)

        # The decoder consumes every target token except the last and is
        # scored against the targets shifted left by one position, so the
        # effective sequence length is one shorter.
        shifted_length = batch['target_length'] - 1

        helper_train = self.decoder.create_helper(
            decoding_strategy="train_greedy")
        training_outputs, _, _ = self.decoder(
            memory=memory,
            memory_sequence_length=batch['source_length'],
            helper=helper_train,
            inputs=batch['target_text_ids'][:, :-1],
            sequence_length=shifted_length)
        mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy(
            labels=batch['target_text_ids'][:, 1:],
            logits=training_outputs.logits,
            sequence_length=shifted_length)
        return mle_loss

In [41]:
# Build the model and move its parameters to the selected device; the
# trailing bare name echoes the module structure as the cell output.
model = Seq2SeqAttn(train_data).to(device)
model

Seq2SeqAttn(
  (source_embedder): WordEmbedder(
    vocab_size=24, embedding_dim=256
    (_dropout_layer): EmbeddingDropout()
  )
  (target_embedder): WordEmbedder(
    vocab_size=24, embedding_dim=256
    (_dropout_layer): EmbeddingDropout()
  )
  (encoder): BidirectionalRNNEncoder(
    (_cell_fw): LSTMCell(
      (_cell): LSTMCell(256, 256)
    )
    (_cell_bw): LSTMCell(
      (_cell): LSTMCell(256, 256)
    )
  )
  (decoder): AttentionRNNDecoder(
    (_token_embedder): WordEmbedder(
      vocab_size=24, embedding_dim=256
      (_dropout_layer): EmbeddingDropout()
    )
    (_cell): AttentionWrapper(
      (_cell): LSTMCell(
        (_cell): LSTMCell(512, 256)
      )
      (_attention_layers): ModuleList(
        (0): Linear(in_features=768, out_features=256, bias=False)
      )
    )
    (_output_layer): Linear(in_features=256, out_features=24, bias=True)
    (attention_mechanism): LuongAttention(
      (_memory_layer): Linear(in_features=512, out_features=256, bias=False)
    )
 

In [49]:
# The same dataset backs the train, validation and test splits.
data_iterator = tx.data.TrainTestDataIterator(
    train=train_data,
    val=train_data,
    test=train_data,
)

# Optimizer hyperparameters: Adam with a learning rate of 0.001.
opt = {'optimizer': {'type': 'Adam', 'kwargs': {'lr': 0.001}}}

# Callable that performs one optimization update over the model parameters.
train_op = tx.core.get_train_op(
    params=model.parameters(),
    hparams=opt,
)
def _train_epoch():
    """Run one pass over the training split, printing the loss every 100 steps.

    Relies on the notebook-level ``data_iterator``, ``model`` and
    ``train_op`` objects.
    """
    data_iterator.switch_to_train_data()
    model.train()
    for step, batch in enumerate(data_iterator):
        loss = model(batch, mode="train")
        loss.backward()
        # train_op comes from tx.core.get_train_op; presumably it applies the
        # optimizer step and clears the gradients -- confirm in texar docs.
        train_op()
        if not step % 100:
            print("step = {},loss={:.4f}".format(step, loss))

In [50]:
# Train for the configured number of epochs. The earlier version looped
# over range(100) but broke out unconditionally after the first epoch,
# which ignored the `num_epochs` setting.
for epoch in range(num_epochs):
    _train_epoch()

step = 0,loss=39.2355
step = 100,loss=1.3169
step = 200,loss=0.7944
step = 300,loss=1.4363


AttributeError: '_SPDataLoaderIter' object has no attribute 'dataset'