In [1]:
import numpy as np
import pandas as pd
import torch
import os
import logging
import torchtext
from IPython.display import display
from trainer.supervised_trainer import SupervisedTrainer
from models.seq2seq import EncoderRNN, DecoderRNN, Seq2seq
from seq2seq.utils import SourceField, TargetField
from seq2seq.optim import Optimizer
from seq2seq.loss import Perplexity
from seq2seq.evaluator import Predictor
from torchtext.data import Field
from nltk.tokenize import RegexpTokenizer
from torchtext.data import TabularDataset
from seq2seq.utils import Checkpoint
import csv
import shutil



In [2]:
data_dir = '../data/'
file_name = 'train_data.csv'
dev_name = 'val_data.csv'
train_data = pd.read_csv(os.path.join(data_dir, file_name), encoding='utf-8')
display(train_data.head(n=2))
csv.field_size_limit(100000000)

Unnamed: 0,content,title,id
0,This was the year that felt like 50 years. We ...,21 Stories Our Readers Loved in 2017,0
1,Gary Vaynerchuk once told a 20 year old Taylor...,What To Do After Graduating College,1


131072

In [3]:
to_remove = "0123456789"
table = {ord(char): None for char in to_remove}
def tokenizer(sentences):
    sentences = sentences.lower()
#     sentences = sentences.translate(table)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentences)
    filtered_words = [w for w in tokens]
    return filtered_words

max_encoder_len = 800
min_decoder_len = 1
content, title = SourceField(tokenize=tokenizer), TargetField(tokenize=tokenizer)
def len_filter(example):
    return len(example.content) <= max_encoder_len and len(example.title) >= min_decoder_len

In [4]:
%%time
tv_datafields = [('content', content), ('title', title), ('id', None)]  # must order the data format with the csv file.
trn = TabularDataset(path=os.path.join(data_dir, file_name), 
                     format='csv', fields=tv_datafields, skip_header=True,
                     filter_pred=len_filter)

dev = TabularDataset(path=os.path.join(data_dir, dev_name),
                    format='csv', fields = tv_datafields, skip_header=True,
                    filter_pred=len_filter)

CPU times: user 3min 8s, sys: 20.1 s, total: 3min 28s
Wall time: 3min 27s


In [10]:
content.build_vocab(trn, max_size = 50000)
title.build_vocab(dev, max_size=20000)

In [11]:
display(content.vocab.freqs.most_common(10))
display(title.vocab.freqs.most_common(10))

[('the', 7215050),
 ('to', 3798408),
 ('and', 3324064),
 ('a', 3130495),
 ('of', 2999731),
 ('in', 2567508),
 ('s', 1543622),
 ('that', 1397337),
 ('for', 1383377),
 ('is', 1297595)]

[('<sos>', 16625),
 ('<eos>', 16625),
 ('to', 4879),
 ('the', 3629),
 ('in', 3310),
 ('s', 3285),
 ('of', 2753),
 ('for', 2532),
 ('a', 1957),
 ('and', 1853)]

In [13]:
input_vocab = content.vocab
output_vocab = title.vocab

**建立预训练的embedding matrix**

In [40]:
# load the pretrainde model here to initialize the embedding matrix here.
word_to_vec_path = '../data/embedding_matrix/glove/glove.6B.200d.txt'
def get_eng_vec(path= word_to_vec_path):
    word_to_vec = dict()
    with open(path, 'r') as f:
        for line in f:
            line=line.split(' ')
            word_to_vec[line[0]]= [float(f) for f in line[1:]]
    return word_to_vec

word_to_vec = get_eng_vec()

encoder_embedding_matrix = np.random.randn(len(content.vocab), 200)
display(encoder_embedding_matrix.shape)
unknow_words = []
for index in range(encoder_embedding_matrix.shape[0]):
    word = content.vocab.itos[index]
    try:
        vector = word_to_vec[word]
        encoder_embedding_matrix[index, ] = vector
    except KeyError:
        unknow_words.append(word)
print("find {} unknow words in encoder vocab".format(len(unknow_words)))
#########################################################################
#########################################################################
decoder_embedding_matrix = np.random.randn(len(title.vocab), 200)
display(decoder_embedding_matrix.shape)
unknow_words = 0
for index in range(decoder_embedding_matrix.shape[0]):
    word = title.vocab.itos[index]
    try: # try to find the word in the word_to_vec:
        vector = word_to_vec[word]
        decoder_embedding_matrix[index, ] = vector
    except KeyError:
        unknow_words += 1
        pass
print("find {} unknow words in word2vec".format(unknow_words))

np.save('../data/embedding_matrix/encoder_embedding_{}_200.npy'.format(len(content.vocab)), encoder_embedding_matrix)
np.save('../data/embedding_matrix/decoder_embedding_{}_200.npy'.format(len(title.vocab)),decoder_embedding_matrix)

(50002, 200)

find 2446 unknow words in encoder vocab


(20002, 200)

find 1186 unknow words in word2vec


In [42]:
# build the model here.
weight = torch.ones(len(title.vocab))
pad = title.vocab.stoi[title.pad_token]
loss = Perplexity(weight, pad)
seq2seq = None
optimizer = None
hidden_size = 100
bidirectional = True
embedding_dim = 200
# must notice to convert the embedding matrix to float32, by default, numpy
# just convert the data as float64, which is double format
encoder_embedding_matrix = torch.from_numpy(np.load('../data/embedding_matrix/encoder_embedding_50002_200.npy').astype('float32'))
decoder_embedding_matrix = torch.from_numpy(np.load('../data/embedding_matrix/decoder_embedding_20002_200.npy').astype('float32'))
print(decoder_embedding_matrix.shape)
print(len(output_vocab))
# add the pretrained embedding here
# encoder_embedding = torch.from_numpy(np.load('../data/encoder_embedding_50000_100.npy'))
# decoder_embedding = torch.from_numpy(np.load('../data/decoder_embedding_50000_100.npy'))
# display(encoder_embedding.to('cuda'))

torch.Size([20002, 200])
20002




In [43]:
encoder = EncoderRNN(len(content.vocab), max_encoder_len, hidden_size, bidirectional=bidirectional, 
                     dropout_p=0.2, n_layers=2, variable_lengths=True,embedding=encoder_embedding_matrix,
                     embedding_dim = embedding_dim, update_embedding=True)
decoder = DecoderRNN(len(title.vocab), 20, embedding_dim, hidden_size*2 if bidirectional else hidden_size, dropout_p=0.2, n_layers=2, use_attention=True, 
                     bidirectional=bidirectional, eos_id = title.eos_id, sos_id = title.sos_id, embedding=decoder_embedding_matrix)

In [44]:
loss.cuda()
my_seq2seq =Seq2seq(encoder, decoder)
my_seq2seq.cuda()

In [45]:
for param in my_seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

In [46]:
# device = 0 to use the GPU:0
t = SupervisedTrainer(loss = loss, batch_size=40, checkpoint_every=3e4, print_every=100, expt_dir='../data', device=0)

In [None]:
my_seq2seq = t.train(my_seq2seq, trn, num_epochs=2, optimizer=optimizer, teacher_forcing_ratio=0.5)

torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32
torch.float32
cpu
torch.float32


In [None]:
from torch.autograd import Variable


class Predictor(object):

    def __init__(self, model, src_vocab, tgt_vocab):
        """
        Predictor class to evaluate for a given model.
        Args:
            model (seq2seq.models): trained model. This can be loaded from a checkpoint
                using `seq2seq.util.checkpoint.load`
            src_vocab (seq2seq.dataset.vocabulary.Vocabulary): source sequence vocabulary
            tgt_vocab (seq2seq.dataset.vocabulary.Vocabulary): target sequence vocabulary
        """
        if torch.cuda.is_available():
            self.model = model.cuda()
        else:
            self.model = model.cpu()
        self.model.eval()
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def get_decoder_features(self, src_seq):
        src_id_seq = torch.LongTensor([self.src_vocab.stoi[tok] for tok in src_seq]).view(1, -1)
        if torch.cuda.is_available():
            src_id_seq = src_id_seq.cuda()

        with torch.no_grad():
            softmax_list, _, other = self.model(src_id_seq, [len(src_seq)])

        return other

    def predict(self, src_seq):
        """ Make prediction given `src_seq` as input.

        Args:
            src_seq (list): list of tokens in source language

        Returns:
            tgt_seq (list): list of tokens in target language as predicted
            by the pre-trained model
        """
        other = self.get_decoder_features(src_seq)

        length = other['length'][0]

        tgt_id_seq = [other['sequence'][di][0].data[0] for di in range(length)]
        tgt_seq = [self.tgt_vocab.itos[tok] for tok in tgt_id_seq]
        return tgt_seq

    def predict_n(self, src_seq, n=1):
        """ Make 'n' predictions given `src_seq` as input.

        Args:
            src_seq (list): list of tokens in source language
            n (int): number of predicted seqs to return. If None,
                     it will return just one seq.

        Returns:
            tgt_seq (list): list of tokens in target language as predicted
                            by the pre-trained model
        """
        other = self.get_decoder_features(src_seq)

        result = []
        for x in range(0, int(n)):
            length = other['topk_length'][0][x]
            tgt_id_seq = [other['topk_sequence'][di][0, x, 0].data[0] for di in range(length)]
            tgt_seq = [self.tgt_vocab.itos[tok] for tok in tgt_id_seq]
            result.append(tgt_seq)

        return result

In [None]:
predicor = Predictor(my_seq2seq, input_vocab, output_vocab)

In [None]:
test_data = pd.read_csv('../data/test_data.csv', encoding='utf-8')
test_contents = list(test_data.content)
# test_titles = list(test_data.title)[:20]
# just use the former 20 sets as the 
test_contents =  [tokenizer(content) for content in test_contents]
test_results = []
for index, content_ in enumerate(test_contents):
    test_title = ' '.join(predicor.predict(content_))
    test_results.append(test_title)

In [None]:
for index, title_ in enumerate(test_results):
    print(title_)
    if index == 20:
        break

In [None]:
test_results = [item[:-6] for item in test_results]
for index, title_ in enumerate(test_results):
    print(title_)
    if index == 20:
        break

In [None]:
test_data_dir = '../data/result'
if not os.path.exists(test_data_dir):
    os.mkdir(test_data_dir)
else:
    shutil.rmtree(test_data_dir)
    os.mkdir(test_data_dir)

for i in range(len(test_results)):
    with open(os.path.join(test_data_dir, str(i+1)+'.txt'), 'w') as f:
        f.write(test_results[i])

In [None]:
# to check the unk and pad token in the 
display(content.vocab)
display(title.vocab)

In [None]:
display(len(title.vocab))  # the len method will drop the eos and sos token in the sentneces