In [1]:
import numpy as np
import pandas as pd
import torch
import os
import logging
import torchtext
from IPython.display import display
from trainer import SupervisedTrainer
from models import EncoderRNN, DecoderRNN, Seq2seq
from dataset import SourceField, TargetField
from optim import Optimizer
from loss import Perplexity
from evaluator import Predictor
from torchtext.data import Field
from nltk.tokenize import RegexpTokenizer
from torchtext.data import TabularDataset
from util.checkpoint import Checkpoint
import csv
import shutil



In [2]:
data_dir = '../data/'
file_name = 'train_data.csv'
dev_name = 'val_data.csv'
train_data = pd.read_csv(os.path.join(data_dir, file_name), encoding='utf-8')
display(train_data.head(n=2))
csv.field_size_limit(100000000)

Unnamed: 0,content,title,id
0,This was the year that felt like 50 years. We ...,21 Stories Our Readers Loved in 2017,0
1,Gary Vaynerchuk once told a 20 year old Taylor...,What To Do After Graduating College,1


131072

In [3]:
def tokenizer(sentences):
    sentences = sentences.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentences)
    filtered_words = [w for w in tokens]
    return filtered_words

max_encoder_len = 800
min_decoder_len = 1
content, title = SourceField(tokenize=tokenizer), TargetField(tokenize=tokenizer)
def len_filter(example):
    return len(example.content) <= max_encoder_len and len(example.title) >= min_decoder_len

In [4]:
%%time
tv_datafields = [('content', content), ('title', title), ('id', None)]  # must order the data format with the csv file.
trn = TabularDataset(path=os.path.join(data_dir, file_name), 
                     format='csv', fields=tv_datafields, skip_header=True,
                     filter_pred=len_filter)

dev = TabularDataset(path=os.path.join(data_dir, dev_name),
                    format='csv', fields = tv_datafields, skip_header=True,
                    filter_pred=len_filter)

CPU times: user 2min 31s, sys: 10.5 s, total: 2min 41s
Wall time: 2min 41s


In [5]:
content.build_vocab(trn, max_size = 50000)
title.build_vocab(trn, max_size=50000)

In [6]:
display(content.vocab.freqs.most_common(10))
display(title.vocab.freqs.most_common(10))

[('the', 7215050),
 ('to', 3798408),
 ('and', 3324064),
 ('a', 3130495),
 ('of', 2999731),
 ('in', 2567508),
 ('s', 1543622),
 ('that', 1397337),
 ('for', 1383377),
 ('is', 1297595)]

[('<sos>', 322358),
 ('<eos>', 322358),
 ('to', 93079),
 ('the', 71872),
 ('in', 64768),
 ('s', 61315),
 ('of', 51853),
 ('for', 47829),
 ('a', 40793),
 ('and', 36461)]

In [7]:
input_vocab = content.vocab
output_vocab = title.vocab

In [8]:
# build the model here.
weight = torch.ones(len(title.vocab))
pad = title.vocab.stoi[title.pad_token]
loss = Perplexity(weight, pad)
loss.cuda()
seq2seq = None
optimizer = None
hidden_size = 100
bidirectional = True
# add the pretrained embedding here
# encoder_embedding = torch.from_numpy(np.load('../data/encoder_embedding_50000_100.npy'))
# decoder_embedding = torch.from_numpy(np.load('../data/decoder_embedding_50000_100.npy'))
# display(encoder_embedding.to('cuda'))



In [9]:
encoder = EncoderRNN(len(content.vocab), max_encoder_len, hidden_size, bidirectional=bidirectional, dropout_p=0.2, n_layers=2,
                     variable_lengths=True, update_embedding=True)
decoder = DecoderRNN(len(title.vocab), 20, hidden_size*2 if bidirectional else hidden_size, dropout_p=0.2, n_layers=2, use_attention=True, 
                     bidirectional=bidirectional, eos_id = title.eos_id, sos_id = title.sos_id)

In [10]:
my_seq2seq =Seq2seq(encoder, decoder)
my_seq2seq.cuda()

Seq2seq(
  (encoder): EncoderRNN(
    (input_dropout): Dropout(p=0)
    (embedding): Embedding(50002, 100)
    (rnn): GRU(100, 100, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (decoder): DecoderRNN(
    (input_dropout): Dropout(p=0)
    (rnn): GRU(200, 200, num_layers=2, batch_first=True, dropout=0.2)
    (embedding): Embedding(50002, 200)
    (attention): Attention(
      (linear_out): Linear(in_features=400, out_features=200, bias=True)
    )
    (out): Linear(in_features=200, out_features=50002, bias=True)
  )
)

In [11]:
for param in my_seq2seq.parameters():
    param.data.uniform_(-0.08, 0.08)

In [12]:
t = SupervisedTrainer(loss = loss, batch_size=40, checkpoint_every=3e4, print_every=6e2, expt_dir='../data', device=0)

In [31]:
my_seq2seq = t.train(my_seq2seq, trn, num_epochs=4, dev_data=dev, optimizer=optimizer, teacher_forcing_ratio=0.5)

Progress: 3%, Train Perplexity: 107.9859
Progress: 5%, Train Perplexity: 60.6364
Progress: 7%, Train Perplexity: 61.6174
Progress: 9%, Train Perplexity: 61.7865
Progress: 11%, Train Perplexity: 65.2091
Progress: 13%, Train Perplexity: 64.2776
Progress: 14%, Train Perplexity: 62.4434
Progress: 16%, Train Perplexity: 64.6698
Progress: 18%, Train Perplexity: 65.5042
Progress: 20%, Train Perplexity: 64.4328
Progress: 22%, Train Perplexity: 65.2106
Progress: 24%, Train Perplexity: 65.5941


  return Variable(arr, volatile=not train), lengths
  return Variable(arr, volatile=not train)


Finished epoch 1: Train Perplexity: 62.4367, Dev Perplexity: 522.2506, Accuracy: 0.1508
Progress: 26%, Train Perplexity: 54.5309
Progress: 27%, Train Perplexity: 46.1106
Progress: 29%, Train Perplexity: 46.8940
Progress: 31%, Train Perplexity: 49.2396
Progress: 33%, Train Perplexity: 50.0597
Progress: 35%, Train Perplexity: 48.6186
Progress: 37%, Train Perplexity: 53.5088
Progress: 39%, Train Perplexity: 54.0555
Progress: 40%, Train Perplexity: 52.5691
Progress: 42%, Train Perplexity: 56.2621
Progress: 44%, Train Perplexity: 54.5354
Progress: 46%, Train Perplexity: 56.5594
Progress: 48%, Train Perplexity: 56.0629
Finished epoch 2: Train Perplexity: 51.9704, Dev Perplexity: 530.9379, Accuracy: 0.1518
Progress: 50%, Train Perplexity: 54.4689
Progress: 52%, Train Perplexity: 40.1282
Progress: 53%, Train Perplexity: 39.7388
Progress: 55%, Train Perplexity: 43.3713
Progress: 57%, Train Perplexity: 44.5111
Progress: 59%, Train Perplexity: 46.4257
Progress: 61%, Train Perplexity: 45.8141
Prog

In [14]:
from torch.autograd import Variable


class Predictor(object):

    def __init__(self, model, src_vocab, tgt_vocab):
        """
        Predictor class to evaluate for a given model.
        Args:
            model (seq2seq.models): trained model. This can be loaded from a checkpoint
                using `seq2seq.util.checkpoint.load`
            src_vocab (seq2seq.dataset.vocabulary.Vocabulary): source sequence vocabulary
            tgt_vocab (seq2seq.dataset.vocabulary.Vocabulary): target sequence vocabulary
        """
        if torch.cuda.is_available():
            self.model = model.cuda()
        else:
            self.model = model.cpu()
        self.model.eval()
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def get_decoder_features(self, src_seq):
        src_id_seq = torch.LongTensor([self.src_vocab.stoi[tok] for tok in src_seq]).view(1, -1)
        if torch.cuda.is_available():
            src_id_seq = src_id_seq.cuda()

        with torch.no_grad():
            softmax_list, _, other = self.model(src_id_seq, [len(src_seq)])

        return other

    def predict(self, src_seq):
        """ Make prediction given `src_seq` as input.

        Args:
            src_seq (list): list of tokens in source language

        Returns:
            tgt_seq (list): list of tokens in target language as predicted
            by the pre-trained model
        """
        other = self.get_decoder_features(src_seq)

        length = other['length'][0]

        tgt_id_seq = [other['sequence'][di][0].data[0] for di in range(length)]
        tgt_seq = [self.tgt_vocab.itos[tok] for tok in tgt_id_seq]
        return tgt_seq

    def predict_n(self, src_seq, n=1):
        """ Make 'n' predictions given `src_seq` as input.

        Args:
            src_seq (list): list of tokens in source language
            n (int): number of predicted seqs to return. If None,
                     it will return just one seq.

        Returns:
            tgt_seq (list): list of tokens in target language as predicted
                            by the pre-trained model
        """
        other = self.get_decoder_features(src_seq)

        result = []
        for x in range(0, int(n)):
            length = other['topk_length'][0][x]
            tgt_id_seq = [other['topk_sequence'][di][0, x, 0].data[0] for di in range(length)]
            tgt_seq = [self.tgt_vocab.itos[tok] for tok in tgt_id_seq]
            result.append(tgt_seq)

        return result

In [28]:
predicor = Predictor(my_seq2seq, input_vocab, output_vocab)

In [32]:
test_data = pd.read_csv('../data/test_data.csv', encoding='utf-8')
test_contents = list(test_data.content)
# test_titles = list(test_data.title)[:20]
# just use the former 20 sets as the 
test_contents =  [tokenizer(content) for content in test_contents]
test_results = []
for index, content_ in enumerate(test_contents):
    test_title = ' '.join(predicor.predict(content_))
    test_results.append(test_title)

In [33]:
for index, title_ in enumerate(test_results):
    print(title_)
    if index == 20:
        break

how a mom is murder in a crime <eos>
liverpool fans have liverpool ever ever liverpool <eos>
these people are people are to to <eos>
you are the most you you <eos>
the are are the <eos>
what 5 things you should know about <eos>
japanese s abe is a s s s s s in <eos>
amazon s new is new with with with <eos>
there s a flight for the delta and to <eos>
23 years now now you get free serving you <eos>
gordon ramsay is vegan vegan vegan to to to <eos>
krispy kreme is kreme kreme kreme kreme kreme kreme kreme <eos>
get a out sized bag packs 20 percent of a m s <eos>
internet starbucks launches starbucks starbucks clothing store <eos>
amazon s amazon to a delivery <eos>
<unk> 5 5 to <eos>
the 18 day of <eos>
khloé kardashian s her khloé her her her her her <eos>
the best spot for the weirdest camera videos <eos>
the funniest craziest cats you hair in your red cats <eos>
the dog dog dog dog dog dog with <eos>


In [34]:
test_results = [item[:-6] for item in test_results]
for index, title_ in enumerate(test_results):
    print(title_)
    if index == 20:
        break

how a mom is murder in a crime
liverpool fans have liverpool ever ever liverpool
these people are people are to to
you are the most you you
the are are the
what 5 things you should know about
japanese s abe is a s s s s s in
amazon s new is new with with with
there s a flight for the delta and to
23 years now now you get free serving you
gordon ramsay is vegan vegan vegan to to to
krispy kreme is kreme kreme kreme kreme kreme kreme kreme
get a out sized bag packs 20 percent of a m s
internet starbucks launches starbucks starbucks clothing store
amazon s amazon to a delivery
<unk> 5 5 to
the 18 day of
khloé kardashian s her khloé her her her her her
the best spot for the weirdest camera videos
the funniest craziest cats you hair in your red cats
the dog dog dog dog dog dog with


In [35]:
test_data_dir = '../data/result'
if not os.path.exists(test_data_dir):
    os.mkdir(test_data_dir)
else:
    shutil.rmtree(test_data_dir)
    os.mkdir(test_data_dir)

for i in range(len(test_results)):
    with open(os.path.join(test_data_dir, str(i+1)+'.txt'), 'w') as f:
        f.write(test_results[i])

In [20]:
# to check the unk and pad token in the 
display(content.vocab)
display(title.vocab)

<torchtext.vocab.Vocab at 0x7f92fbe29cc0>

<torchtext.vocab.Vocab at 0x7f92fbe29cf8>

In [21]:
display(len(title.vocab))  # the len method will drop the eos and sos token in the sentneces

50002

In [22]:
# load the pretrainde model here to initialize the embedding matrix here.
embedding_matrix = np.zeros((len(content.vocab), 100))
word_to_vec_path = '../data/glove.6B.100d.txt'
def get_eng_vec(path= word_to_vec_path):
    word_to_vec = dict()
    with open(path, 'r') as f:
        for line in f:
            line=line.split(' ')
            word_to_vec[line[0]]= [float(f) for f in line[1:]]
    return word_to_vec

word_to_vec = get_eng_vec()

In [23]:
embedding_matrix = np.random.randn(len(title.vocab), 100)
for index in range(embedding_matrix.shape[0]):
    unknow_words = 0
    word = title.vocab.itos[index]
    try: # try to find the word in the word_to_vec:
        vector = word_to_vec[word]
        embedding_matrix[index] = vector
    except KeyError:
        unknow_words += 1
        pass
print("find unknow {} words in word2vec".format(unknow_words))

find unknow 0 words in word2vec


In [24]:
display(title.vocab.stoi['to'])
vector = word_to_vec['to']
display(vector)
display(embedding_matrix[4,])

4

[-0.1897,
 0.050024,
 0.19084,
 -0.049184,
 -0.089737,
 0.21006,
 -0.54952,
 0.098377,
 -0.20135,
 0.34241,
 -0.092677,
 0.161,
 -0.13268,
 -0.2816,
 0.18737,
 -0.42959,
 0.96039,
 0.13972,
 -1.0781,
 0.40518,
 0.50539,
 -0.55064,
 0.4844,
 0.38044,
 -0.0029055,
 -0.34942,
 -0.099696,
 -0.78368,
 1.0363,
 -0.2314,
 -0.47121,
 0.57126,
 -0.21454,
 0.35958,
 -0.48319,
 1.0875,
 0.28524,
 0.12447,
 -0.039248,
 -0.076732,
 -0.76343,
 -0.32409,
 -0.5749,
 -1.0893,
 -0.41811,
 0.4512,
 0.12112,
 -0.51367,
 -0.13349,
 -1.1378,
 -0.28768,
 0.16774,
 0.55804,
 1.5387,
 0.018859,
 -2.9721,
 -0.24216,
 -0.92495,
 2.1992,
 0.28234,
 -0.3478,
 0.51621,
 -0.43387,
 0.36852,
 0.74573,
 0.072102,
 0.27931,
 0.92569,
 -0.050336,
 -0.85856,
 -0.1358,
 -0.92551,
 -0.33991,
 -1.0394,
 -0.067203,
 -0.21379,
 -0.4769,
 0.21377,
 -0.84008,
 0.052536,
 0.59298,
 0.29604,
 -0.67644,
 0.13916,
 -1.5504,
 -0.20765,
 0.7222,
 0.52056,
 -0.076221,
 -0.15194,
 -0.13134,
 0.058617,
 -0.31869,
 -0.61419,
 -0.62393,
 

array([-1.8970e-01,  5.0024e-02,  1.9084e-01, -4.9184e-02, -8.9737e-02,
        2.1006e-01, -5.4952e-01,  9.8377e-02, -2.0135e-01,  3.4241e-01,
       -9.2677e-02,  1.6100e-01, -1.3268e-01, -2.8160e-01,  1.8737e-01,
       -4.2959e-01,  9.6039e-01,  1.3972e-01, -1.0781e+00,  4.0518e-01,
        5.0539e-01, -5.5064e-01,  4.8440e-01,  3.8044e-01, -2.9055e-03,
       -3.4942e-01, -9.9696e-02, -7.8368e-01,  1.0363e+00, -2.3140e-01,
       -4.7121e-01,  5.7126e-01, -2.1454e-01,  3.5958e-01, -4.8319e-01,
        1.0875e+00,  2.8524e-01,  1.2447e-01, -3.9248e-02, -7.6732e-02,
       -7.6343e-01, -3.2409e-01, -5.7490e-01, -1.0893e+00, -4.1811e-01,
        4.5120e-01,  1.2112e-01, -5.1367e-01, -1.3349e-01, -1.1378e+00,
       -2.8768e-01,  1.6774e-01,  5.5804e-01,  1.5387e+00,  1.8859e-02,
       -2.9721e+00, -2.4216e-01, -9.2495e-01,  2.1992e+00,  2.8234e-01,
       -3.4780e-01,  5.1621e-01, -4.3387e-01,  3.6852e-01,  7.4573e-01,
        7.2102e-02,  2.7931e-01,  9.2569e-01, -5.0336e-02, -8.58

In [25]:
np.save('../data/decoder_embedding_50000_100.npy', embedding_matrix)

In [26]:
tensor_embedding = torch.from_numpy(embedding_matrix)