In [1]:
import sys
import numpy as np

import textwrap
wrapper = textwrap.TextWrapper(width=70)

import trax
from trax import layers as tl
from trax.fastmath import numpy as jnp

# Print NumPy arrays in full (no '...' truncation) so that entire arrays, e.g. masks, can be inspected
np.set_printoptions(threshold=sys.maxsize)

In [45]:

# Load the CNN/DailyMail articles dataset through TFDS, using 'news_data/' as the data directory.
# Each example is reduced to its ('article', 'highlights') pair.

# Training split
train_stream_fnction = trax.data.TFDS(
    'cnn_dailymail',
    data_dir='news_data/',
    keys=('article', 'highlights'),
    train=True,
)

# Evaluation split - this should be much faster, as the data is downloaded already.
eval_stream_fnction = trax.data.TFDS(
    'cnn_dailymail',
    data_dir='news_data/',
    keys=('article', 'highlights'),
    train=False,
)

#### Create tokenize and detokenize functions

In [46]:
# Helper functions to tokenize and detokenize data. `tokenize` converts a text sentence into its
# corresponding token list (i.e. a list of vocabulary indices), splitting words into subwords.
# `detokenize` does the reverse: it converts a list of tokens back into the sentence.

def tokenize(input_str,EOS=1):
    """Convert an input string into a list of subword token ids ending in EOS.

    Args:
        input_str: text to tokenize.
        EOS: integer id appended after the last token to mark the end of
            the sentence.

    Returns:
        A list with the token ids of `input_str`, followed by `EOS`.
    """
    # trax.data.tokenize takes a stream and returns a stream, so wrap the single
    # string in a one-element iterator and pull the single result out with next().
    # The vocabulary file is the same one the input pipeline's Tokenize step uses.
    input_string=next(trax.data.tokenize(iter([input_str]),
                                         vocab_dir='vocab_dir/',
                                         vocab_file='summarize32k.subword.subwords'))
    # put EOS at the end of the sentence
    return list(input_string)+[EOS]

def detokenize(input_integers):
    """Convert a sequence of token ids back into text, wrapped for display.

    Args:
        input_integers: token ids to decode.

    Returns:
        The decoded string, line-wrapped by the module-level `wrapper`.
    """
    # Decode with the same vocabulary file the input pipeline's Tokenize step
    # uses; the previous file name was misspelled ('summaize32k...').
    string_converted=trax.data.detokenize(input_integers,
                                          vocab_dir='vocab_dir/',
                                          vocab_file='summarize32k.subword.subwords')

    return wrapper.fill(string_converted)


In [43]:
# Language model and preprocessing
# A language model only predicts the next word, so we concatenate each input (article) with its
# target (summary), separated by a separator token. A mask is also built: 0s over the input and
# 1s over the target, so that the model focuses its attention on the summary.

In [49]:
# Special token ids used when joining an article with its summary.
# They must be defined here: preprocess() below reads them as globals.
SEP=0 # padding or separator
EOS=1 # end-of-sentence token

# Concatenate input tokens and targets, using SEP (0) as the separator
def preprocess(stream):
    """Turn (article, summary) token pairs into language-model training triples.

    Args:
        stream: iterable of (article, summary) pairs, each a sequence of
            token ids.

    Yields:
        (inputs, targets, mask) where inputs and targets are the same array
        `article + [EOS, SEP] + summary + [EOS]`, and mask is an array of the
        same length holding 0 over the article and the EOS/SEP tokens after
        it, and 1 over the summary and its closing EOS.
    """
    for (article,summary) in stream:
        # Materialize each sequence once so its length and contents agree
        # even when the element is a one-shot iterator.
        article_tokens=list(article)
        summary_tokens=list(summary)
        combine=np.array(article_tokens+[EOS,SEP]+summary_tokens+[EOS])
        mask=[0]*(len(article_tokens)+2)+[1]*(len(summary_tokens)+1)
        yield combine,combine,np.array(mask)

# Data pipeline: tokenize -> preprocess -> filter by length
input_pipeline=trax.data.Serial(
    # 1) tokenize the raw text with the subword vocabulary
    trax.data.Tokenize(
        vocab_dir='vocab_dir/',
        vocab_file='summarize32k.subword.subwords',
    ),
    # 2) run the preprocess function defined above on the tokenized stream
    preprocess,
    # 3) filter out examples longer than 2048
    trax.data.FilterByLength(2048),
)

# Apply the pipeline above to both the training and the evaluation data
train_stream=input_pipeline(train_stream_fnction())
eval_stream=input_pipeline(eval_stream_fnction())

# Pull a single example from the training stream
train_input,train_target,train_mask=next(train_stream)
# For a language model the input and the target must be the same sequence.
# np.array_equal states that directly and also reports a shape mismatch as
# "not equal", instead of summing squared differences with Python's builtin sum.
assert np.array_equal(train_input,train_target), 'language-model input and target differ'


In [50]:
# Print the mask of the single training example pulled above:
# 0s cover the article part, 1s cover the summary part
print(f'Single example mask:\n\n {train_mask}')

Single example mask:

 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0