In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
from pickle import load
import pickle


import copy
import tensorflow as tf

# Location of the pickled CNN stories dataset that the following cells load.
path_stories = '../input/text-sum-load-and-preprocess-dataset/cnn_dataset.pkl'
# Seed NumPy's global random generator so later np.random draws are repeatable.
np.random.seed(9001)
# Any results you write to the current directory are saved as output.

['glove840b300dtxt', 'glove6b100dtxt', 'russian-glove', 'text-sum-load-and-preprocess-dataset']


In [2]:
# Load the pickled stories. The context manager closes the file handle that a
# bare open() call would leave dangling.
# NOTE(review): unpickling executes code embedded in the file - only load trusted data.
with open(path_stories, 'rb') as stories_file:
    stories = load(stories_file)
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [3]:
# Inspect the first record: a dict whose 'story' entry is a list of sentence strings.
print(stories[0])

{'story': ['london england reuters harry potter star daniel radcliffe gain access to a reported million million fortune a he turn on monday but he insists the money wont cast a spell on him', 'daniel radcliffe a harry potter in harry potter and the order of the phoenix', 'to the disappointment of gossip columnist around the world the young actor say he ha no plan to fritter his cash away on fast car drink and celebrity party', 'i dont plan to be one of those people who a soon a they turn suddenly buy themselves a massive sport car collection or something similar he told an australian interviewer earlier this month i dont think ill be particularly extravagant', 'the thing i like buying are thing that cost about pound book and cd and dvd', 'at radcliffe will be able to gamble in a casino buy a drink in a pub or see the horror film hostel part ii currently six place below his number one movie on the uk box office chart', 'detail of how hell mark his landmark birthday are under wrap his ag

In [4]:
def load_data(path, max_stories=1500):
    """Load pickled stories and flatten them into two newline-separated strings.

    Each story contributes one line to each returned string: its 'story'
    sentences joined by single spaces, and its 'highlights' sentences joined
    the same way. Lines are separated by '\\n' with no trailing newline.

    :param path: path of the pickle file holding a list of story dicts
    :param max_stories: keep only the first ``max_stories`` stories
        (default 1500, the previously hard-coded cap); ``None`` keeps all
    :return: tuple (text_sentences, summary_sentences)
    """
    # NOTE(review): unpickling executes code embedded in the file - only load trusted data.
    with open(path, 'rb') as stories_file:
        stories = load(stories_file)

    if max_stories is not None:
        stories = stories[:max_stories]

    # str.join builds each result in one pass instead of repeated `+=` copies
    text_sentences = '\n'.join(' '.join(story['story']) for story in stories)
    summary_sentences = '\n'.join(' '.join(story['highlights']) for story in stories)
    return text_sentences, summary_sentences

In [5]:
np.random.seed(9001)
EMBEDDING_DIMENSION = 100 # Available dimensions for 6B data is 50, 100, 200, 300

glove_weights_file_path = '../input/glove6b100dtxt/glove.6B.100d.txt'

# ids reserved for the special tokens; real words are numbered after them
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

# read the embeddings file, build the three lookup structures and return them
def create_lookup_tables(weights_path=None):
    """Build the vocabulary tables and embedding matrix from a GloVe text file.

    Every non-blank line is expected to hold a word followed by
    EMBEDDING_DIMENSION space-separated floats. Words receive ids starting at
    len(CODES); rows 0..len(CODES)-1 of the matrix are random vectors for the
    special tokens listed in CODES.

    :param weights_path: embeddings file to read; defaults to the module-level
        ``glove_weights_file_path``
    :return: tuple (word_to_id, id_to_word, weights) where ``weights`` is a
        float32 array with one row per id
    :raises ValueError: if a line carries fewer than EMBEDDING_DIMENSION values
    """
    if weights_path is None:
        weights_path = glove_weights_file_path

    # starts with the special tokens
    word_to_id = copy.copy(CODES)
    first_word_id = len(CODES)
    word_vectors = []

    # explicit encoding: the platform default may not decode the file's non-ASCII tokens
    with open(weights_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            values = line.split()  # word and weights separated by space
            if not values:
                continue  # tolerate blank lines
            if len(values) <= EMBEDDING_DIMENSION:
                raise ValueError(
                    'line %d of %s has %d fields, expected a word plus %d weights'
                    % (line_number, weights_path, len(values), EMBEDDING_DIMENSION))
            # the trailing EMBEDDING_DIMENSION fields are the weights; whatever
            # precedes them is the word (normally a single field)
            word = ' '.join(values[:-EMBEDDING_DIMENSION])
            word_weights = np.asarray(values[-EMBEDDING_DIMENSION:], dtype=np.float32)
            # the id always equals the row this vector will occupy in the matrix
            word_to_id[word] = first_word_id + len(word_vectors)
            word_vectors.append(word_weights)

    # random vectors for PAD, EOS, UNK, GO occupy rows 0..3, drawn in id order
    special_vectors = [np.random.randn(EMBEDDING_DIMENSION) for _ in range(first_word_id)]

    weights = np.asarray(special_vectors + word_vectors, dtype=np.float32)
    id_to_word = {v_i: v for v, v_i in word_to_id.items()}

    return word_to_id, id_to_word, weights

In [6]:
# a single word_to_id table is used for both the source and the target text
def text_to_ids(source_text, target_text, word_to_id):
    """
        1st, 2nd args: raw string text to be converted, one sentence per line
        3rd arg: lookup table (word -> id) shared by both texts

        Words missing from the table map to the '<UNK>' id, and every target
        sentence gets the '<EOS>' id appended.

        return: A tuple of lists (source_id_text, target_id_text) converted
    """
    def encode(sentence):
        # split on single spaces; repeated spaces yield empty strings, which are dropped
        return [word_to_id.get(word, word_to_id['<UNK>'])
                for word in sentence.split(" ")
                if word != ""]

    source_lines = source_text.split("\n")
    target_lines = target_text.split("\n")

    source_text_id, target_text_id = [], []

    # the source text drives the iteration; the target is read at the same line index
    for line_no, source_line in enumerate(source_lines):
        target_line = target_lines[line_no]

        source_ids = encode(source_line)
        target_ids = encode(target_line)

        # <EOS> tells the decoder where the target sequence stops
        target_ids.append(word_to_id['<EOS>'])

        source_text_id.append(source_ids)
        target_text_id.append(target_ids)

    return source_text_id, target_text_id

In [9]:
def preprocess_and_save_data(path):
    """Convert the stories at ``path`` to word ids and cache the result on disk.

    Writes 'preprocess.p' in the working directory containing the tuple
    ((source_text_int, target_text_int), word_to_id, id_to_word, glove_weights).

    :param path: pickle file of stories, forwarded to load_data
    """
    # load original data (text, summary)
    source_text_words, target_text_words = load_data(path)
    word_to_id, id_to_word, glove_weights = create_lookup_tables()

    # create list of sentences whose words are represented in index
    source_text_int, target_text_int = text_to_ids(source_text_words, target_text_words, word_to_id)

    # Save data for later use; the context manager flushes and closes the file
    # before any later cell tries to read it back
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((
            (source_text_int, target_text_int),
            word_to_id,
            id_to_word,
            glove_weights), out_file)

In [10]:
# Run the preprocessing once; the result is written to 'preprocess.p'.
preprocess_and_save_data(path_stories)

In [11]:
def load_preprocess():
    """Return the object stored in 'preprocess.p' in the working directory."""
    # NOTE(review): unpickling runs code embedded in the file - only read files this notebook wrote.
    with open('preprocess.p', mode='rb') as in_file:
        preprocessed = pickle.load(in_file)
    return preprocessed

In [12]:
# Read the cached id sequences, both lookup tables and the embedding matrix back from 'preprocess.p'.
(source_int_text, target_int_text), word_to_id, id_to_word, glove_weights = load_preprocess()

In [13]:
# Sanity check of the preprocessing: print the first encoded story and summary,
# then list vocabulary entries until the one with id 10 has been shown.
print(source_int_text[0])
print(target_int_text[0])
for word, word_id in word_to_id.items():
    print(word, word_id)
    if word_id == 10:
        break

[520, 567, 10855, 3219, 7658, 757, 2592, 20053, 1889, 1126, 8, 11, 297, 97, 97, 5175, 11, 22, 894, 17, 187, 38, 22, 4975, 4, 312, 58548, 1788, 11, 6256, 17, 107, 2592, 20053, 11, 3219, 7658, 10, 3219, 7658, 9, 4, 464, 7, 4, 3612, 8, 4, 6716, 7, 13374, 5739, 208, 4, 89, 4, 465, 2022, 207, 22, 8361, 88, 398, 8, 102694, 30, 1488, 424, 17, 1609, 573, 4607, 9, 5174, 169, 45, 46772, 398, 8, 34, 52, 7, 159, 73, 42, 11, 724, 11, 43, 894, 3713, 991, 1004, 11, 1975, 1939, 573, 1724, 50, 649, 797, 22, 158, 33, 807, 19605, 354, 41, 233, 45, 46772, 273, 3122, 34, 1117, 20227, 4, 877, 45, 121, 2197, 36, 877, 16, 776, 63, 3609, 543, 9, 3541, 9, 4184, 26, 20053, 47, 34, 671, 8, 9446, 10, 11, 5395, 991, 11, 4607, 10, 11, 10453, 50, 257, 4, 5992, 323, 20737, 157, 823, 843, 232, 245, 1272, 30, 227, 52, 1009, 17, 4, 2050, 1934, 287, 3103, 4606, 7, 201, 5617, 803, 30, 4820, 4162, 36, 128, 9011, 30, 1971, 9, 15821, 44, 88, 1324, 17, 30, 398, 3122, 3940, 37, 81, 2602, 7, 169, 22, 20, 10, 33, 966, 6203, 2129,

In [14]:
def enc_dec_model_inputs():
    """
    Create the data placeholders of the encoder-decoder graph.

    :return: tuple (inputs, targets, target_sequence_length, max_target_len);
             the first two are int32 placeholders of rank 2, the third an int32
             placeholder of rank 1, and the last is the maximum of the third
    """
    unknown_2d = [None, None]
    inputs = tf.placeholder(dtype=tf.int32, shape=unknown_2d, name='input')
    targets = tf.placeholder(dtype=tf.int32, shape=unknown_2d, name='targets')

    target_sequence_length = tf.placeholder(dtype=tf.int32,
                                            shape=[None],
                                            name='target_sequence_length')
    # longest target of the fed batch, computed inside the graph
    max_target_len = tf.reduce_max(target_sequence_length)

    return (inputs, targets, target_sequence_length, max_target_len)

In [15]:
def hyperparam_inputs():
    """
    Create the scalar-typed float32 placeholders fed at run time.

    :return: tuple (lr_rate, keep_prob) of placeholders named 'lr_rate' and 'keep_prob'
    """
    return (tf.placeholder(tf.float32, name='lr_rate'),
            tf.placeholder(tf.float32, name='keep_prob'))

In [16]:
# check
def process_decoder_input(target_data, word_to_id, batch_size):
    """
    Build the decoder input from the target batch.

    The last column of target_data is cut off and a column filled with the
    '<GO>' id is placed in front of what remains.
    :return: Preprocessed target data
    """
    # NOTE(review): the original author wondered whether this should be a one-hot vector
    start_token = word_to_id['<GO>']

    # rows 0..batch_size, every column except the final one
    without_last = tf.strided_slice(target_data,
                                    begin=[0, 0],
                                    end=[batch_size, -1],
                                    strides=[1, 1])
    go_column = tf.fill([batch_size, 1], start_token)

    return tf.concat([go_column, without_last], 1)

In [17]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   encoding_embedding_size, embedding_weights):
    """
    Embed the input ids and run them through a stack of LSTM layers.

    encoding_embedding_size is accepted but not read by this function.
    :return: tuple (RNN output, RNN state)
    """
    # look up the embedding vector of every id in rnn_inputs
    embedded_inputs = tf.nn.embedding_lookup(embedding_weights, rnn_inputs)

    # one dropout-wrapped LSTM cell per layer; the wrapper's first keep
    # probability argument is the one applied to the cell inputs
    layer_cells = []
    for _ in range(num_layers):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size)
        layer_cells.append(
            tf.contrib.rnn.DropoutWrapper(lstm_cell, input_keep_prob=keep_prob))
    encoder_cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    # TODO (from the original notes): pass sequence_length, and try
    # tf.nn.bidirectional_dynamic_rnn.
    # time_major is left at False, so inputs are [batch_size, max_time, ...]
    rnn_outputs, final_state = tf.nn.dynamic_rnn(encoder_cell,
                                                 embedded_inputs,
                                                 dtype=tf.float32)

    return rnn_outputs, final_state

In [69]:
def decoding_layer_train(enc_outputs, encoder_state, rnn_size, dec_cell, dec_embed_input, 
                         target_sequence_length, max_summary_length, 
                         output_layer, keep_prob):
    """
    Create a training process in decoding layer 

    :param enc_outputs: encoder outputs, used as the attention memory
    :param encoder_state: final encoder state, used as the initial decoder cell state
    :param rnn_size: number of units of the attention query layer and attention output layer
    :param dec_cell: decoder RNN cell (wrapped here with output dropout and attention)
    :param dec_embed_input: embedded decoder input sequences
    :param target_sequence_length: length of every target sequence in the batch
    :param max_summary_length: upper bound on the number of decoding steps
    :param output_layer: layer applied to every decoder output
    :param keep_prob: keep probability for the dropout on the cell output
    :return: BasicDecoderOutput containing training logits and sample_id
    """
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, 
                                             output_keep_prob=keep_prob)

    ## Attention layer
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units = rnn_size, # depth of query mechanism
            memory = enc_outputs, # hidden states to attend (output of RNN)
            normalize=True, # normalize energy term
            name='BahdanauAttention')

    # attention_layer_size matches the wrapper built in decoding_layer_infer:
    # the inference decoder is created under reuse=True, so both wrappers
    # must declare the same set of variables
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, 
                                                         attention_mechanism,
                                                         attention_layer_size=rnn_size)

    # take the batch size from the input tensor instead of relying on a
    # notebook-level global that is not a parameter of this function
    batch_size = tf.shape(dec_embed_input)[0]
    decoder_initial_state = attention_cell.zero_state(batch_size, tf.float32).clone(
          cell_state=encoder_state)

    # feeds the ground-truth (embedded) target tokens as decoder inputs
    helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, 
                                               target_sequence_length)

    decoder = tf.contrib.seq2seq.BasicDecoder(attention_cell, 
                                              helper, 
                                              decoder_initial_state, 
                                              output_layer)

    # unrolling the decoder layer
    # returns final_outputs, final_state, final_sequence_lengths
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                      impute_finished=True, 
                                                      maximum_iterations=max_summary_length)
    return outputs

In [70]:
def decoding_layer_infer(enc_outputs, encoder_state, rnn_size, dec_cell, 
                         embedding_weights, start_of_sequence_id,
                         end_of_sequence_id, max_target_sequence_length,
                         output_layer, batch_size, keep_prob, beam_width):
    """
    Create a inference process in decoding layer 

    :param enc_outputs: encoder outputs, tiled beam_width times as attention memory
    :param encoder_state: final encoder state, tiled beam_width times as initial cell state
    :param rnn_size: number of units of the attention query layer and attention output layer
    :param dec_cell: decoder RNN cell (wrapped here with output dropout and attention)
    :param embedding_weights: embedding matrix used to embed the generated ids
    :param start_of_sequence_id: id fed as the first decoder input
    :param end_of_sequence_id: id that terminates a beam
    :param max_target_sequence_length: upper bound on the number of decoding steps
    :param output_layer: layer applied to every decoder output
    :param batch_size: number of sequences per batch (before beam tiling)
    :param keep_prob: keep probability for the dropout on the cell output
    :param beam_width: number of beams kept per sequence
    :return: beam-search decoder output (exposes predicted_ids)
    """
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, 
                                             output_keep_prob=keep_prob)

    # beam search decodes batch_size * beam_width hypotheses at once, so the
    # attention memory and the initial state are repeated per beam
    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs, 
                                                          multiplier=beam_width)

    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(encoder_state, 
                                                             multiplier=beam_width)

    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units = rnn_size, # depth of query mechanism
            memory = tiled_encoder_outputs, # hidden states to attend (output of RNN)
            normalize=True, # normalize energy term
            name='BahdanauAttention')

    attention_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, 
                                                         attention_mechanism, 
                                                         attention_layer_size=rnn_size)

    decoder_initial_state = attention_cell.zero_state(dtype=tf.float32, 
                                                      batch_size=batch_size * beam_width)

    decoder_initial_state = decoder_initial_state.clone(cell_state=tiled_encoder_final_state)

    decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=attention_cell,
                                                   embedding=embedding_weights,
                                                   start_tokens=tf.fill([batch_size], start_of_sequence_id), 
                                                   end_token=end_of_sequence_id,
                                                   initial_state=decoder_initial_state,
                                                   beam_width=beam_width,
                                                   output_layer=output_layer,
                                                   length_penalty_weight=0.0)

    # impute_finished stays at its default (False): the beam-search decoder
    # keeps track of finished beams itself.
    # NOTE(review): state imputation with per-beam finished flags is reported to
    # fail with a rank mismatch in TF 1.x - confirm against the TF version in use.
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                      impute_finished=False, 
                                                      maximum_iterations=max_target_sequence_length)
    return outputs

In [71]:
def decoding_layer(dec_input, enc_outputs, encoder_state, target_sequence_length,
                   max_target_sequence_length, vocab_size, rnn_size,num_layers,
                   word_to_id, batch_size, keep_prob, decoding_embedding_size,
                   beam_width, embedding_weights):
    """
    Create decoding layer

    Builds one stacked LSTM cell and one output projection, then creates the
    training decoder and the inference decoder inside the same "decode"
    variable scope (the second time with reuse=True).
    decoding_embedding_size is accepted but not read by this function.
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    dec_embed_input = tf.nn.embedding_lookup(embedding_weights, dec_input)

    lstm_layers = []
    for _ in range(num_layers):
        lstm_layers.append(tf.contrib.rnn.LSTMCell(rnn_size))
    cells = tf.contrib.rnn.MultiRNNCell(lstm_layers)

    with tf.variable_scope("decode"):
        # projects every decoder output onto the vocabulary
        output_layer = tf.layers.Dense(vocab_size)
        train_output = decoding_layer_train(
            enc_outputs=enc_outputs,
            encoder_state=encoder_state,
            rnn_size=rnn_size,
            dec_cell=cells,
            dec_embed_input=dec_embed_input,
            target_sequence_length=target_sequence_length,
            max_summary_length=max_target_sequence_length,
            output_layer=output_layer,
            keep_prob=keep_prob)

    with tf.variable_scope("decode", reuse=True):
        infer_output = decoding_layer_infer(
            enc_outputs=enc_outputs,
            encoder_state=encoder_state,
            rnn_size=rnn_size,
            dec_cell=cells,
            embedding_weights=embedding_weights,
            start_of_sequence_id=word_to_id['<GO>'],
            end_of_sequence_id=word_to_id['<EOS>'],
            max_target_sequence_length=max_target_sequence_length,
            output_layer=output_layer,
            batch_size=batch_size,
            keep_prob=keep_prob,
            beam_width=beam_width)

    return (train_output, infer_output)

In [72]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  target_sequence_length, max_target_sentence_length,
                  vocab_size, enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, word_to_id, beam_width, weights):
    """
    Build the Sequence-to-Sequence model

    :param input_data: source id sequences
    :param target_data: target id sequences
    :param keep_prob: dropout keep probability
    :param batch_size: number of sequences per batch
    :param target_sequence_length: length of every target sequence in the batch
    :param max_target_sentence_length: upper bound on the number of decoding steps
    :param vocab_size: size of the output projection
    :param enc_embedding_size: forwarded to encoding_layer
    :param dec_embedding_size: forwarded to decoding_layer
    :param rnn_size: number of units per LSTM layer
    :param num_layers: number of stacked LSTM layers
    :param word_to_id: lookup table providing the '<GO>' and '<EOS>' ids
    :param beam_width: beam width used by the inference decoder
    :param weights: pre-trained embedding matrix (array with a ``shape`` attribute)
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    glove_weights_initializer = tf.constant_initializer(weights)
    # The variable takes its shape from the matrix itself rather than from a
    # notebook-level global: a mismatching size would otherwise go unnoticed
    # and leave the variable filled with values that are not the GloVe rows.
    embedding_weights = tf.get_variable(name='embedding_weights', 
                                        shape=weights.shape, 
                                        initializer=glove_weights_initializer,
                                        trainable=False)

    enc_outputs, enc_states = encoding_layer(input_data, 
                                             rnn_size, 
                                             num_layers, 
                                             keep_prob, 
                                             enc_embedding_size, 
                                             embedding_weights)

    dec_input = process_decoder_input(target_data, 
                                      word_to_id, 
                                      batch_size)

    train_output, infer_output = decoding_layer(dec_input,
                                                enc_outputs,
                                                enc_states, 
                                                target_sequence_length, 
                                                max_target_sentence_length,
                                                vocab_size,
                                                rnn_size,
                                                num_layers,
                                                word_to_id,
                                                batch_size,
                                                keep_prob,
                                                dec_embedding_size, 
                                                beam_width,
                                                embedding_weights)

    return train_output, infer_output

In [57]:
# report progress every `display_step` batches
display_step = 150

epochs = 5
batch_size = 8

rnn_size = 64
num_layers = 3
#num_layers = 1
beam_width = 5

# The embedding sizes must equal the dimension of the GloVe vectors that were
# actually loaded (EMBEDDING_DIMENSION), not a separately typed constant.
encoding_embedding_size = EMBEDDING_DIMENSION
decoding_embedding_size = EMBEDDING_DIMENSION

learning_rate = 0.001
keep_probability = 0.5

In [None]:
# Checkpoint prefix used when saving and restoring the trained model.
save_path = 'checkpoints/dev'
(source_int_text, target_int_text), word_to_id, id_to_word, glove_weights = load_preprocess()
max_target_sentence_length = max([len(sentence) for sentence in target_int_text])

vocab_size = len(word_to_id)

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, target_sequence_length, max_target_sequence_length = enc_dec_model_inputs()
    lr, keep_prob = hyperparam_inputs()

    # the source sequences are fed to the encoder reversed along the time axis
    train_logits, inference_output = seq2seq_model(tf.reverse(input_data, [-1]),
                                                   targets,
                                                   keep_prob,
                                                   batch_size,
                                                   target_sequence_length,
                                                   max_target_sequence_length,
                                                   vocab_size,
                                                   encoding_embedding_size,
                                                   decoding_embedding_size,
                                                   rnn_size,
                                                   num_layers,
                                                   word_to_id, 
                                                   beam_width,
                                                   glove_weights)

    training_logits = tf.identity(train_logits.rnn_output, name='logits')
    # beam-search result: one id sequence per beam for every input
    inference_sample_id = inference_output.predicted_ids
    # Expose the first beam as a named tensor. A tf.no_op() evaluates to None,
    # which cannot be scored, and the inference cell looks up 'predictions:0'
    # in the restored graph.
    inference_logits = tf.identity(inference_sample_id[:, :, 0], name='predictions')

    # https://www.tensorflow.org/api_docs/python/tf/sequence_mask
    # - Returns a mask tensor representing the first N positions of each cell.
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function - weighted softmax cross entropy
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping: every gradient value is clipped to [-1, 1];
        # variables without a gradient are skipped
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [None]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]


def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    """Batch targets, sources, and the lengths of their sentences together

    Yields one tuple per full batch; a trailing remainder smaller than
    batch_size is not yielded.

    :param sources: list of source id sequences
    :param targets: list of target id sequences, parallel to ``sources``
    :param batch_size: number of sequences per batch
    :param source_pad_int: id used to pad source sequences
    :param target_pad_int: id used to pad target sequences
    :return: generator of (padded sources array, padded targets array,
             source lengths, target lengths); the lengths are those of the
             sequences BEFORE padding, so a mask derived from them leaves
             the padding positions out
    """
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size

        # Slice the right amount for the batch
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]

        # Pad
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))

        # Measure the unpadded sequences: measuring the padded rows would
        # report the batch maximum for every sequence
        source_lengths = [len(source) for source in sources_batch]
        targets_lengths = [len(target) for target in targets_batch]

        yield pad_sources_batch, pad_targets_batch, source_lengths, targets_lengths

In [None]:
def get_accuracy(target, logits):
    """
    Calculate accuracy

    Both 2-D arrays are right-padded with zeros along axis 1 to the wider of
    the two widths; the result is the fraction of positions holding equal values.
    """
    widest = max(target.shape[1], logits.shape[1])
    target_gap = widest - target.shape[1]
    logits_gap = widest - logits.shape[1]

    if target_gap != 0:
        target = np.pad(target, [(0, 0), (0, target_gap)], 'constant')
    if logits_gap != 0:
        logits = np.pad(logits, [(0, 0), (0, logits_gap)], 'constant')

    matches = np.equal(target, logits)
    return np.mean(matches)

# Split data to training and validation sets: the first `batch_size` examples
# form the single validation batch, everything after them is used for training
train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]
valid_source = source_int_text[:batch_size]
valid_target = target_int_text[:batch_size]

# Source and target share one vocabulary, so one <PAD> id serves both sides.
# (source_vocab_to_int / target_vocab_to_int are not defined at this point.)
pad_id = word_to_id['<PAD>']

(valid_sources_batch, valid_targets_batch,
 valid_sources_lengths, valid_targets_lengths) = next(get_batches(valid_source,
                                                                  valid_target,
                                                                  batch_size,
                                                                  pad_id,
                                                                  pad_id))

# number of batches per epoch, shown in the progress line
total_batches = len(train_source) // batch_size

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                get_batches(train_source, train_target, batch_size, pad_id, pad_id)):

            _, loss = sess.run(
                [train_op, cost],
                {input_data: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 keep_prob: keep_probability})

            if batch_i % display_step == 0 and batch_i > 0:
                # Run the beam-search ids and keep the first beam of every
                # example; dropout is switched off with keep_prob 1.0.
                batch_train_logits = sess.run(
                    inference_sample_id,
                    {input_data: source_batch,
                     target_sequence_length: targets_lengths,
                     keep_prob: 1.0})[:, :, 0]

                batch_valid_logits = sess.run(
                    inference_sample_id,
                    {input_data: valid_sources_batch,
                     target_sequence_length: valid_targets_lengths,
                     keep_prob: 1.0})[:, :, 0]

                train_acc = get_accuracy(target_batch, batch_train_logits)
                valid_acc = get_accuracy(valid_targets_batch, batch_valid_logits)

                print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, Validation Accuracy: {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, total_batches, train_acc, valid_acc, loss))

    # Save Model (make sure the checkpoint directory exists first)
    checkpoint_dir = os.path.dirname(save_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print('Model Trained and Saved')

In [None]:
def save_params(params):
    """Pickle ``params`` into 'params.p' in the working directory."""
    with open('params.p', 'wb') as out_file:
        pickler = pickle.Pickler(out_file)
        pickler.dump(params)


def load_params():
    """Return the object pickled in 'params.p' in the working directory."""
    with open('params.p', mode='rb') as in_file:
        unpickler = pickle.Unpickler(in_file)
        return unpickler.load()

In [None]:
# Store the checkpoint path in 'params.p' so that a later cell can read it
# back with load_params() and locate the saved model.
save_params(save_path)

In [None]:
#import problem_unittests as tests

# load_preprocess() returns a 4-tuple:
#   ((source ids, target ids), word_to_id, id_to_word, glove_weights)
# Source and target share one vocabulary, so the source_* / target_* names
# used by the inference cells are aliases of the same two tables.
_, word_to_id, id_to_word, _ = load_preprocess()
source_vocab_to_int = target_vocab_to_int = word_to_id
source_int_to_vocab = target_int_to_vocab = id_to_word
load_path = load_params()

In [None]:
def sentence_to_seq(sentence, vocab_to_int):
    """Map every space-separated piece of ``sentence`` to its id.

    Pieces that are not keys of ``vocab_to_int`` (including the empty strings
    produced by consecutive spaces) map to the '<UNK>' id.
    """
    # '<UNK>' is only looked up when an unknown piece is actually met
    return [vocab_to_int[word] if word in vocab_to_int else vocab_to_int['<UNK>']
            for word in sentence.split(" ")]

In [None]:
# Pick one story, join its sentences into a single string and encode it as ids.
translate_sentence = ' '.join(stories[24001]['story'])

translate_sentence = sentence_to_seq(translate_sentence, source_vocab_to_int)

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_path + '.meta')
    loader.restore(sess, load_path)

    # Fetch the tensors to feed and run by name from the restored graph.
    # NOTE(review): this requires the saved graph to contain a tensor named
    # 'predictions' - confirm the graph-building cell names its output so.
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

    # The same encoded story is repeated batch_size times and only the first
    # result row is kept; the decoding length fed here is twice the input length.
    # NOTE(review): presumably the repetition is needed because the graph was
    # built for a fixed batch size - verify.
    translate_logits = sess.run(logits, {input_data: [translate_sentence]*batch_size,
                                         target_sequence_length: [len(translate_sentence)*2]*batch_size,
                                         keep_prob: 1.0})[0]
print('Input')
#print('  Word Ids:      {}'.format([i for i in translate_sentence]))
print('  English Words: {}'.format([source_int_to_vocab[i] for i in translate_sentence]))

print('\nPrediction')
#print('  Word Ids:      {}'.format([i for i in translate_logits]))
print('  Summary: {}'.format(" ".join([target_int_to_vocab[i] for i in translate_logits])))