### Imports

In [None]:
import numpy as np
import pandas as pd
import os
from collections import Counter
import pickle
from itertools import chain

import gc
import warnings
warnings.simplefilter('ignore')

from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

import tensorflow as tf
from keras.models import Model, Sequential
from keras.layers.recurrent import LSTM
from keras.layers import Embedding, Dense, Input, RepeatVector, TimeDistributed, concatenate, add, Dropout
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

%config InteractiveShell.ast_node_interactivity = 'all'

### Let TensorFlow allocate GPU memory on demand (as much as it needs) instead of reserving all available memory up front

In [None]:
# Build a TF1-style session configuration object.
config = tf.ConfigProto()
# Ask TensorFlow to grow its GPU allocation incrementally rather than
# pre-allocating (per the TF1 GPUOptions.allow_growth documentation).
config.gpu_options.allow_growth = True
# NOTE(review): the session is created here but is not explicitly handed to
# Keras anywhere in the visible code -- confirm that Keras picks up this
# configuration in the target environment.
session = tf.Session(config=config)

### Helper Functions

In [None]:
def extract_embeddings(w2v_model_path, embeddings_path):
    """Export the embedding matrix of a saved word2vec model to disk.

    Loads the gensim ``Word2Vec`` model stored at ``w2v_model_path`` and
    writes its ``model.wv.syn0`` weight matrix to ``embeddings_path`` using
    ``np.save``.

    Args:
        w2v_model_path: Path of a model previously saved with gensim.
        embeddings_path: Destination file for the serialized weight matrix.
    """
    model = Word2Vec.load(w2v_model_path)
    weights = model.wv.syn0
    # Write through an explicit handle so the file name is used exactly as
    # given (np.save appends ".npy" to bare path names) and close the handle
    # deterministically instead of leaking it.
    with open(embeddings_path, 'wb') as fh:
        np.save(fh, weights)


def update_dicts(w2v_model_path):
    """Build word->index and index->word lookups from a word2vec vocabulary.

    Each vocabulary word is mapped to the ``index`` attribute stored on its
    gensim vocab entry. An additional ``'<unk>'`` entry is then assigned the
    current size of the mapping, i.e. it is placed after the existing
    entries rather than at the lowest index.

    Args:
        w2v_model_path: Path of a model previously saved with gensim.

    Returns:
        A ``(word2idx, idx2word)`` tuple of dictionaries, the second being
        the inverse of the first.
    """
    vocabulary = Word2Vec.load(w2v_model_path).wv.vocab

    word2idx = {token: entry.index for token, entry in vocabulary.items()}
    word2idx['<unk>'] = len(word2idx)

    idx2word = {index: token for token, index in word2idx.items()}
    return word2idx, idx2word


def create_embeddings_layer(embeddings_path):
    """Create a frozen Keras ``Embedding`` layer from a saved weight matrix.

    Args:
        embeddings_path: File containing a 2-D weight matrix written with
            ``np.save`` (rows: vocabulary entries, columns: embedding
            dimensions).

    Returns:
        An ``Embedding`` layer initialised with the loaded weights and
        marked ``trainable=False``.
    """
    # Use a context manager so the file handle is closed once the matrix has
    # been read, instead of leaking the descriptor.
    with open(embeddings_path, 'rb') as fh:
        weights = np.load(fh)

    layer = Embedding(input_dim=weights.shape[0],
                      output_dim=weights.shape[1],
                      weights=[weights], trainable=False)
    print("(vocab_size, output_dim) : ", weights.shape)
    return layer

from collections import Counter

def transform_input_text(texts):
    """Encode raw input texts as a padded matrix of word ids.

    Every text is lower-cased and split on single spaces; each word is
    replaced by its id in the module-level ``input_word2idx`` (id 1 for
    words that are not in the mapping). Encoding of a text stops once
    ``max_input_seq_length`` ids have been collected, and the result is
    padded with ``pad_sequences`` to that same length. The shape of the
    final matrix is printed.

    Args:
        texts: Iterable of strings.

    Returns:
        The array produced by ``pad_sequences``.
    """
    encoded = []
    for sentence in texts:
        ids = []
        for token in sentence.lower().split(' '):
            # 1 is the id used for out-of-vocabulary words.
            ids.append(input_word2idx.get(token, 1))
            if len(ids) >= max_input_seq_length:
                break
        encoded.append(ids)

    padded = pad_sequences(encoded, maxlen=max_input_seq_length)
    print(padded.shape)
    return padded

def split_target_text(texts, max_length=None):
    """Tokenize target texts and wrap them in START / END markers.

    Each text is lower-cased, surrounded by the literal tokens ``'START'``
    and ``'END'`` and split on single spaces. Sequences that would exceed
    ``max_length`` tokens are cut and closed with ``'END'``.

    Args:
        texts: Iterable of target strings.
        max_length: Maximum number of tokens per sequence. Defaults to the
            module-level ``max_target_seq_length``.

    Returns:
        A list with one list of tokens per input text.
    """
    if max_length is None:
        max_length = max_target_seq_length

    sequences = []
    for line in texts:
        tokens = []
        for word in ('START ' + line.lower() + ' END').split(' '):
            tokens.append(word)
            if len(tokens) + 1 >= max_length:
                # Close a truncated sequence, but do not append a second
                # 'END' when the limit is reached exactly on the real one.
                if word != 'END':
                    tokens.append('END')
                break
        sequences.append(tokens)
    return sequences

#defaults for params

MAX_INPUT_SEQ_LENGTH = 500
MAX_TARGET_SEQ_LENGTH = 50
MAX_INPUT_VOCAB_SIZE = 5000
MAX_TARGET_VOCAB_SIZE = 2000


def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None,
             input_vocab_size=None, target_vocab_size=None):
    """Build the vocabularies and sequence-length limits for the model.

    Input texts are split on single spaces and lower-cased word by word;
    target texts are lower-cased and wrapped in ``'START'`` / ``'END'``
    tokens. Both are truncated to the given maximum lengths before words
    are counted, and the most frequent words become the vocabularies.

    Args:
        X: Iterable of input strings.
        Y: Iterable of target strings.
        input_seq_max_length: Cap on counted input words per text
            (default ``MAX_INPUT_SEQ_LENGTH``).
        target_seq_max_length: Cap on counted target tokens per text
            (default ``MAX_TARGET_SEQ_LENGTH``).
        input_vocab_size: Number of most frequent input words to keep
            (default ``MAX_INPUT_VOCAB_SIZE``).
        target_vocab_size: Number of most frequent target tokens to keep
            (default ``MAX_TARGET_VOCAB_SIZE``).

    Returns:
        A dict with the keys ``input_word2idx``, ``input_idx2word``,
        ``target_word2idx``, ``target_idx2word``, ``num_input_tokens``,
        ``num_target_tokens``, ``max_input_seq_length`` and
        ``max_target_seq_length``. Input ids 0 and 1 are reserved for
        ``'PAD'`` and ``'UNK'``; target id 0 is reserved for ``'UNK'``.
    """
    if input_seq_max_length is None:
        input_seq_max_length = MAX_INPUT_SEQ_LENGTH
    if target_seq_max_length is None:
        target_seq_max_length = MAX_TARGET_SEQ_LENGTH
    if input_vocab_size is None:
        input_vocab_size = MAX_INPUT_VOCAB_SIZE
    if target_vocab_size is None:
        target_vocab_size = MAX_TARGET_VOCAB_SIZE

    input_counter = Counter()
    target_counter = Counter()
    max_input_seq_length = 0
    max_target_seq_length = 0

    for line in X:
        words = [word.lower() for word in line.split(' ')]
        words = words[:input_seq_max_length]
        input_counter.update(words)
        max_input_seq_length = max(max_input_seq_length, len(words))

    for line in Y:
        tokens = ('START ' + line.lower() + ' END').split(' ')
        tokens = tokens[:target_seq_max_length]
        target_counter.update(tokens)
        # Updated once per line; the value does not depend on the word.
        max_target_seq_length = max(max_target_seq_length, len(tokens))

    # Ids 0 and 1 are reserved, so real input words start at 2.
    input_word2idx = {
        word: rank + 2
        for rank, (word, _) in enumerate(
            input_counter.most_common(input_vocab_size))
    }
    input_word2idx['PAD'] = 0
    input_word2idx['UNK'] = 1
    input_idx2word = {idx: word for word, idx in input_word2idx.items()}

    # Id 0 is reserved for unknown target tokens.
    target_word2idx = {
        word: rank + 1
        for rank, (word, _) in enumerate(
            target_counter.most_common(target_vocab_size))
    }
    target_word2idx['UNK'] = 0
    target_idx2word = {idx: word for word, idx in target_word2idx.items()}

    return {
        'input_word2idx': input_word2idx,
        'input_idx2word': input_idx2word,
        'target_word2idx': target_word2idx,
        'target_idx2word': target_idx2word,
        'num_input_tokens': len(input_word2idx),
        'num_target_tokens': len(target_word2idx),
        'max_input_seq_length': max_input_seq_length,
        'max_target_seq_length': max_target_seq_length,
    }

# Get the data

# preprocess data

In [None]:
# NOTE: pickle can execute arbitrary code while loading; only open files
# that come from a trusted source.
with open('titlesAbstracts_AT.pkl', 'rb') as titles_file:
    titles = pickle.load(titles_file)

with open('abstractsCorpus_ATsigns.pkl', 'rb') as corpus_file:
    text = pickle.load(corpus_file)

# Each corpus value is an iterable of token sequences; flatten it and join
# the tokens into a single space-separated string per document.
textConcat = [' '.join(chain.from_iterable(document)) for document in text.values()]

# Full data set, and the first 10,000 samples used for fitting/training.
Xfull = textConcat
Yfull = list(titles.values())

X = Xfull[:10000]
Y = Yfull[:10000]

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

### get the word id dict and vice versa; also the params

In [None]:
# Build the word<->id dictionaries, token counts and maximum sequence lengths
# from the (at most 10,000-sample) X / Y subset, using fit_text's defaults.
conf = fit_text(X, Y)

# set up the model

## Set up a precomputed embeddings layer; glove based

In [None]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100

texts = X

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Keras tokenizer pass over the input texts. Its vocabulary size determines
# the number of rows of the embedding matrix built below.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

# Parse the GloVe text file: one word followed by its vector per line.
embeddings_index = {}
with open('glove.6B.100d.txt', 'r', encoding="utf8") as fh:
    for line in fh:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


print(f'Found {len(embeddings_index)} word vectors.')

# The matrix keeps its (len(word_index) + 1, EMBEDDING_DIM) shape because the
# model's Embedding layer is declared with input_dim=len(word_index) + 1.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
# Rows must be addressed with the ids the model is actually fed:
# transform_input_text() and summarize() encode words with
# conf['input_word2idx'], not with the tokenizer's word_index. Filling the
# rows by tokenizer id would attach each pretrained vector to a different word.
for word, i in conf['input_word2idx'].items():
    if i >= embedding_matrix.shape[0]:
        # Defensive: an id outside the matrix cannot be looked up anyway.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Done')

### Params:

In [None]:
# NOTE(review): HIDDEN_UNITS is not referenced by the visible model code,
# which sizes its LSTM layers with EMBEDDING_DIM -- confirm whether it is
# still needed.
HIDDEN_UNITS = 100
# Upper bound on the length of the decoder (summary-so-far) input window.
MAX_DECODER_SEQ_LENGTH = 4

# Unpack the dictionary returned by fit_text() into module-level names; the
# helper functions in this file read these globals directly.
input_word2idx = conf['input_word2idx']
input_idx2word = conf['input_idx2word']
target_word2idx = conf['target_word2idx'] 
target_idx2word = conf['target_idx2word']
num_input_tokens = conf['num_input_tokens']
num_target_tokens = conf['num_target_tokens']
max_input_seq_length = conf['max_input_seq_length']
max_target_seq_length = conf['max_target_seq_length']

# Model:

In [None]:
# article input model: padded word ids -> pretrained embeddings -> dropout
inputs1 = Input(shape=( max_input_seq_length,))
article1 = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_input_seq_length,
                    trainable=True)(inputs1) #premade weights, fine tune
article2 = Dropout(0.3)(article1)

# summary input model: the summary generated so far is embedded, encoded by
# an LSTM and repeated once per article time step so it can be concatenated
# with the article representation
inputs2 = Input(shape=(min( num_target_tokens, MAX_DECODER_SEQ_LENGTH), ))
summ1 = Embedding( num_target_tokens, EMBEDDING_DIM)(inputs2)
summ2 = Dropout(0.3)(summ1)
summ3 = LSTM(EMBEDDING_DIM)(summ2)
summ4 = RepeatVector( max_input_seq_length)(summ3)

# decoder model: predicts a distribution over the next target token
decoder1 = concatenate([article2, summ4])
decoder2 = LSTM(EMBEDDING_DIM)(decoder1)
outputs = Dense( num_target_tokens, activation='softmax')(decoder2)
# tie it together [article, summary] [word]
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
def generate_batch(x_samples, y_samples, batch_size):
    """Yield training batches of next-word prediction samples forever.

    For every (article, title) pair, each adjacent pair of title tokens
    produces one sample: the article ids, the ids of the title tokens seen so
    far, and a one-hot label for the next token. Tokens missing from the
    module-level ``target_word2idx`` map to id 0, and a next token with id 0
    leaves the label vector all zeros. Once ``batch_size`` samples have been
    collected they are padded and yielded, and collection starts again; the
    data is cycled indefinitely.

    NOTE: if no sample can ever be produced (e.g. ``x_samples`` is empty) the
    outer loop spins without yielding.

    Args:
        x_samples: Sequence of encoded articles.
        y_samples: Sequence of token lists, aligned with ``x_samples``.
        batch_size: Number of samples per yielded batch.

    Yields:
        ``([encoder_inputs, decoder_inputs], decoder_targets)`` where the
        inputs are padded with ``pad_sequences`` and the targets are stacked
        into a NumPy array.
    """
    enc_inputs, dec_inputs, dec_targets = [], [], []
    while True:
        for position, article_ids in enumerate(x_samples):
            title_tokens = y_samples[position]
            prefix_ids = []

            for current, following in zip(title_tokens, title_tokens[1:]):
                # A new list per step: every sample keeps its own prefix.
                prefix_ids = prefix_ids + [target_word2idx.get(current, 0)]

                label = np.zeros(num_target_tokens)
                next_id = target_word2idx.get(following, 0)
                if next_id != 0:
                    label[next_id] = 1

                enc_inputs.append(article_ids)
                dec_inputs.append(prefix_ids)
                dec_targets.append(label)

                if len(enc_inputs) >= batch_size:
                    decoder_len = min(num_target_tokens, MAX_DECODER_SEQ_LENGTH)
                    yield [pad_sequences(enc_inputs, max_input_seq_length),
                           pad_sequences(dec_inputs, decoder_len)], np.array(dec_targets)
                    enc_inputs, dec_inputs, dec_targets = [], [], []

In [None]:
def fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=256):
    """Train the module-level ``model`` and return the Keras history.

    Targets are tokenized with ``split_target_text`` and inputs encoded with
    ``transform_input_text``; batches come from ``generate_batch``. A
    checkpoint callback writes to ``model4_checkpoint.h5`` and the trained
    model is saved as ``model4``.

    Args:
        Xtrain, Ytrain: Raw training inputs and targets.
        Xtest, Ytest: Raw validation inputs and targets.
        epochs: Number of training epochs.
        batch_size: Samples per generated batch.

    Returns:
        The history object returned by ``model.fit_generator``.
    """
    checkpoint = ModelCheckpoint('model4_checkpoint.h5')

    Ytrain = split_target_text(Ytrain)
    Ytest = split_target_text(Ytest)
    Xtrain = transform_input_text(Xtrain)
    Xtest = transform_input_text(Xtest)

    # Every title of n tokens contributes n - 1 next-word samples.
    train_steps = sum(len(tokens) - 1 for tokens in Ytrain) // batch_size
    test_steps = sum(len(tokens) - 1 for tokens in Ytest) // batch_size

    history = model.fit_generator(
        generator=generate_batch(Xtrain, Ytrain, batch_size),
        steps_per_epoch=train_steps,
        epochs=epochs,
        verbose=1,
        validation_data=generate_batch(Xtest, Ytest, batch_size),
        validation_steps=test_steps,
        callbacks=[checkpoint],
    )

    model.save('model4')
    return history

In [None]:
def summarize(input_text):
    """Generate a headline for ``input_text`` by greedy decoding.

    The text is lower-cased, split on single spaces and encoded with the
    module-level ``input_word2idx`` (id 1 for unknown words). Starting from
    the ``'START'`` token, the model is asked repeatedly for the most likely
    next token until ``'END'`` is produced or ``max_target_seq_length`` ids
    have been collected.

    Args:
        input_text: Raw article text.

    Returns:
        The generated words joined by single spaces, without the
        ``'START'`` / ``'END'`` markers.
    """
    input_wids = [input_word2idx.get(word, 1)  # 1 is the default [UNK] id
                  for word in input_text.lower().split(' ')]
    # Keep the FIRST max_input_seq_length words, exactly as
    # transform_input_text() does for the training data. pad_sequences on its
    # own truncates from the front and would keep the tail of the article.
    input_wids = input_wids[:max_input_seq_length]
    input_seq = pad_sequences([input_wids], max_input_seq_length)

    decoder_len = min(num_target_tokens, MAX_DECODER_SEQ_LENGTH)
    wid_list = [target_word2idx['START']]
    words = []

    while True:
        sum_input_seq = pad_sequences([wid_list], decoder_len)
        output_tokens = model.predict([input_seq, sum_input_seq])
        sample_token_idx = np.argmax(output_tokens[0, :])
        sample_word = target_idx2word[sample_token_idx]
        wid_list = wid_list + [sample_token_idx]

        if sample_word != 'START' and sample_word != 'END':
            words.append(sample_word)
        if sample_word == 'END' or len(wid_list) >= max_target_seq_length:
            break

    return ' '.join(words).strip()

# Train

In [None]:
# Train on the 80/20 split created above; `history` holds whatever
# model.fit_generator returns, for later inspection of the training curves.
history = fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=256)

In our case, the accuracy on the validation set after 20 epochs was 0.2430.

# Predict

In [None]:
from keras.models import load_model
# NOTE(review): fit() in this file saves the trained network as 'model4'
# (and checkpoints to 'model4_checkpoint.h5'), while this cell loads 'model'
# -- confirm the path points at the intended file.
model = load_model(r'model') #if exists; point to your path if different
# Re-compile with the same loss / optimizer / metric settings that were used
# when the model was first built.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Get data

In [None]:
import pickle
from itertools import chain

# NOTE: pickle can execute arbitrary code while loading; only open files
# that come from a trusted source.
with open('titlesAbstracts_AT.pkl', 'rb') as titles_file:
    titles = pickle.load(titles_file)

with open('abstractsCorpus_ATsigns.pkl', 'rb') as corpus_file:
    text = pickle.load(corpus_file)

# Flatten each document's token sequences into one space-separated string.
textConcat = [' '.join(chain.from_iterable(document)) for document in text.values()]

# Full data set, and the first 10,000 samples the vocabularies are fitted on.
Xfull = textConcat
Yfull = list(titles.values())

X = Xfull[:10000]
Y = Yfull[:10000]

### Get params

In [None]:
# Rebuild the dictionaries and length limits from the same X / Y subset so
# that summarize() encodes text with the ids used in the training section.
conf = fit_text(X, Y)

# NOTE(review): HIDDEN_UNITS is not referenced by the visible code -- confirm
# whether it is still needed.
HIDDEN_UNITS = 100

# Upper bound on the length of the decoder (summary-so-far) input window.
MAX_DECODER_SEQ_LENGTH = 4

# Module-level names read directly by summarize() and the other helpers.
input_word2idx = conf['input_word2idx']
input_idx2word = conf['input_idx2word']
target_word2idx = conf['target_word2idx'] 
target_idx2word = conf['target_idx2word']
num_input_tokens = conf['num_input_tokens']
num_target_tokens = conf['num_target_tokens']
max_input_seq_length = conf['max_input_seq_length']
max_target_seq_length = conf['max_target_seq_length']

### Get random article to predict

In [None]:
from random import randint
# Pick an article beyond the first 10,000 samples (the ones the vocabularies
# were fitted on). randint() includes BOTH endpoints, so the upper bound has
# to be the last valid index; len(Xfull) itself would raise IndexError.
c = randint(10000, len(Xfull) - 1)

textPredict = Xfull[c]
labelPredict = Yfull[c]

# Generated headline, followed by the original title for comparison.
summarize(textPredict)
labelPredict

# Evaluate

In [None]:
from sumeval.metrics.rouge import RougeCalculator
# Shared ROUGE scorer for the evaluation cells below.
# NOTE(review): stopwords=True presumably enables stop-word filtering for
# English text -- confirm against the sumeval documentation.
rouge = RougeCalculator(stopwords=True, lang="en")

In [None]:
rouge1 = dict()

for _ in range(50):
    from random import randint
    c = randint(10000, len(Xfull))

    textPredict = Xfull[c]
    labelPredict = Yfull[c]

    generated = summarize(textPredict)
    reference = labelPredict

    score = rouge.rouge_n(
                summary=generated,
                references=reference,
                n=1)
    
    rouge1[score] = (generated, reference)

In [None]:
for s in sorted(rouge1.keys(), reverse=True):
    
    gen, orig = rouge1[s]
    print(s)
    print(f'Generated headline:{gen}')
    print(f'Original headline:{orig}')

### Rouge1 avg:

In [None]:
np.mean(list(rouge1.keys()))

# ROUGE-1 examples: an identical summary, then a partially overlapping one:

In [None]:
# Summary identical to its reference.
rouge.rouge_n(summary='I would like an apple.', references='I would like an apple.', n=1)
# Summary that shares only some words with its reference.
rouge.rouge_n(summary='I would like to eat an apple.', references='I feel like having an apple.', n=1)