# Title Generation Experiment

For title generation, we have experimented on top of the following works: 
* https://arxiv.org/pdf/1512.01712.pdf
* https://github.com/NainiShah/News-Headline-Generation

However, we could not obtain satisfactory results, because memory constraints limited training to too few iterations.

In [None]:
import random
import codecs
import math
import time
import sys
import subprocess
import os.path
import pickle
import numpy as np
import gensim

import keras.backend as K
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.core import Lambda, Activation
from keras.utils import np_utils
from keras.preprocessing import sequence

from tqdm import tqdm
import sklearn
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from numpy import inf
from operator import itemgetter

In [None]:
import re

In [None]:
!pip install ftfy

In [None]:
import csv
import os
from collections import defaultdict
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from nltk.stem import PorterStemmer
from gensim.models import KeyedVectors
import ftfy
import string
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [None]:
# Seed both Python's `random` module and NumPy's global RNG with the same fixed value.
seed = 28
random.seed(seed)
np.random.seed(seed)

In [None]:
# Intended vocabulary size, not counting the three special tags
# (read_word_embedding uses it to size an array).
top_freq_word_to_use = 40000
# Size of each word vector / embedding.
embedding_dimension = 300
# Maximum number of tokens kept for a headline and for a description.
max_len_head = 25
max_len_desc = 50
# Length of one model input row: description part followed by headline part.
max_length = max_len_head + max_len_desc
# Number of stacked LSTM layers and number of units in each of them.
rnn_layers = 4
rnn_size = 128
# Number of leading LSTM output units that simple_context uses for attention energies.
activation_rnn_size = 16
# Vocabulary indexes reserved for the special tags <empty>, <eos> and <unk>.
empty_tag_location = 0
eos_tag_location = 1
unknown_tag_location = 2
# Learning rate written into the optimizer by create_model.
learning_rate = 1e-4
# During beam search, special tags are rejected at headline positions below this value.
min_head_line_gen = 10
# During beam search, a candidate is rejected if it is among this many most recent headline words.
dont_repeat_word_in_last = 5

In [None]:
# Global vocabulary state; read_word_embedding adds the pretrained words later.
word2vec = []
idx2word = {}
word2idx = {}

# Register the three special tags in both lookup directions.
word2idx['<empty>'] = empty_tag_location
word2idx['<eos>'] = eos_tag_location
word2idx['<unk>'] = unknown_tag_location
idx2word[empty_tag_location] = '<empty>'
idx2word[eos_tag_location] = '<eos>'
idx2word[unknown_tag_location] = '<unk>'

In [None]:
def read_word_embedding(file_name):
    """Load pretrained word2vec vectors and register their words in the vocabulary.

    Side effects: every loaded word is added to the module-level ``word2idx``
    and ``idx2word`` tables, numbered consecutively from 3 (indexes 0-2 are
    taken by the special tags).

    Args:
        file_name: path to a binary word2vec file.

    Returns:
        dict with one entry per special tag (keyed by the tag *string*, value a
        list of random floats) plus one entry per loaded word (keyed by the
        word's integer index, value its vector).
    """
    temp_word2vec_dict = {}

    # NOTE: the special tags are stored under their string names, not their
    # integer indexes, so code that looks rows up by integer will not find
    # them. They are kept this way so the dict length (and therefore the
    # vocabulary size derived from it by callers) stays unchanged.
    for tag in ('<empty>', '<eos>', '<unk>'):
        temp_word2vec_dict[tag] = [float(value) for value in np.random.rand(embedding_dimension)]

    # Only the most frequent words are read, to bound memory use.
    model = gensim.models.KeyedVectors.load_word2vec_format(
        file_name, binary=True, limit=top_freq_word_to_use)

    for idx, word in tqdm(enumerate(model.index_to_key, start=3)):
        temp_word2vec_dict[idx] = model[word]
        word2idx[word] = idx
        idx2word[idx] = word

    return temp_word2vec_dict

In [None]:
# Load the pretrained vectors; this call also fills word2idx / idx2word.
temp_word2vec_dict = read_word_embedding('../input/googlenewsvectors/GoogleNews-vectors-negative300.bin')
length_vocab = len(temp_word2vec_dict)
shape = (length_vocab, embedding_dimension)

# Start from uniformly random rows, then overwrite every row whose integer
# index is a key of the dict; rows without such a key keep their random values.
word2vec = np.random.uniform(low=-1, high=1, size=shape)
for i in range(length_vocab):
    if i in temp_word2vec_dict:
        word2vec[i, :] = temp_word2vec_dict[i]

In [None]:
def simple_context(X, mask):
    """Attention step used as the body of the model's Lambda layer.

    For every headline position it computes softmax weights over the
    description positions and returns the weighted description features
    concatenated with the headline's own features.

    Args:
        X: 3-D tensor; presumably (batch, max_length, rnn_size) LSTM output - TODO confirm.
        mask: 2-D tensor whose first ``max_len_desc`` columns mark valid description steps.

    Returns:
        Tensor concatenating the attended description features and the headline features.
    """
    # Split the time axis: first max_len_desc steps are the description, the rest the headline.
    desc, head = X[:, :max_len_desc, :], X[:, max_len_desc:, :]

    # Split the feature axis: the first activation_rnn_size units drive attention, the rest are content.
    head_activations, head_words = head[:, :, :activation_rnn_size], head[:, :, activation_rnn_size:]
    desc_activations, desc_words = desc[:, :, :activation_rnn_size], desc[:, :, activation_rnn_size:]

    # Dot product between every headline step and every description step.
    activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))


    # Add a very large negative number where the mask is 0 so those steps get ~0 weight after softmax.
    activation_energies = activation_energies + -1e20 * K.expand_dims(1. - K.cast(mask[:, :max_len_desc], 'float32'), 1)


    # Flatten to 2-D so the softmax runs over the description positions, then restore the 3-D shape.
    activation_energies = K.reshape(activation_energies, (-1, max_len_desc))
    activation_weights = K.softmax(activation_energies)
    activation_weights = K.reshape(activation_weights, (-1, max_len_head, max_len_desc))

    # Weighted sum of description content features for each headline position.
    desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))
    return K.concatenate((desc_avg_word, head_words))

In [None]:
def output_shape_simple_context_layer(input_shape):
    """Return the output shape of the simple-context Lambda layer.

    The batch dimension is copied from ``input_shape``; the time dimension is
    ``max_len_head`` and the feature dimension is twice the number of
    non-attention LSTM units.
    """
    batch_dim = input_shape[0]
    feature_dim = (rnn_size - activation_rnn_size) * 2
    return (batch_dim, max_len_head, feature_dim)

In [None]:
def create_model():
    """Build and compile the headline-generation network.

    Architecture: embedding -> ``rnn_layers`` stacked LSTMs -> simple-context
    attention (Lambda) -> per-timestep dense layer over the vocabulary ->
    softmax. The optimizer's learning rate is set to ``learning_rate``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Vocabulary = pretrained words plus the three special tags
    # (<empty>, <eos>, <unk>); defined once so both layers always agree.
    vocab_size = top_freq_word_to_use + 3

    model = Sequential()

    model.add(
        Embedding(
            vocab_size, embedding_dimension,
            input_length=max_length,
            mask_zero=True,
            name='embedding_layer'
        )
    )

    for i in range(rnn_layers):
        model.add(LSTM(rnn_size, return_sequences=True,
                       name='lstm_layer_%d' % (i + 1)))

    # The layer's output mask only covers the headline part of the sequence.
    model.add(Lambda(simple_context,
                     mask=lambda inputs, mask: mask[:, max_len_desc:],
                     output_shape=output_shape_simple_context_layer,
                     name='simple_context_layer'))

    model.add(TimeDistributed(Dense(vocab_size,
                                    name='time_distributed_layer')))

    model.add(Activation('softmax', name='activation_layer'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    K.set_value(model.optimizer.lr, np.float32(learning_rate))
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

In [None]:
# Build the network with freshly initialised weights.
model = create_model()

### Tagging

In [None]:
def padding(list_idx, curr_max_length, is_left):
    """Pad ``list_idx`` with the empty tag up to ``curr_max_length`` entries.

    Lists that are already long enough are returned unchanged (same object).
    ``is_left`` selects whether the filler goes before or after the content.
    """
    missing = curr_max_length - len(list_idx)
    if missing <= 0:
        return list_idx
    filler = [empty_tag_location] * missing
    return filler + list_idx if is_left else list_idx + filler

In [None]:
def headline2idx(list_idx, curr_max_length, is_input):
    """Shape a headline's word indexes for use as model input or as target.

    As model input the result has ``curr_max_length - 1`` entries: long
    headlines are truncated, short ones get an <eos> tag and right padding.
    As target the result has ``curr_max_length`` entries and always contains
    an <eos> tag (a full-length list has its last entry overwritten in place).
    """
    if not is_input:
        if len(list_idx) == curr_max_length:
            list_idx[-1] = eos_tag_location
            return list_idx
        return padding(list_idx + [eos_tag_location], curr_max_length, False)

    input_length = curr_max_length - 1
    if len(list_idx) >= input_length:
        return list_idx[:input_length]
    return padding(list_idx + [eos_tag_location], input_length, False)

In [None]:
def desc2idx(list_idx, curr_max_length):
    """Shape a description's word indexes for use as model input.

    The indexes are reversed, left-padded with the empty tag up to
    ``curr_max_length`` and terminated with an <eos> tag.

    Args:
        list_idx: word indexes of the description (not modified).
        curr_max_length: number of slots before the trailing <eos> tag.

    Returns:
        A new list of word indexes.
    """
    # Reverse into a copy so the caller's list is left untouched.
    reversed_idx = list(reversed(list_idx))
    return padding(reversed_idx, curr_max_length, True) + [eos_tag_location]

In [None]:
def sentence2idx(sentence, is_headline, curr_max_length, is_input=True):
    """Convert a space-separated sentence into a fixed-length list of word indexes.

    Tokens missing from ``word2idx`` map to the <unk> index. Tokens beyond
    ``curr_max_length`` are ignored (the first token is always consumed).
    The result is then laid out by ``headline2idx`` or ``desc2idx``.
    """
    tokens = sentence.split(" ")
    # The first token is always processed, even for a non-positive limit.
    kept_tokens = tokens[:max(curr_max_length, 1)]
    list_idx = [
        word2idx[token] if token in word2idx else word2idx['<unk>']
        for token in kept_tokens
    ]

    if not is_headline:
        return desc2idx(list_idx, curr_max_length)
    return headline2idx(list_idx, curr_max_length, is_input)

In [None]:
def flip_words_randomly(description_headline_data, number_words_to_replace, model):
    """Replace some input headline words with the model's own predictions.

    For every row, ``number_words_to_replace`` headline positions are sampled;
    each sampled position holding a real word is overwritten with the model's
    most probable word for it, unless that word is the empty or <eos> tag.

    Args:
        description_headline_data: 2-D integer array of model input rows.
        number_words_to_replace: positions sampled per row; ``<= 0`` disables flipping.
        model: model used for prediction; ``None`` disables flipping.

    Returns:
        The input unchanged when flipping is disabled, otherwise a modified copy.
    """
    if number_words_to_replace <= 0 or model is None:
        return description_headline_data

    # Every row must carry the <eos> tag that separates description and headline.
    assert np.all(description_headline_data[:, max_len_desc] == eos_tag_location)

    batch_size = len(description_headline_data)
    predicted_headline_word_idx = model.predict(description_headline_data, verbose=1, batch_size=batch_size)
    copy_data = description_headline_data.copy()
    special_tags = (empty_tag_location, eos_tag_location)
    for idx in range(batch_size):
        # Candidate positions are the headline slots after the separator tag.
        random_flip_pos = sorted(random.sample(range(max_len_desc + 1, max_length), number_words_to_replace))
        for replace_idx in random_flip_pos:
            # Never overwrite padding or the end tag.
            if description_headline_data[idx, replace_idx] in special_tags:
                continue

            # Map the input position onto the model's output time step.
            new_id = replace_idx - (max_len_desc + 1)
            word_idx = predicted_headline_word_idx[idx, new_id].argmax()

            # Never insert padding or the end tag.
            if word_idx in special_tags:
                continue
            copy_data[idx, replace_idx] = word_idx
    return copy_data

In [None]:
def convert_inputs(descriptions, headlines, number_words_to_replace, model, is_training):
    """Turn parallel lists of descriptions and headlines into model arrays.

    Each input row is the description indexes followed by the input headline
    indexes. In training mode the rows are passed through
    ``flip_words_randomly`` and the target headlines are one-hot encoded; in
    the other mode the raw ``headlines`` are returned next to the inputs.

    Returns:
        ``(X, Y)`` when ``is_training`` is true, otherwise ``(X, headlines)``.
    """
    assert len(descriptions) == len(headlines)

    input_rows, target_rows = [], []
    for desc_text, head_text in zip(descriptions, headlines):
        head_in = sentence2idx(head_text, True, max_len_head, True)
        head_out = sentence2idx(head_text, True, max_len_head, False)
        desc_in = sentence2idx(desc_text, False, max_len_desc)

        assert len(head_in) == max_len_head - 1
        assert len(head_out) == max_len_head
        assert len(desc_in) == max_len_desc + 1

        input_rows.append(desc_in + head_in)
        target_rows.append(head_out)

    X = np.array(input_rows)
    y = np.array(target_rows)
    if not is_training:
        return X,headlines

    X = flip_words_randomly(X, number_words_to_replace, model)

    # One-hot encode every target word over the whole vocabulary.
    vocab_size = word2vec.shape[0]
    Y = np.zeros((len(headlines), max_len_head, vocab_size))
    for row, target_row in enumerate(y):
        Y[row, :, :] = np_utils.to_categorical(target_row, vocab_size)
    assert len(X)==len(Y)
    return X, Y

In [None]:
def shuffle_file(file_name):
    """Shuffle the lines of ``file_name`` in place with the ``shuf`` command.

    Best effort: failures are reported on stdout and never raised.
    """
    try:
        subprocess.check_output(['shuf', file_name, "--output=" + file_name])
    except OSError:
        # The executable could not be started (e.g. it is not installed).
        print ("shuf command not available!")
    except subprocess.CalledProcessError as error:
        # shuf ran but failed, e.g. because the input file does not exist.
        print ("shuf failed with exit code {}".format(error.returncode))
    else:
        print ("Input file shuffled")


In [None]:
def large_file_reading_generator(data):
    """Yield the ``(key, value)`` pairs of ``data`` over and over, forever.

    Args:
        data: any object with an ``items()`` method (e.g. a dict).

    Raises:
        ValueError: when ``data`` has no items; without this check the
            generator would loop forever without ever yielding.
    """
    while True:
        yielded_any = False
        for each_line in data.items():
            yielded_any = True
            yield each_line
        if not yielded_any:
            raise ValueError("cannot cycle over empty data")

In [None]:
def data_generator(file_name,number_words_to_replace,model,is_training=True,batch_size=128):
    """Yield converted batches from a pickled data file, forever.

    The pickle must provide ``'heads'`` and ``'descs'`` entries that expose
    ``items()``; the value of each item is used as the text.

    Args:
        file_name: path of the pickle file.
        number_words_to_replace: forwarded to ``convert_inputs``.
        model: forwarded to ``convert_inputs``.
        is_training: forwarded to ``convert_inputs``.
        batch_size: number of samples per yielded batch.

    Yields:
        Whatever ``convert_inputs`` returns for each batch.
    """
    # NOTE(security): unpickling executes arbitrary code - only load trusted files.
    with open(file_name,'rb') as file_pointer:
        data = pickle.load(file_pointer)
    headline_iterator = large_file_reading_generator(data['heads'])
    descs_iterator = large_file_reading_generator(data['descs'])
    while True:
        X, y = [], []
        for _ in range(batch_size):
            # Each item is a (key, value) pair; only the text value is needed.
            heads_line = next(headline_iterator)[1]
            descs_line = next(descs_iterator)[1]
            X.append(descs_line)
            y.append(heads_line)
        yield convert_inputs(X, y, number_words_to_replace, model,is_training)

In [None]:
def OHE_to_indexes(y_val):
    """Convert one-hot encoded headlines back to lists of word indexes.

    Each element of ``y_val`` is a 2-D one-hot matrix; the result holds, per
    headline, the column index of every cell equal to 1.
    """
    return [
        list(np.nonzero(np.array(one_hot) == 1)[1])
        for one_hot in y_val
    ]

In [None]:
def indexes_to_words(list_of_headline):
    """Map lists of word indexes to lists of words, dropping the special tags."""
    special_tags = (empty_tag_location, eos_tag_location, unknown_tag_location)
    return [
        [idx2word[word_idx] for word_idx in headline if word_idx not in special_tags]
        for headline in list_of_headline
    ]

In [None]:
def blue_score_text(y_actual, y_predicted):
    """Return the mean sentence-level BLEU score of predictions against references.

    Pairs in which either side is empty contribute a score of 0. When the
    shorter side has fewer than four words, uniform n-gram weights up to that
    length are used instead of the default 4-gram weights.

    Args:
        y_actual: list of reference headlines, each a list of words.
        y_predicted: list of predicted headlines, each a list of words.

    Returns:
        Mean BLEU score over all pairs; 0.0 when there are no pairs.
    """
    assert len(y_actual) ==  len(y_predicted)
    no_of_news = len(y_actual)
    # Guard against division by zero on empty input.
    if no_of_news == 0:
        return 0.0

    blue_score = 0.0
    for reference, hypothesis in zip(y_actual, y_predicted):
        weights=(0.25, 0.25, 0.25, 0.25)
        min_len_present = min(len(reference),len(hypothesis))
        if min_len_present==0:
            continue
        if min_len_present<4:
            # Higher-order n-grams cannot exist for such short texts.
            weights=[1.0/min_len_present,]*min_len_present

        blue_score = blue_score + sentence_bleu([reference],hypothesis,weights=weights)

    return blue_score/float(no_of_news)

In [None]:
def blue_score_calculator(model, validation_file_name, no_of_validation_sample, validation_step_size):
    """Return the mean per-batch BLEU score of ``model`` on a validation file.

    Args:
        model: model exposing ``predict``.
        validation_file_name: pickle file passed to ``data_generator``.
        no_of_validation_sample: number of validation samples to cover.
        validation_step_size: prediction batch size; also used to derive the
            number of generator batches to evaluate.

    Returns:
        Average of the batch BLEU scores.
    """
    # No word flipping during evaluation.
    number_words_to_replace=0
    temp_gen = data_generator(validation_file_name,number_words_to_replace, model)

    total_blue_score = 0.0
    blue_batches = 0
    blue_number_of_batches = no_of_validation_sample / validation_step_size
    for X_val, y_val in temp_gen:
        predict_x = model.predict(X_val, batch_size = validation_step_size,verbose = 1)
        # The word is the arg-max over the vocabulary (last) axis; reducing
        # over axis 1 would pick a time step instead of a word.
        y_predicted = np.argmax(predict_x,axis=-1)

        y_predicted_words = indexes_to_words(y_predicted)
        list_of_word_headline = indexes_to_words(OHE_to_indexes(y_val))
        assert len(y_val)==len(list_of_word_headline)

        total_blue_score = total_blue_score + blue_score_text(list_of_word_headline, y_predicted_words)

        blue_batches += 1
        if blue_batches >=  blue_number_of_batches:
            break
        if blue_batches%10==0:
            print ("eval for {} out of {}".format(blue_batches, blue_number_of_batches))

    del temp_gen
    return total_blue_score/float(blue_batches)

In [None]:
def train(model,data_file,val_file,train_size,val_size,val_step_size,epochs,words_replace_count,model_weights_file_name):
    """Train ``model`` batch by batch and keep the weights with the best BLEU score.

    After every epoch the validation BLEU score is computed; the weights are
    saved to ``model_weights_file_name`` whenever the score improves, and the
    list of scores so far is pickled to ``blue_scores.pickle``.
    """
    # Resume from earlier weights when the file already exists.
    if os.path.isfile(model_weights_file_name):
        print ("loading weights already present in {}".format(model_weights_file_name))
        model.load_weights(model_weights_file_name)
        print ("model weights loaded for further training")
            
    train_data = data_generator(data_file,words_replace_count, model)
    blue_scores = []
    best_blue_score_track = -1.0
    # One epoch covers train_size samples in batches of 128.
    number_of_batches = math.ceil(train_size / float(128))
        
    for each_epoch in range(epochs):
        print ("running for epoch ",each_epoch)
        start_time = time.time()
        batches = 0
        # The generator never ends, so the epoch is cut off by the batch counter.
        for X_batch, Y_batch in train_data:
            model.fit(X_batch,Y_batch,batch_size=128,epochs=1)
            batches += 1
            if batches >= number_of_batches :
                break
            if batches%10==0:
                print ("training for {} out of {} for epoch {}".format(batches, number_of_batches, each_epoch))
                    
        end_time = time.time()
        print("time to train epoch ",end_time-start_time)

        # Checkpoint only when the validation score improves.
        blue_score_now = blue_score_calculator(model,val_file,val_size,val_step_size)
        blue_scores.append(blue_score_now)
        if best_blue_score_track < blue_score_now:
            best_blue_score_track = blue_score_now
            print ("saving model for blue score ",best_blue_score_track)
            model.save_weights(model_weights_file_name)
                
        # Rewritten after every epoch with the full score history so far.
        with open("blue_scores.pickle", "wb") as output_file:
            pickle.dump(blue_scores, output_file)

In [None]:
def is_headline_end(word_index_list,current_prediction_position):
    """Tell whether headline generation is finished at the given position.

    Args:
        word_index_list: sequence of word indexes (may be ``None`` or empty).
        current_prediction_position: index of the most recently predicted word.

    Returns:
        ``True`` when the position has reached ``max_length`` or holds the
        <eos> tag; ``False`` otherwise, including for a missing or empty sequence.
    """
    if (word_index_list is None) or (len(word_index_list)==0):
        return False
    # Check the bound first so an out-of-range position is never indexed.
    if current_prediction_position>=max_length:
        return True
    if word_index_list[current_prediction_position]==eos_tag_location:
        return True
    return False

In [None]:
def process_word(prediction,word_position_index,top_k,X,prev_layer_log_prob):
    """Expand one beam by one headline word.

    Args:
        prediction: model output; only its first element is used.
        word_position_index: headline position being predicted.
        top_k: maximum number of candidates to return.
        X: current input sequence of the beam.
        prev_layer_log_prob: accumulated log probability of the beam so far.

    Returns:
        List of up to ``top_k`` tuples ``(log_probability, next_input)`` in
        order of decreasing probability.
    """
    prediction = prediction[0]
    prediction_at_word_index = prediction[word_position_index]
    # argsort is ascending; reverse it to visit the most probable words first.
    sorted_arg = prediction_at_word_index.argsort()
    top_probable_indexes = sorted_arg[::-1]
    top_probabilities = np.take(prediction_at_word_index,top_probable_indexes)
    log_probabilities = np.log(top_probabilities)
    # log(0) gives -inf; replace it with a large finite negative value.
    log_probabilities[log_probabilities == -inf] = -sys.maxsize - 1
    log_probabilities = log_probabilities + prev_layer_log_prob
    assert len(log_probabilities)==len(top_probable_indexes)
    
    # Index in X at which the newly chosen word is placed.
    offset = max_len_desc+word_position_index+1
    ans = []
    count = 0 
    for i,j in zip(log_probabilities, top_probable_indexes):
        # Skip words that already occur among the most recent headline words.
        if j in X[max_len_desc+1:offset][-dont_repeat_word_in_last:]:
            continue
        # Early headline positions may not hold a special tag.
        if (word_position_index < min_head_line_gen) and (j in [empty_tag_location, unknown_tag_location, eos_tag_location]):
            continue
            
        # Keep everything before the new position and append the candidate word.
        next_input = np.concatenate((X[:offset], [j,]))
        next_input = next_input.reshape((1,next_input.shape[0]))
        
        # Fill the rest of the sequence with the empty tag up to max_length.
        if offset!=max_length:
            next_input = sequence.pad_sequences(next_input, maxlen=max_length, value=empty_tag_location, padding='post', truncating='post')
        next_input = next_input[0]
        ans.append((i,next_input))
        count = count + 1
        if count>=top_k:
            break
    return ans

In [None]:
def beam_search(model,X,top_k):
    """Generate headline candidates for one input row using beam search.

    Args:
        model: model exposing ``predict``.
        X: 1-D input sequence.
        top_k: beam width and maximum number of returned candidates.

    Returns:
        Up to ``top_k`` finished candidates as ``(log_probability, sequence)``
        tuples, sorted by decreasing log probability.
    """
    prev_word_index_top_k = []
    curr_word_index_top_k = []
    done_with_pred = []
    # The model expects a batch, so wrap the single row.
    data = X.reshape((1,X.shape[0]))
    prediction = model.predict(data,verbose=0)
    
    # Seed the beams with the candidates for the first headline position.
    prev_word_index_top_k = process_word(prediction,0,top_k,X,0.0)
        
    for i in range(1,max_len_head):
        # Expand every surviving beam by one word.
        for j in range(len(prev_word_index_top_k)):
            probability_now, current_intput = prev_word_index_top_k[j]
            data = current_intput.reshape((1,current_intput.shape[0]))
            prediction = model.predict(data,verbose=0)
            next_top_k_for_curr_word = process_word(prediction,i,top_k,current_intput,probability_now)
            curr_word_index_top_k = curr_word_index_top_k + next_top_k_for_curr_word
                
        # Keep only the top_k most probable expansions.
        curr_word_index_top_k = sorted(curr_word_index_top_k,key=itemgetter(0),reverse=True)
        prev_word_index_top_k_temp = curr_word_index_top_k[:top_k]
        curr_word_index_top_k = []
        prev_word_index_top_k = []
        # Finished beams are set aside; the others continue in the next round.
        for each_proba, each_word_idx_list in prev_word_index_top_k_temp:
            offset = max_len_desc+i+1
            if is_headline_end(each_word_idx_list,offset):
                done_with_pred.append((each_proba, each_word_idx_list))
            else:
                prev_word_index_top_k.append((each_proba,each_word_idx_list))
            
    done_with_pred = sorted(done_with_pred,key=itemgetter(0),reverse=True)
    done_with_pred = done_with_pred[:top_k]
    return done_with_pred

In [None]:
def test(model, data_file_name, no_of_testing_sample, model_weights_file_name,top_k,output_file,seperator='#|#'):
    """Generate headlines for test samples and write them to ``output_file``.

    One line is written per sample: the actual headline, ``seperator`` and the
    generated headline. Every sample of every generated batch is decoded
    until ``no_of_testing_sample`` lines have been written.

    Args:
        model: model to evaluate; its weights are loaded from ``model_weights_file_name``.
        data_file_name: pickle file passed to ``data_generator``.
        no_of_testing_sample: number of samples to decode.
        model_weights_file_name: weights file to load.
        top_k: beam width for ``beam_search``.
        output_file: path of the UTF-8 text file to write.
        seperator: text placed between actual and generated headline.
    """
    model.load_weights(model_weights_file_name)
    print ("model weights loaded")
    test_data_generator = data_generator(data_file_name, number_words_to_replace=0, model=None,is_training=False)

    with codecs.open(output_file, 'w',encoding='utf8') as f:
        if no_of_testing_sample <= 0:
            return
        samples_written = 0
        for X_batch, Y_batch in tqdm(test_data_generator):
            # Decode every sample of the batch, not just the first one.
            for X, Y in zip(X_batch, Y_batch):
                assert X[max_len_desc]==eos_tag_location
                # Blank the headline slots so only the description is given.
                X[max_len_desc+1:]=empty_tag_location
                result = beam_search(model,X,top_k)
                if result:
                    # Cut off the description part before the special tags are
                    # dropped; afterwards the offset would no longer be valid.
                    headline_indexes = result[0][1][max_len_desc+1:]
                    headline = u" ".join(indexes_to_words([headline_indexes])[0])
                else:
                    headline = u""
                f.write(Y+seperator+headline+"\n")
                samples_written += 1
                if samples_written >= no_of_testing_sample:
                    break

            if samples_written >= no_of_testing_sample:
                break

# Load data

In [None]:
# English stop-word list consulted by remove_stopwords.
stop = stopwords.words('english')

In [None]:
def tokenize(document):
    """Return the result of NLTK's ``word_tokenize`` for ``document``."""
    return word_tokenize(document)

In [None]:
def lower_case(tokens):
    """Return a new list with every token converted to lower case."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered
def remove_punctuation(tokens):
    """Return a new list in which each token has lost every character that is
    neither a word character nor whitespace."""
    non_word = re.compile(r'[^\w\s]')
    cleaned = []
    for token in tokens:
        cleaned.append(non_word.sub('', token))
    return cleaned
def remove_stopwords(tokens):
    """Return the tokens that are not contained in the ``stop`` list."""
    kept = []
    for token in tokens:
        if token in stop:
            continue
        kept.append(token)
    return kept

In [None]:
from bs4 import BeautifulSoup

In [None]:
def remove_url(data):
    """Return the sentences with every http(s) URL removed.

    A URL is taken to be ``http://`` or ``https://`` followed by all
    non-whitespace characters up to the next whitespace. Surrounding
    whitespace is left as is.
    """
    url_pattern = re.compile(r'https?://\S+')
    return [url_pattern.sub('', sentence) for sentence in data]
def remove_html(data):
    """Return the text content of each sentence as extracted by BeautifulSoup's
    ``html.parser``."""
    plain_texts = []
    for sentence in data:
        soup = BeautifulSoup(sentence, 'html.parser')
        plain_texts.append(soup.get_text())
    return plain_texts
def remove_bracket(data):
    """Return the sentences with all round, square and curly brackets removed."""
    bracket = re.compile(r'[\([{})\]]')
    stripped = []
    for sentence in data:
        stripped.append(bracket.sub('', sentence))
    return stripped
def remove_digit(data):
    """Return the sentences with the characters 0-9 removed."""
    digit = re.compile('[0-9]')
    stripped = []
    for sentence in data:
        stripped.append(digit.sub('', sentence))
    return stripped
def remove_underscore(data):
    """Return the sentences with every underscore removed."""
    stripped = []
    for sentence in data:
        stripped.append(sentence.replace("_",""))
    return stripped

In [None]:
def process_data(text):
    """Run all sentence-level cleaning steps over a list of sentences.

    The steps are applied in this order: URL removal, HTML removal, bracket
    removal, digit removal, underscore removal.

    Args:
        text: list of sentences.

    Returns:
        The list of cleaned sentences.
    """
    cleaning_steps = (remove_url, remove_html, remove_bracket, remove_digit, remove_underscore)
    for step in cleaning_steps:
        text = step(text)
    # Hand the result back; without this the cleaned text would be lost.
    return text

In [None]:
def create_data_file(df, file_name):
    """Clean the titles and contents of ``df`` and pickle them as a new frame.

    Rows with missing values are dropped, text encoding problems are fixed
    with ``ftfy``, and the text is tokenized, stripped of punctuation,
    lower-cased and filtered for stop words.

    Args:
        df: DataFrame with at least the columns ``title`` and ``content``.
        file_name: path of the pickle file to write.

    Returns:
        DataFrame with the columns ``heads`` and ``descs``, or ``None`` when a
        value could not be processed (the offending value is printed).
    """
    # The url column is not needed; removing it first also keeps rows that
    # merely lack a url from being discarded by dropna below.
    df = df.drop(columns=['url'], errors='ignore')
    df = df.dropna(how='any')
    heads = df['title']
    descs = df['content']
    print("preprocessed")
    title_list = []
    content_list = []
    for raw_title in heads:
        try:
            # Keep the repaired text, not the raw one.
            title_list.append(ftfy.fix_text(raw_title))
        except Exception:
            print(raw_title)
            return None
    for raw_content in descs:
        try:
            content_list.append(ftfy.fix_text(raw_content))
        except Exception:
            print(raw_content)
            return None
    print("Beginning tokenization")
    tokenized_title = [tokenize(title) for title in title_list]
    tokenized_content = [tokenize(content) for content in content_list]
    tokenized_title = [remove_punctuation(title) for title in tokenized_title]
    tokenized_content = [remove_punctuation(content) for content in tokenized_content]
    print("Done tokenization")
    print("Beginning filtering")
    filtered_title = [remove_stopwords(lower_case(title)) for title in tokenized_title]
    filtered_content = [remove_stopwords(lower_case(content)) for content in tokenized_content]
    print("Done filtering")
    # Tokens that are contained in string.punctuation (this includes the empty
    # tokens left behind by remove_punctuation) are left out of the joined text.
    title_new = [' '.join(c for c in s if c not in string.punctuation) for s in filtered_title]
    content_new = [' '.join(c for c in s if c not in string.punctuation) for s in filtered_content]
    print("Cleared Punc")
    final_df = pd.DataFrame(
    {'heads': title_new,
     'descs': content_new,
    })
    print("Writing to file")
    final_df.to_pickle(file_name)
    return final_df

In [None]:
def write_to_file():
    """Split the preprocessed articles into train, validation and test pickles.

    Rows 0-7999 go to ``train_data.pkl``, rows 8000-9999 to
    ``validation_data.pkl`` and rows 10000-11999 to ``test_data.pkl``.
    """
    df = pd.read_pickle('../input/preproc-articles/article1.pickle')
    print("created final df")
    row_ranges = (
        ('train_data.pkl', slice(None, 8000)),
        ('validation_data.pkl', slice(8000, 10000)),
        ('test_data.pkl', slice(10000, 12000)),
    )
    parts = [(target, df.iloc[rows]) for target, rows in row_ranges]
    print("writing to files")
    for target, part in parts:
        part.to_pickle(target)

In [None]:
def load_data(file_path, output_name):
    """Read a CSV file of articles, preprocess it and pickle the result.

    Args:
        file_path: path of the CSV file to read.
        output_name: path of the pickle file written by ``create_data_file``.

    Returns:
        Whatever ``create_data_file`` returns for the loaded frame.
    """
    df = pd.read_csv(file_path)
    return create_data_file(df, output_name)

In [None]:
# load_data('../input/all-the-news/articles1.csv', 'article1.pickle')

In [None]:
# Create train_data.pkl, validation_data.pkl and test_data.pkl in the working directory.
write_to_file()

In [None]:
# Articles used for the prediction demo further below.
df = pd.read_csv('../input/all-the-news/articles2.csv')

In [None]:
# Keep only the two columns read by the prediction demo.
df = df[['content', 'title']]

In [None]:
df.head()

In [None]:
# train(model=model, 
#     data_file='./train_data.pkl', 
#     val_file='./validation_data.pkl',
#     train_size=8000, 
#     val_size=2000,
#     val_step_size=128,
#     epochs=5,
#     words_replace_count=5,
#     model_weights_file_name='model_weights.h5')

In [None]:
# Rebuild the network; the next cell loads saved weights into it.
model = create_model()

In [None]:
# Load previously saved weights from the input dataset.
model.load_weights('../input/preproc-articles/model_weights.h5')

In [None]:
# test(model=test_model,
#     data_file_name='test_data.pkl',
#     no_of_testing_sample= 2000,
#     model_weights_file_name='model_weights.h5',
#     top_k=5,
#     output_file='test_output.txt')

In [None]:
def predict_convert_inputs(descriptions, headlines):
    """Convert one description/headline pair into a single-row model input.

    Returns:
        Tuple of the input array (one row: description indexes followed by the
        input headline indexes) and ``headlines`` exactly as passed in.
    """
    head_in = sentence2idx(headlines, True, max_len_head, True)
    head_out = sentence2idx(headlines, True, max_len_head, False)
    desc_in = sentence2idx(descriptions, False, max_len_desc)

    # Sanity checks on the fixed layout of the three parts.
    assert len(head_in) == max_len_head - 1
    assert len(head_out) == max_len_head
    assert len(desc_in) == max_len_desc + 1

    X = np.array([desc_in + head_in])
    print("Convert descs and headlines to numpy arrays")
    return X,headlines

In [None]:
def predict_process_word(prediction,word_position_index,top_k,X,prev_layer_log_prob):
    """Expand one beam by one headline word.

    This used to be a line-for-line copy of ``process_word``; it now delegates
    to it so the candidate-selection rules live in one place.

    Args:
        prediction: model output; only its first element is used.
        word_position_index: headline position being predicted.
        top_k: maximum number of candidates to return.
        X: current input sequence of the beam.
        prev_layer_log_prob: accumulated log probability of the beam so far.

    Returns:
        List of up to ``top_k`` tuples ``(log_probability, next_input)``.
    """
    return process_word(prediction, word_position_index, top_k, X, prev_layer_log_prob)

In [None]:
def predict_beam_search(X,top_k):
    """Run beam search for one input row with the module-level ``model``.

    This used to duplicate ``beam_search`` except for reading the global
    model; it now delegates to it.

    Args:
        X: 1-D input sequence.
        top_k: beam width and maximum number of returned candidates.

    Returns:
        Up to ``top_k`` finished candidates as ``(log_probability, sequence)``
        tuples, sorted by decreasing log probability.
    """
    return beam_search(model, X, top_k)

In [None]:
def predict_title(descriptions,headlines,top_k,seperator='#|#'):
    """Generate a title for a single description with the global model.

    Args:
        descriptions: description text.
        headlines: headline text; used only to build the input row.
        top_k: beam width.
        seperator: not used by this function.

    Returns:
        The generated words joined by single spaces.
    """
    X,Y = predict_convert_inputs(descriptions, headlines)
    X = X[0]
    # Y is `headlines` as passed in, so Y[0] is its first element; it is not used afterwards.
    Y = Y[0]
    assert X[max_len_desc]==eos_tag_location
    # Blank the headline slots so only the description is given to the model.
    X[max_len_desc+1:]=empty_tag_location
    result = predict_beam_search(X,top_k)
    list_of_word_indexes = []
    # Most probable finished beam: the whole sequence, description part included.
    list_of_word_indexes = result[0][1]
    # Special tags are dropped here, so positions no longer match the input layout.
    list_of_words = indexes_to_words([list_of_word_indexes])[0]
    # NOTE(review): the word list is reversed and its first 10 entries are dropped;
    # the reason for this is not evident from the code - confirm the intent.
    list_of_words.reverse()
    list_of_words = list_of_words[10:]
    print(list_of_words)
    headline = " ".join(list_of_words)
    return str(headline)

In [None]:
# Demo: generate a title for the article at label 7 using a beam width of 2.
predict_title(descriptions=df['content'][7],
              headlines=df['title'][7],
    top_k=2)

In [None]:
df['content'][7]