In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import codecs, json

In [None]:
# CSV file that the next cell reads into `df`.
path = "/kaggle/input/summarization-set/full_cleaned.csv"

In [None]:
# Load the dataset and remember how many rows it had before any filtering.
df = pd.read_csv(path)
original_dataset_size = len(df["news"])

# The preview goes last: a notebook cell only renders its final expression,
# so a `df.head()` placed mid-cell is silently discarded.
df.head()

In [None]:
# Missing values per column, reported before any row is removed.
print(df.isnull().sum())


def _word_count(text):
    """Return the number of whitespace-separated words in `text`."""
    return len(text.split())


df = (
    df
    # pandas reads a missing title/body as NaN (a float), which has no
    # .split(); drop those rows first so the word counts below cannot raise.
    .dropna(subset=['title', 'news'])
    .drop_duplicates(subset=['title', 'news'])
    .assign(
        title_length=lambda frame: frame["title"].apply(_word_count),
        news_length=lambda frame: frame["news"].apply(_word_count),
    )
    # Keep titles longer than 2 words and articles of 30 to 400 words (inclusive).
    .loc[lambda frame: (frame["title_length"] > 2) & frame["news_length"].between(30, 400)]
    # Own the data so later column assignments do not write into a slice.
    .copy()
)

In [None]:
# Load the settings stored as JSON next to the saved model artefacts.
# Keys read later in this notebook: "max_title_length", "max_news_length", "latent_dim".
with codecs.open('/kaggle/input/model-output/kaggle/working/constants.json', encoding='utf-8') as const:
    CONSTANTS = json.load(const)
# Show the loaded settings.
CONSTANTS

In [None]:
def _keep_first_words(text, limit):
    """Return `text` cut down to its first `limit` whitespace-separated words."""
    return " ".join(text.split()[:limit])


# Truncate titles and articles to the word budgets stored in CONSTANTS.
df["title_cut"] = df["title"].apply(lambda text: _keep_first_words(text, CONSTANTS["max_title_length"]))
df["news_cut"] = df["news"].apply(lambda text: _keep_first_words(text, CONSTANTS["max_news_length"]))

In [None]:
# Wrap every truncated title in the start/end markers the decoder expects.
# NOTE(review): re-running this cell wraps the titles a second time.
df["title_cut"] = 'sos ' + df["title_cut"] + ' eos'

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Helper columns that belong to neither the model inputs nor the targets.
_unused_columns = ['title_length', 'news_length', 'category']
news_features = df.drop(columns=['title', 'title_cut'] + _unused_columns)
title_targets = df.drop(columns=['news', 'news_cut'] + _unused_columns)

# 80 % of the rows go to training; the other 20 % are held out for validation/test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    news_features,
    title_targets,
    test_size=0.2,
    random_state=21,
    shuffle=True,
)

In [None]:
# Split the held-out rows in half: one half for validation, one for testing.
# The fixed random_state keeps the split reproducible between runs.
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=21, shuffle=True)

In [None]:
# Report the size of each split and its share of the filtered dataset.
dataset_size = len(df['title'])
for split_name, split_size in (
    ("dataset", dataset_size),
    ("training set", len(X_train['news'])),
    ("validation set", len(X_val['news'])),
    ("test set", len(X_test['news'])),
):
    print(f"Length of {split_name}: {split_size}, in %: {split_size / dataset_size * 100}")

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
import json, codecs
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _load_tokenizer(json_path):
    """Rebuild a Keras tokenizer from the JSON file stored at `json_path`."""
    with codecs.open(json_path, encoding='utf-8') as handle:
        serialized = json.load(handle)
    return tf.keras.preprocessing.text.tokenizer_from_json(serialized)


# Article (input) and title (target) tokenizers saved with the model artefacts.
X_tokenizer = _load_tokenizer('/kaggle/input/model-output/kaggle/working/X_tokenizer.json')
y_tokenizer = _load_tokenizer('/kaggle/input/model-output/kaggle/working/target_tokenizer.json')

In [None]:
# Spot check: decode a few hand-picked ids with the title tokenizer.
# NOTE(review): 100000 and 0 presumably probe out-of-vocabulary and padding ids - confirm.
y_tokenizer.sequences_to_texts([[10, 12, 100, 100000, 10, 0]])

In [None]:
# Turn the truncated test articles into word-id sequences, then right-pad each
# one with zeros up to the configured article length.
X_test_seq = X_tokenizer.texts_to_sequences(X_test["news_cut"])
X_test_pad_seq = pad_sequences(
    X_test_seq,
    maxlen=CONSTANTS["max_news_length"],
    padding='post',
)

In [None]:
# Article vocabulary size; the +1 accounts for id 0, which the Keras tokenizer
# reserves and never assigns to a word.
X_voc_size = len(X_tokenizer.word_index) + 1
X_voc_size

In [None]:
# Turn the sos/eos-wrapped test titles into word-id sequences, then right-pad
# each one with zeros up to the configured title length.
y_test_seq = y_tokenizer.texts_to_sequences(y_test["title_cut"])
y_test_padded_seq = pad_sequences(
    y_test_seq,
    maxlen=CONSTANTS["max_title_length"],
    padding='post',
)

In [None]:
# Title vocabulary size; the +1 accounts for id 0, which the Keras tokenizer
# reserves and never assigns to a word.
y_voc_size = len(y_tokenizer.word_index) + 1
y_voc_size

In [None]:
import pickle

In [None]:
# Load the pickled training history; the plots below index it by
# 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
# NOTE(review): pickle.load can execute arbitrary code - only load files you produced yourself.
with open('/kaggle/input/model-output/kaggle/working/train_history.pkl', "rb") as hist:
    history = pickle.load(hist)

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history['accuracy'])
ax.plot(history['val_accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(history['loss'])
ax.plot(history['val_loss'])
ax.set_title('training loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM, Input, Dense, TimeDistributed, Embedding, Concatenate
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input

In [None]:
# !unzip "/kaggle/input/pretrained/file.zip"

In [None]:
# Load the trained headline-generation model; its layers are looked up by name
# further down to assemble the inference encoder and decoder.
model = tf.keras.models.load_model('/kaggle/input/model-output/kaggle/working/Nepali_News_Headline_Gen_Model')

In [None]:
# Render the loaded model's architecture (with tensor shapes and layer names) to a PNG.
# NOTE(review): the file name says "inf_encoder" although the model plotted here is
# the full loaded model - confirm the intended name.
plot_model(model, to_file='inf_encoder_nepali_news_headline_generation_model.png', show_shapes=True, show_layer_names=True)

## INFERENCE MODEL

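The training model cannot be used as-is for inference, because it was trained with teacher forcing: at every step the decoder was fed the ground-truth previous title word.<br>

At inference time no ground-truth title exists, so we rebuild separate encoder and decoder models from the trained layers and feed the decoder its own previous prediction, one step at a time.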

Let's take a look at the layers in our model.

In [None]:
# Layer names of the loaded model; the cells below fetch layers by these names.
[layer.name for layer in model.layers]

In [None]:
# Inference encoder: re-apply the trained encoder layers to the training
# model's encoder input so that the learned weights are reused.
inf_encoder_inputs = model.get_layer('Encoder_Input').input

inf_encoder_embedding_layer = model.get_layer('News_Embedding')
inf_encoder_embeddings = inf_encoder_embedding_layer(inf_encoder_inputs)

# Only the outputs of the first two LSTM layers are passed on; their final
# states are discarded.
inf_encoder_lstm1 = model.get_layer('Encoder_LSTM_1')
inf_encoder_lstm1_output, _, _ = inf_encoder_lstm1(inf_encoder_embeddings)

inf_encoder_lstm2 = model.get_layer('Encoder_LSTM_2')
inf_encoder_lstm2_output, _, _ = inf_encoder_lstm2(inf_encoder_lstm1_output)

# The third layer must be applied here. Applying Encoder_LSTM_2 a second time
# builds a valid graph but never uses Encoder_LSTM_3's trained weights, so the
# states handed to the decoder would not match the ones it was trained on.
inf_encoder_lstm3 = model.get_layer('Encoder_LSTM_3')
inf_encoder_lstm3_output, inf_state_h, inf_state_c = inf_encoder_lstm3(inf_encoder_lstm2_output)

# Outputs: the third layer's output plus its hidden and cell state, in that order.
inf_final_encoder_outputs = [inf_encoder_lstm3_output, inf_state_h, inf_state_c]

inf_encoder_model = Model(inf_encoder_inputs, inf_final_encoder_outputs)

In [None]:
# Inference decoder: reuses the trained decoder layers, but receives the encoder
# output and the LSTM states as explicit inputs so it can be called step by step.
inf_decoder_inputs = model.get_layer('Decoder_Input').input

# New input placeholders for the encoder output and the hidden/cell states that
# the decoder LSTM starts from.
inf_final_encoder_output = Input(shape=(CONSTANTS["max_news_length"], CONSTANTS["latent_dim"],), name='Encoder_Final_Output')
inf_final_encoder_state_h = Input(shape=(CONSTANTS["latent_dim"],), name='Encoder_Final_Hidden_State')
inf_final_encoder_state_c = Input(shape=(CONSTANTS["latent_dim"],), name='Encoder_Final_Cell_State')
inf_final_encoder_states = [inf_final_encoder_state_h, inf_final_encoder_state_c]

inf_decoder_embedding_layer = model.get_layer('Title_Embedding')
inf_decoder_embeddings = inf_decoder_embedding_layer(inf_decoder_inputs)

# Run the decoder LSTM from the supplied states and expose its new states.
inf_decoder_lstm = model.get_layer('Decoder_LSTM')
inf_decoder_lstm_output, inf_decoder_lstm_state_h, inf_decoder_lstm_state_c = inf_decoder_lstm(inf_decoder_embeddings, initial_state=inf_final_encoder_states)

inf_final_decoder_states = [inf_decoder_lstm_state_h, inf_decoder_lstm_state_c]

# Attention over the encoder output; the second return value is discarded
# (presumably the attention weights - confirm against the layer definition).
inf_bahdanau_attention_layer = model.get_layer('Bahdanau_Attention')
inf_context_vectors, _ = inf_bahdanau_attention_layer([inf_final_encoder_output, inf_decoder_lstm_output])

# Join the decoder output with the context vectors before the output layer.
inf_decoder_concat_layer = model.get_layer('Concatenate_Layer')
inf_decoder_concat_output = inf_decoder_concat_layer([inf_decoder_lstm_output, inf_context_vectors])

inf_dense_layer = model.get_layer('Softmax_Layer')
inf_final_decoder_output = inf_dense_layer(inf_decoder_concat_output)

# Inputs:  [decoder tokens, encoder output, state h, state c]
# Outputs: [dense-layer output, new state h, new state c]
inf_decoder_model  = Model([inf_decoder_inputs, inf_final_encoder_output] + inf_final_encoder_states, [inf_final_decoder_output] + inf_final_decoder_states)

In [None]:
import string
import re
from nltk.corpus import stopwords
# Nepali stop words, stored as a set because `preprocess_text` tests every word
# for membership; a list would be scanned linearly on each lookup.
nep_stopwrods = set(stopwords.words("nepali"))

In [None]:
# Character class for emoji and symbol ranges, compiled once when the cell runs
# rather than on every call.
_EMOJI_AND_SYMBOLS = re.compile("["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U00002702-\U000027B0"  # (repeated range, kept so the pattern is unchanged)
    "\U000024C2-\U0001F251"  # wide range: also covers CJK, Hangul and other scripts
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane character
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
                      "]+", re.UNICODE)


def remove_emojis_english_and_numbers(data):
    '''
    Strip emoji/symbol characters, ASCII digits and ASCII letters from `data`.

    Only the digits 0-9 and the letters a-z / A-Z are removed; Devanagari text,
    whitespace and punctuation pass through unchanged.

    :param str data: Text to clean.
    :return: The text with the characters above deleted (nothing is inserted).
    '''
    without_symbols = _EMOJI_AND_SYMBOLS.sub('', data)
    without_digits = re.sub('[0-9]+', '', without_symbols)
    without_letters = re.sub('[a-zA-Z]', '', without_digits)
    return without_letters

In [None]:
# Characters replaced by a space before any other cleaning step.
_SEPARATORS = ("-", "—", "‘", "’", "।", "–", "“", "”", "\n", "ः")
# Deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(data, is_title=False):
    '''
    Clean one title or article.

    Steps, in order: replace the separator characters with spaces, collapse
    whitespace, strip emoji / ASCII letters / ASCII digits, delete ASCII
    punctuation, collapse whitespace again and, for articles only, drop
    Nepali stop words.

    :param data: Text to clean. Float values (for example NaN from a missing
        cell) are returned unchanged.
    :param bool is_title: True for titles, which keep their stop words.
    :return: The cleaned text, words separated by single spaces.
    '''
    # isinstance also catches float subclasses such as numpy.float64.
    if isinstance(data, float):
        return data

    for separator in _SEPARATORS:
        data = data.replace(separator, " ")

    collapsed = " ".join(data.split())
    without_noise = remove_emojis_english_and_numbers(collapsed)
    words = without_noise.translate(_PUNCTUATION_TABLE).split()

    # Stop words are removed from article bodies only; titles keep them.
    if not is_title:
        words = [word for word in words if word not in nep_stopwrods]
    return " ".join(words)


# Quick demonstration on one sample sentence: original first, cleaned version below.
sample_text = "नेपाल क्रिकेट सङ्घ (क्यान) ले बन्द प्रशिक्षणका"
print(sample_text)
preprocess_text(sample_text)

## COMPUTE BLEU SCORE FOR THE TEST SET

In [None]:
def sequence_to_words(sequence, tokenizer, sos_token="sos", eos_token="eos"):
    '''
    Decode one sequence of word ids into text, without its start/end markers.

    The markers are removed only when they are actually present. Slicing off the
    first and last word unconditionally would discard real words whenever a
    sequence carries no markers (for example an encoded news article).

    :param sequence: Iterable of word ids; ids the tokenizer does not know are
        skipped by `tokenizer.sequences_to_texts`.
    :param tokenizer: Object providing `sequences_to_texts`.
    :param str sos_token: Start marker removed from the front if present.
    :param str eos_token: End marker removed from the back if present.
    :return: The decoded words joined by single spaces.
    '''
    words = tokenizer.sequences_to_texts([sequence])[0].split()
    if words and words[0] == sos_token:
        words = words[1:]
    if words and words[-1] == eos_token:
        words = words[:-1]
    return " ".join(words)

In [None]:
# Look up the id of one title-vocabulary word (raises KeyError if the word is unknown).
y_tokenizer.word_index['विनिमय']

In [None]:
# Reverse lookup: the title-vocabulary word stored under id 20.
y_tokenizer.index_word[20]

In [None]:
def greedy_summarize(news_sequence, X_tokenizer, y_tokenizer, max_title_length=21, sos_token="sos", eos_token="eos"):
    '''
    Generate a title for one encoded article by greedy decoding.

    At every step the most probable next word is taken and fed back into the
    decoder, until the end marker is produced or `max_title_length` steps ran.

    :param news_sequence: Encoded article passed to `inf_encoder_model.predict`
        (the callers in this notebook pass a single row reshaped to 2-D).
    :param X_tokenizer: Unused; kept so existing calls keep working.
    :param y_tokenizer: Title tokenizer providing `word_index` and `index_word`.
    :param int max_title_length: Maximum number of decoding steps.
    :param str sos_token: Start marker used as the first decoder input.
    :param str eos_token: End marker that stops decoding.
    :return: The generated words joined by single spaces (may be empty).
    '''
    # verbose=0: predict() runs once per decoding step, and its default
    # progress bar would flood the notebook output.
    encoder_output, state_h, state_c = inf_encoder_model.predict(news_sequence, verbose=0)

    title_words = []

    # Decoder input of length 1, starting with the start marker.
    decoder_input = np.zeros((1, 1))
    decoder_input[0, 0] = y_tokenizer.word_index[sos_token]

    for _ in range(max_title_length):
        # The new states replace the old ones on every step, including the
        # padding steps skipped below.
        decoder_output, state_h, state_c = inf_decoder_model.predict(
            [decoder_input, encoder_output, state_h, state_c], verbose=0
        )
        next_index = int(np.argmax(decoder_output[0][-1]))

        # Id 0 has no entry in index_word; skip it and keep the same decoder input.
        if next_index == 0:
            continue

        next_word = y_tokenizer.index_word[next_index]
        if next_word == eos_token:
            break

        title_words.append(next_word)

        # Feed the chosen word back in as the next decoder input.
        decoder_input = np.zeros((1, 1))
        decoder_input[0, 0] = next_index

    return " ".join(title_words)

In [None]:
def summarize_with_beam_search(news_sequence, X_tokenizer, y_tokenizer, beam_width=3, alpha=0.8, sos_token='sos', eos_token='eos', max_title_length=21):
    '''
    Generate a title for one encoded article with beam search.

    Every hypothesis carries its own decoder states. A hypothesis that emits the
    end marker is set aside as finished and frees its slot in the beam. Decoding
    stops once `beam_width` hypotheses have finished, no live hypothesis remains,
    or `max_title_length` steps have run. The winner is the hypothesis with the
    best length-normalised score: log-probability / (generated length ** alpha).

    :param news_sequence: Encoded article passed to `inf_encoder_model.predict`
        (the callers in this notebook pass a single row reshaped to 2-D).
    :param X_tokenizer: Unused; kept so existing calls keep working.
    :param y_tokenizer: Title tokenizer providing `word_index` and `index_word`.
    :param int beam_width: Number of hypotheses kept per step.
    :param float alpha: Exponent of the length normalisation.
    :param str sos_token: Start marker that seeds every hypothesis.
    :param str eos_token: End marker that finishes a hypothesis.
    :param int max_title_length: Maximum number of decoding steps.
    :return: The best title as words joined by single spaces (may be empty).
    '''
    sos_index = y_tokenizer.word_index[sos_token]
    eos_index = y_tokenizer.word_index[eos_token]

    # verbose=0: predict() is called many times per title.
    encoder_output, initial_h, initial_c = inf_encoder_model.predict(news_sequence, verbose=0)

    def normalized_score(hypothesis):
        # Length counts every generated id (the end marker included) but not
        # the start marker; never below 1 to avoid dividing by zero.
        generated = max(len(hypothesis[1]) - 1, 1)
        return hypothesis[0] / (generated ** alpha)

    # A hypothesis is (log-probability, ids, state_h, state_c).
    live = [(0.0, [sos_index], initial_h, initial_c)]
    finished = []

    for _ in range(max_title_length):
        open_slots = beam_width - len(finished)
        if open_slots <= 0 or not live:
            break

        candidates = []
        for score, token_ids, state_h, state_c in live:
            decoder_input = np.zeros((1, 1))
            decoder_input[0, 0] = token_ids[-1]

            # Continue from this hypothesis' own states, not from whichever
            # hypothesis happened to be expanded last.
            decoder_output, next_h, next_c = inf_decoder_model.predict(
                [decoder_input, encoder_output, state_h, state_c], verbose=0
            )

            with np.errstate(divide="ignore"):  # log(0) -> -inf without a warning
                log_probs = np.log(np.asarray(decoder_output[0, -1, :], dtype=np.float64))
            # Id 0 is the padding id and maps to no word; never select it.
            log_probs[0] = -np.inf

            # The global top-k can hold at most k continuations of a single
            # hypothesis, so only its k best ids need to be considered.
            top_k = min(open_slots, log_probs.shape[0])
            for token_index in np.argpartition(log_probs, -top_k)[-top_k:]:
                token_log_prob = float(log_probs[token_index])
                if np.isfinite(token_log_prob):
                    candidates.append((score + token_log_prob, token_ids + [int(token_index)], next_h, next_c))

        candidates.sort(key=lambda hypothesis: hypothesis[0], reverse=True)

        live = []
        for hypothesis in candidates[:open_slots]:
            if hypothesis[1][-1] == eos_index:
                finished.append(hypothesis)
            else:
                live.append(hypothesis)

    # Hypotheses cut off by the step limit compete with the finished ones.
    pool = finished + live
    if not pool:
        return ""
    best_ids = max(pool, key=normalized_score)[1]

    # Drop the start marker (always first) and the end marker, then map ids to words.
    words = [
        y_tokenizer.index_word[token_id]
        for token_id in best_ids[1:]
        if token_id != eos_index and token_id in y_tokenizer.index_word
    ]
    return " ".join(words)

In [None]:
# def summarize_with_beam_search_ap_rp_pruning(news_sequence, X_tokenizer, y_tokenizer, beam_width=3, alpha=0.8, sos_token='sostok', eos_token='eostok', rp=0.5, ap=2.5, max_title_length=21):

#     encoder_output, h_t_minus_1, c_t_minus_1 = inf_encoder_model.predict(news_sequence)

#     # Initialize the beam of sequences.
#     # Initially we start with the SOS_TOKEN
#     beam = [(0.0, [y_tokenizer.word_index[sos_token]])]

#     # Decode the sequence one token at a time.
#     for i in range(max_title_length):

#         # Expand the beam.
#         expanded_beam = []
#         for log_probability_score, sequence in beam:
#             if (sequence[-1] != y_tokenizer.word_index[eos_token]):
                
#                 # Generate empty target sequence of length 1.
#                 word_n_minus_1 = np.zeros((1,1))
    
#                 # Populate the first word of target sequence with the start word.
#                 word_n_minus_1[0, 0] = sequence[-1]
                
#                 decoder_output, h_t, c_t = inf_decoder_model.predict([word_n_minus_1, encoder_output, h_t_minus_1, c_t_minus_1])

#                 prob_dist = decoder_output[0, -1, :]

#                 # Generate all possible next tokens for the sequence.
#                 for word_n_index in range(len(prob_dist)):
#                     expanded_beam.append((log_probability_score + np.log(prob_dist[word_n_index]), sequence + [word_n_index]))

#         # Prune the beam to get the top-K
#         beam = sorted(expanded_beam, key=lambda x: x[0], reverse=True)[:beam_width]
        
#         # Check if all of the top-K sequences have encountered the EOS token.
#         # Or all of the top-K sequences have length > max_title_length
#         if all(sequence[-1] == y_tokenizer.word_index[eos_token] for prob, sequence in beam):
#             ''' This section indicates the top-K sequences has been generated '''
#             ''' Finally, we perform length normalization on the log proability score of each sequence before exiting '''
#             for i in range(len(beam)):
#                 beam[i][1] = beam[i][1][1:] # Remove the SOS_TOKEN from the start
#                 beam[i][0] /= (len(beam[i][1])**alpha) # Perform length normalization       
#             beam = sorted(beam, key=lambda x: x[0], reverse=True)
#             break
        
#         ''' Prune the beam using Relative Threshold Pruning and Absolute Threshold Pruning '''
#         max_candidate_score = beam[0][0]
        
#         ''' Relative Threshold Pruning '''
#         beam = [(score, sequence) for score, sequence in beam if score > ((1+rp) * max_candidate_score)]
        
#         ''' Absolute Threshold Pruning '''
#         beam = [(score, sequence) for score, sequence in beam if score > (max_candidate_score - ap)]
        
#         print(len(beam))
            
#         # Update the internal states for the next time step t+1.
#         h_t_minus_1, c_t_minus_1 = h_t, c_t

#     # Return the sequence with the highest score from the beam as sentence.
#     return sequence_to_words(beam[0][1], y_tokenizer)

In [None]:
# Containers filled by the decoding cells further down.
predicted_titles_greedy = []
predicted_titles_beam = []
predicted_titles_beam_prune = []

# Decode (at most) the first 10000 padded test rows back to text: the article
# from its own tokenizer and the matching reference title from the title tokenizer.
_evaluation_rows = X_test_pad_seq[:10000]
news = [sequence_to_words(article_ids, tokenizer=X_tokenizer) for article_ids in _evaluation_rows]
reference_titles = [
    sequence_to_words(y_test_padded_seq[row], tokenizer=y_tokenizer)
    for row in range(len(_evaluation_rows))
]

In [None]:
# Preview the first rows of the test targets.
y_test.head()

In [None]:
# Raw (sos/eos-wrapped) text of the test title at position 5.
list(y_test["title_cut"])[5]

In [None]:
# Decode the padded ids of the same title and drop the first and last decoded word.
y_tokenizer.sequences_to_texts([y_test_padded_seq[5]])[0].split()[1:-1]

In [None]:
# Padded id sequence of the test title at position 5.
y_test_padded_seq[5]

In [None]:
# Id of one title-vocabulary word (raises KeyError if the word is unknown).
y_tokenizer.word_index["घट्दै"]

In [None]:
import time

In [None]:
# Wall-clock decoding time in minutes, keyed by decoding strategy.
time_taken = {}

In [None]:
# GREEDY DECODING
greedy_start = time.time()
# The list is rebuilt from scratch so that re-running this cell cannot append a
# second batch of predictions and misalign them with `reference_titles`.
predicted_titles_greedy = [
    greedy_summarize(each_seq.reshape(1, CONSTANTS["max_news_length"]), X_tokenizer, y_tokenizer)
    for each_seq in X_test_pad_seq[:10000]
]
greedy_end = time.time()
time_taken["GREEDY"] = (greedy_end - greedy_start) / 60  # In minutes

In [None]:
# First five (reference, greedy prediction) pairs for a visual check.
list(zip(reference_titles, predicted_titles_greedy))[:5]

In [None]:
# BEAM SEARCH DECODING
beam_start = time.time()
# The list is rebuilt from scratch so that re-running this cell cannot append a
# second batch of predictions and misalign them with `reference_titles`.
predicted_titles_beam = [
    summarize_with_beam_search(each_seq.reshape(1, CONSTANTS["max_news_length"]), X_tokenizer, y_tokenizer)
    for each_seq in X_test_pad_seq[:10000]
]
beam_end = time.time()
time_taken["BEAM"] = (beam_end - beam_start) / 60  # In minutes

In [None]:
# First five (reference, beam-search prediction) pairs for a visual check.
list(zip(reference_titles, predicted_titles_beam))[:5]

In [None]:
# ''' BEAM SEARCH DECODING WITH PRUNING '''
# beam_prune_start = time.time()
# for index, each_seq in enumerate(X_train_pad_seq[:10]):
#     predicted_titles_beam_prune.append(summarize_with_beam_search_ap_rp_pruning(each_seq.reshape(1,max_news_length), X_tokenizer, y_tokenizer))
# beam_prune_end = time.time()
# time_taken["BEAM PRUNE"] = (beam_prune_end - beam_prune_start) / 60 # In minutes    

In [None]:
# list(zip(reference_titles, predicted_titles_beam_prune))

In [None]:
# Decoding time per strategy, in minutes.
time_taken

## COMPUTE BLEU AND ROUGE SCORES FOR GENERATED TITLES

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
# Smoothing method 4 from NLTK's SmoothingFunction, applied to every BLEU computation below.
smoothie = SmoothingFunction().method4

def compute_bleu_metric(reference: list, predicted: list) -> float:
    '''
    Compute the corpus-level BLEU score of predicted sentences against references.

    Each prediction is compared with exactly one reference, matched by position.
    1- to 4-gram precisions are weighted equally.

    :param list(str) reference: Reference sentences. Eg: ["Hi I am jane doe", "i am from italy"]
    :param list(str) predicted: Predicted sentences. Eg: ["i jane doe", "i was born and raised in sicily"]
    :return: The BLEU score for the predicted sentences.
    '''
    # corpus_bleu expects a list of references per prediction, hence the extra nesting.
    tokenized_references = [[sentence.split()] for sentence in reference]
    tokenized_predictions = [sentence.split() for sentence in predicted]
    return corpus_bleu(
        tokenized_references,
        tokenized_predictions,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothie,
    )

In [None]:
# BLEU score per decoding strategy.
bleu_scores = {}

In [None]:
# Number of beam-search predictions; should match the number of reference titles.
np.array(predicted_titles_beam).shape

In [None]:
# Corpus-level BLEU of each decoding strategy against the same reference titles.
for strategy, predictions in (("GREEDY", predicted_titles_greedy), ("BEAM", predicted_titles_beam)):
    bleu_scores[strategy] = compute_bleu_metric(reference_titles, predictions)
# bleu_scores["BEAM PRUNE"] = compute_bleu_metric(reference_titles, predicted_titles_beam_prune)

In [None]:
# BLEU score per decoding strategy.
bleu_scores

In [None]:
!pip install rouge

In [None]:
from rouge import Rouge
rouge = Rouge()

def compute_rouge_metric(reference: list, predicted: list) -> dict:
    '''
    Compute the average ROUGE-1, ROUGE-2 and ROUGE-L F-scores of predictions against references.

    Pairs in which the prediction or the reference is empty score 0 for every
    metric instead of being handed to the scorer, which rejects empty text.
    A decoder can legitimately produce an empty title (for example when the
    end marker is predicted first).

    :param list(str) reference: Reference sentences. Eg: ["Hi I am jane doe", "i am from italy"]
    :param list(str) predicted: Predicted sentences. Eg: ["i jane doe", "i was born and raised in sicily"]
    :return: Dict with the keys "Rouge-1", "Rouge-2" and "Rouge-L".
    :raises ValueError: If the two lists differ in length.
    '''
    if len(reference) != len(predicted):
        raise ValueError("reference and predicted must contain the same number of sentences")

    scorable = [
        (hypothesis, target)
        for hypothesis, target in zip(predicted, reference)
        if hypothesis.strip() and target.strip()
    ]
    if not scorable:
        return {"Rouge-1": 0.0, "Rouge-2": 0.0, "Rouge-L": 0.0}

    hypotheses = [hypothesis for hypothesis, _ in scorable]
    targets = [target for _, target in scorable]
    scores = rouge.get_scores(hypotheses, targets, avg=True)

    # Rescale the mean over scorable pairs to a mean over all pairs, so that each
    # skipped pair counts as 0. The factor is exactly 1.0 when nothing was skipped.
    share = len(scorable) / len(predicted)
    return {
        "Rouge-1": scores['rouge-1']['f'] * share,
        "Rouge-2": scores['rouge-2']['f'] * share,
        "Rouge-L": scores['rouge-l']['f'] * share
    }

In [None]:
# ROUGE F-scores per decoding strategy.
rouge_scores = {}

In [None]:
# ROUGE F-scores of each decoding strategy against the same reference titles.
for strategy, predictions in (("GREEDY", predicted_titles_greedy), ("BEAM", predicted_titles_beam)):
    rouge_scores[strategy] = compute_rouge_metric(reference_titles, predictions)
# rouge_scores["BEAM PRUNE"] = compute_rouge_metric(reference_titles, predicted_titles_beam_prune)

In [None]:
# ROUGE F-scores per decoding strategy.
rouge_scores