# Importing the basic libaries

In [265]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Accessing the dataset from google drive

In [266]:
train_path = "/kaggle/input/nepali-summarization-set/full_cleaned_train.csv"
val_path = "/kaggle/input/nepali-summarization-set/full_cleaned_val.csv"
test_path = "/kaggle/input/nepali-summarization-set/full_cleaned_test.csv"

# Reading the dataset

In [267]:
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)
original_train_size = len(df_train)
original_val_size = len(df_val)
original_test_size = len(df_test)

In [268]:
df_train.head()

Unnamed: 0,title,news_stopwords,news_no_stopwords,category
0,चिया बगानले कर नतिरेपछि सङ्घीय सरकारको ध्यानाक...,इलाम र झापाका सरकारी चिया बगान लिजमा लिएको पक्...,इलाम झापाका सरकारी चिया बगान लिजमा लिएको पक्षल...,समाज
1,बिजुलीको पोल सार्न नयाँ मापदण्ड तयार,नेपाल विद्युत् प्राधिकरणले ग्राहकलाई विद्युत् ...,नेपाल विद्युत् प्राधिकरणले ग्राहकलाई विद्युत् ...,अर्थ / वाणिज्य
2,नेमारविनै पीएसजी विजयी,लिग लिडर स्टार पेरिस सेन्ट जर्मन ले स्ट्राइकर ...,लिग लिडर स्टार पेरिस सेन्ट जर्मन स्ट्राइकर नेम...,खेलकुद
3,कोरोनाको चुनौतीलाई सम्बोधन गर्न जापानद्वारा ७०...,सरकार र सत्तारुढ पार्टीको बैठकमा सम्बोधन गर्दै...,सरकार सत्तारुढ पार्टीको बैठकमा सम्बोधन प्रधानम...,अर्थ / वाणिज्य
4,युक्रेन द्वन्द्वको असर युरोपमा मात्र सीमित छैन...,युक्रेन द्वन्द्वको असर युरोपमा मात्र सीमित नरह...,युक्रेन द्वन्द्वको असर युरोपमा सीमित नरहेको भन...,विश्व


### Remove all entries where title has numbers

In [269]:
df_train = df_train[~df_train["title"].str.contains("०")]
df_train = df_train[~df_train["title"].str.contains("१")]
df_train = df_train[~df_train["title"].str.contains("२")]
df_train = df_train[~df_train["title"].str.contains("३")]
df_train = df_train[~df_train["title"].str.contains("४")]
df_train = df_train[~df_train["title"].str.contains("५")]
df_train = df_train[~df_train["title"].str.contains("६")]
df_train = df_train[~df_train["title"].str.contains("७")]
df_train = df_train[~df_train["title"].str.contains("८")]
df_train = df_train[~df_train["title"].str.contains("९")]
len(df_train)

789

In [270]:
df_val = df_val[~df_val["title"].str.contains("०")]
df_val = df_val[~df_val["title"].str.contains("१")]
df_val = df_val[~df_val["title"].str.contains("२")]
df_val = df_val[~df_val["title"].str.contains("३")]
df_val = df_val[~df_val["title"].str.contains("४")]
df_val = df_val[~df_val["title"].str.contains("५")]
df_val = df_val[~df_val["title"].str.contains("६")]
df_val = df_val[~df_val["title"].str.contains("७")]
df_val = df_val[~df_val["title"].str.contains("८")]
df_val = df_val[~df_val["title"].str.contains("९")]
len(df_val)

151

In [271]:
df_test = df_test[~df_test["title"].str.contains("०")]
df_test = df_test[~df_test["title"].str.contains("१")]
df_test = df_test[~df_test["title"].str.contains("२")]
df_test = df_test[~df_test["title"].str.contains("३")]
df_test = df_test[~df_test["title"].str.contains("४")]
df_test = df_test[~df_test["title"].str.contains("५")]
df_test = df_test[~df_test["title"].str.contains("६")]
df_test = df_test[~df_test["title"].str.contains("७")]
df_test = df_test[~df_test["title"].str.contains("८")]
df_test = df_test[~df_test["title"].str.contains("९")]
len(df_test)

163

- The dataset has already been made free of english words, arabic numerals, emojis, and special characters and null values.<br>
- Further, in one instance of the cleaned articles, stopwords has been removed while in another instance the stopwords have been kept intact in the news articles.
- The headlines however are already very short and the stopwords are integral in keeping the headline concise so stopwords are kept as they were in headlines.

In [272]:
print(df_train.isnull().sum())
print(df_val.isnull().sum())
print(df_test.isnull().sum())

title                0
news_stopwords       0
news_no_stopwords    0
category             0
dtype: int64
title                0
news_stopwords       0
news_no_stopwords    0
category             0
dtype: int64
title                0
news_stopwords       0
news_no_stopwords    0
category             0
dtype: int64


There are no null values in train, validation and test.

#### Check for any data with empty title or news

In [273]:
df_train[df_train["title"] == ""]

Unnamed: 0,title,news_stopwords,news_no_stopwords,category


In [274]:
df_val[df_val["title"] == ""]

Unnamed: 0,title,news_stopwords,news_no_stopwords,category


In [275]:
df_test[df_test["title"] == ""]

Unnamed: 0,title,news_stopwords,news_no_stopwords,category


When cleaning the data, to preserve words like दु:ख:, पुन:, I didn't remove any : charcters.<br>
However, using the : might affect the performance of a Pointer Generator Network since "गर्नेछु" and "गर्नेछु:" are not treated the same.<br>
So lets remove the : character too

In [276]:
def remove_colon(text):
    text = "".join([char for char in text if char != ":"])
    return text

In [277]:
df_train["news_stopwords"] = df_train["news_stopwords"].apply(remove_colon)
df_train["news_no_stopwords"] = df_train["news_no_stopwords"].apply(remove_colon)
df_train["title"] = df_train["title"].apply(remove_colon)
df_val["news_stopwords"] = df_val["news_stopwords"].apply(remove_colon)
df_val["news_no_stopwords"] = df_val["news_no_stopwords"].apply(remove_colon)
df_val["title"] = df_val["title"].apply(remove_colon)
df_test["news_stopwords"] = df_test["news_stopwords"].apply(remove_colon)
df_test["news_no_stopwords"] = df_test["news_no_stopwords"].apply(remove_colon)
df_test["title"] = df_test["title"].apply(remove_colon)

Now that we have completed our data preprocessing part, lets analyze the length of news and titles

This is necessary because both news and titles are of variable length. Some are too big while some are too small. <br>

Our network only accepts fixed sized inputs. So we need to create a threshold of how many words to take from each news and heading to train our model.<br>

#### Computing the length of news and titles

We will consider the stopwords in the news articles when creating the model

In [278]:
df_train["title_length"] = df_train["title"].apply(lambda x: len(x.split()))
df_train["news_length"] = df_train["news_stopwords"].apply(lambda x: len(x.split()))

In [279]:
df_val["title_length"] = df_val["title"].apply(lambda x: len(x.split()))
df_val["news_length"] = df_val["news_stopwords"].apply(lambda x: len(x.split()))

In [280]:
df_test["title_length"] = df_test["title"].apply(lambda x: len(x.split()))
df_test["news_length"] = df_test["news_stopwords"].apply(lambda x: len(x.split()))

All the articles have different length of news and headlines.<br>
However, when training our model, we need all instances to have same length news.<br>
So lets acheive this by analyzing the news and headlines length

### Analyzing titles

In [281]:
df_train["title_length"].describe()

count    789.000000
mean       6.974651
std        2.415447
min        2.000000
25%        5.000000
50%        7.000000
75%        9.000000
max       14.000000
Name: title_length, dtype: float64

In [282]:
df_val["title_length"].describe()

count    151.000000
mean       7.006623
std        2.585850
min        3.000000
25%        5.000000
50%        7.000000
75%        9.000000
max       15.000000
Name: title_length, dtype: float64

In [283]:
df_test["title_length"].describe()

count    163.000000
mean       7.165644
std        2.509891
min        2.000000
25%        5.000000
50%        7.000000
75%        9.000000
max       15.000000
Name: title_length, dtype: float64

For the title, we will consider the length of the longest title in all of our dataset i.e 19

In [284]:
CONSTANTS = {}
CONSTANTS["max_title_length"] = 19

In [285]:
len(df_train[df_train["title_length"] == 1])

0

In [286]:
len(df_train[df_train["title_length"] == 2])

2

Lets remove the articles where the title only consist of at most 2 words

In [287]:
df_train = df_train[df_train["title_length"] > 2]
df_val = df_val[df_val["title_length"] > 2]
df_test = df_test[df_test["title_length"] > 2]

#### Analyzing news

In [288]:
df_train["news_length"].describe()

count     787.000000
mean      216.780178
std       183.554718
min        24.000000
25%       107.000000
50%       165.000000
75%       268.000000
max      2601.000000
Name: news_length, dtype: float64

In [289]:
df_val["news_length"].describe()

count     151.000000
mean      214.788079
std       170.165003
min        44.000000
25%       107.500000
50%       164.000000
75%       285.500000
max      1358.000000
Name: news_length, dtype: float64

In [290]:
df_test["news_length"].describe()

count     161.000000
mean      228.229814
std       182.270481
min        39.000000
25%       110.000000
50%       175.000000
75%       285.000000
max      1192.000000
Name: news_length, dtype: float64

Now lets remove very short news(length less than 30)

In [291]:
print(len(df_train[df_train["news_length"] < 30]))

2


Lets check the % of articles that are less than 30

In [292]:
df_train = df_train[df_train["news_length"] >= 30]
df_val = df_val[df_val["news_length"] >= 30]
df_test = df_test[df_test["news_length"] >= 30]

For news, we will use the first 512 words to predict the title

In [293]:
len(df_train[df_train["news_length"] > 512]), len(df_val[df_val["news_length"] > 512]), len(df_test[df_test["news_length"] > 512])

(53, 6, 13)

In [294]:
df_train = df_train[df_train["news_length"] <= 512]
df_val = df_val[df_val["news_length"] <= 512]
df_test = df_test[df_test["news_length"] <= 512]

In [295]:
print("Size of original training dataset:", original_train_size)
print("Size of training dataset after cleaning:", len(df_train))
print("% of dataset removed:", (original_train_size - len(df_train)) / original_train_size * 100)
print("% of datset remaining:", len(df_train) / original_train_size * 100)

Size of original training dataset: 1000
Size of training dataset after cleaning: 732
% of dataset removed: 26.8
% of datset remaining: 73.2


In [296]:
print("Size of original validation dataset:", original_val_size)
print("Size of validation dataset after cleaning:", len(df_val))
print("% of dataset removed:", (original_val_size - len(df_val)) / original_val_size * 100)
print("% of datset remaining:", len(df_val) / original_val_size * 100)

Size of original validation dataset: 200
Size of validation dataset after cleaning: 145
% of dataset removed: 27.500000000000004
% of datset remaining: 72.5


In [297]:
print("Size of original testing dataset:", original_test_size)
print("Size of testing dataset after cleaning:", len(df_test))
print("% of dataset removed:", (original_test_size - len(df_test)) / original_test_size * 100)
print("% of datset remaining:", len(df_test) / original_test_size * 100)

Size of original testing dataset: 200
Size of testing dataset after cleaning: 148
% of dataset removed: 26.0
% of datset remaining: 74.0


In [298]:
CONSTANTS["max_news_length"] = 512

#### Now we remove the extra part of the news and headlines so that all of them have length of $<=$ their cooresponding allowed length

In [299]:
df_train["title_cut"] = df_train["title"].apply(lambda x: " ".join(x.split()[:CONSTANTS["max_title_length"]]))
df_train["news_cut"] = df_train["news_stopwords"].apply(lambda x: " ".join(x.split()[:CONSTANTS["max_news_length"]]))

In [300]:
df_val["title_cut"] = df_val["title"].apply(lambda x: " ".join(x.split()[:CONSTANTS["max_title_length"]]))
df_val["news_cut"] = df_val["news_stopwords"].apply(lambda x: " ".join(x.split()[:CONSTANTS["max_news_length"]]))

In [301]:
df_test["title_cut"] = df_test["title"].apply(lambda x: " ".join(x.split()[:CONSTANTS["max_title_length"]]))
df_test["news_cut"] = df_test["news_stopwords"].apply(lambda x: " ".join(x.split()[:CONSTANTS["max_news_length"]]))

##### Now lets add the start and end token to our headlines

In [302]:
df_train["title_cut"] = df_train["title_cut"].apply(lambda x: 'sos ' + x + ' eos')

In [303]:
df_val["title_cut"] = df_val["title_cut"].apply(lambda x: 'sos ' + x + ' eos')

In [304]:
df_test["title_cut"] = df_test["title_cut"].apply(lambda x: 'sos ' + x + ' eos')

In [305]:
df_train.head()

Unnamed: 0,title,news_stopwords,news_no_stopwords,category,title_length,news_length,title_cut,news_cut
0,चिया बगानले कर नतिरेपछि सङ्घीय सरकारको ध्यानाक...,इलाम र झापाका सरकारी चिया बगान लिजमा लिएको पक्...,इलाम झापाका सरकारी चिया बगान लिजमा लिएको पक्षल...,समाज,9,282,sos चिया बगानले कर नतिरेपछि सङ्घीय सरकारको ध्य...,इलाम र झापाका सरकारी चिया बगान लिजमा लिएको पक्...
1,बिजुलीको पोल सार्न नयाँ मापदण्ड तयार,नेपाल विद्युत् प्राधिकरणले ग्राहकलाई विद्युत् ...,नेपाल विद्युत् प्राधिकरणले ग्राहकलाई विद्युत् ...,अर्थ / वाणिज्य,6,328,sos बिजुलीको पोल सार्न नयाँ मापदण्ड तयार eos,नेपाल विद्युत् प्राधिकरणले ग्राहकलाई विद्युत् ...
2,नेमारविनै पीएसजी विजयी,लिग लिडर स्टार पेरिस सेन्ट जर्मन ले स्ट्राइकर ...,लिग लिडर स्टार पेरिस सेन्ट जर्मन स्ट्राइकर नेम...,खेलकुद,3,130,sos नेमारविनै पीएसजी विजयी eos,लिग लिडर स्टार पेरिस सेन्ट जर्मन ले स्ट्राइकर ...
4,युक्रेन द्वन्द्वको असर युरोपमा मात्र सीमित छैन...,युक्रेन द्वन्द्वको असर युरोपमा मात्र सीमित नरह...,युक्रेन द्वन्द्वको असर युरोपमा सीमित नरहेको भन...,विश्व,8,147,sos युक्रेन द्वन्द्वको असर युरोपमा मात्र सीमित...,युक्रेन द्वन्द्वको असर युरोपमा मात्र सीमित नरह...
5,राष्ट्रपतिको स्वास्थ्यमा क्रमिक सुधार,भारतको नयाँ दिल्लीमा उपचाररत राष्ट्रपति रामचन्...,भारतको दिल्लीमा उपचाररत राष्ट्रपति रामचन्द्र प...,स्वास्थ्य,4,84,sos राष्ट्रपतिको स्वास्थ्यमा क्रमिक सुधार eos,भारतको नयाँ दिल्लीमा उपचाररत राष्ट्रपति रामचन्...


#### Now lets build our vocabulary  and convert our words to integers

In [306]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#### First we build the vocabulary for the news

In [307]:
# CONSTANTS["news_voc_size"] = 100000

In [308]:
# X_tokenizer = Tokenizer(oov_token="unk", num_words=100000)
X_tokenizer = Tokenizer(oov_token="unk")
X_tokenizer.fit_on_texts(list(df_train["news_cut"]))

In [309]:
CONSTANTS["news_voc_size"] = len(X_tokenizer.word_index) + 1

In [310]:
X_tokenizer.texts_to_sequences(["माइसंसारलाई पठाउनुस्"])
X_tokenizer.sequences_to_texts([[1, 1]])

['unk unk']

Now we convert our words to integers

In [311]:
X_train_seq = X_tokenizer.texts_to_sequences(df_train["news_cut"])
X_val_seq = X_tokenizer.texts_to_sequences(df_val["news_cut"])
X_test_seq = X_tokenizer.texts_to_sequences(df_test["news_cut"])

Finally we will pad our sequences so that all the inputs are of same length

In [312]:
X_train_pad_seq = pad_sequences(X_train_seq, maxlen=CONSTANTS["max_news_length"], padding='post')
X_val_pad_seq = pad_sequences(X_val_seq, maxlen=CONSTANTS["max_news_length"], padding='post')
X_test_pad_seq = pad_sequences(X_test_seq, maxlen=CONSTANTS["max_news_length"], padding='post')

#### Now for the headlines

We will perform the same operations as the news. However, we will convert our sequences to appropriate format for teacher forcing before padding them.

In [313]:
# CONSTANTS["title_voc_size"] = 50000

In [314]:
y_tokenizer = Tokenizer(oov_token="unk")
y_tokenizer.fit_on_texts(df_train["title_cut"])

In [315]:
CONSTANTS["title_voc_size"] = len(y_tokenizer.word_index) + 1

In [316]:
import copy

In [317]:
y_tokenizer.texts_to_sequences(["a sos eos"])

[[1, 2, 3]]

In [318]:
X_train_pad_seq_extended = []
y_train_pad_seq_extended = []

for news, title in zip(df_train["news_cut"], df_train["title_cut"]):
    y_tok_copy_train = copy.deepcopy(y_tokenizer)
    y_tok_copy_train.fit_on_texts([news])
    X_train_pad_seq_extended.append(pad_sequences(y_tok_copy_train.texts_to_sequences([news]), maxlen=CONSTANTS["max_news_length"], padding='post'))
    y_train_pad_seq_extended.append(pad_sequences(y_tok_copy_train.texts_to_sequences([title]), maxlen=CONSTANTS["max_title_length"], padding='post'))
    
X_train_pad_seq_extended = np.squeeze(np.array(X_train_pad_seq_extended), 1)
y_train_pad_seq_extended = np.squeeze(np.array(y_train_pad_seq_extended), 1)

In [319]:
X_train_pad_seq_extended.shape, y_train_pad_seq_extended.shape

((732, 512), (732, 19))

In [320]:
X_val_pad_seq_extended = []
y_val_pad_seq_extended = []

for news, title in zip(df_val["news_cut"], df_val["title_cut"]):
    y_tok_copy_val = copy.deepcopy(y_tokenizer)
    y_tok_copy_val.fit_on_texts([news])
    X_val_pad_seq_extended.append(pad_sequences(y_tok_copy_val.texts_to_sequences([news]), maxlen=CONSTANTS["max_news_length"], padding='post'))
    y_val_pad_seq_extended.append(pad_sequences(y_tok_copy_val.texts_to_sequences([title]), maxlen=CONSTANTS["max_title_length"], padding='post'))

X_val_pad_seq_extended = np.squeeze(np.array(X_val_pad_seq_extended), 1)
y_val_pad_seq_extended = np.squeeze(np.array(y_val_pad_seq_extended), 1)

In [321]:
X_val_pad_seq_extended.shape, y_val_pad_seq_extended.shape

((145, 512), (145, 19))

In [322]:
X_test_pad_seq_extended = []
y_test_pad_seq_extended = []

for news, title in zip(df_test["news_cut"], df_test["title_cut"]):
    y_tok_copy_test = copy.deepcopy(y_tokenizer)
    y_tok_copy_test.fit_on_texts([news])
    X_test_pad_seq_extended.append(pad_sequences(y_tok_copy_test.texts_to_sequences([news]), maxlen=CONSTANTS["max_news_length"], padding='post'))
    y_test_pad_seq_extended.append(pad_sequences(y_tok_copy_test.texts_to_sequences([title]), maxlen=CONSTANTS["max_title_length"], padding='post'))
    
X_test_pad_seq_extended = np.squeeze(np.array(X_test_pad_seq_extended), 1)
y_test_pad_seq_extended = np.squeeze(np.array(y_test_pad_seq_extended), 1)

In [323]:
X_test_pad_seq_extended.shape, y_test_pad_seq_extended.shape

((148, 512), (148, 19))

In [324]:
y_train_seq = y_tokenizer.texts_to_sequences(df_train["title_cut"])
y_val_seq = y_tokenizer.texts_to_sequences(df_val["title_cut"])
y_test_seq = y_tokenizer.texts_to_sequences(df_test["title_cut"])

In [325]:
y_train_padded_seq = pad_sequences(y_train_seq,  maxlen=CONSTANTS["max_title_length"], padding='post')
y_val_padded_seq = pad_sequences(y_val_seq, maxlen=CONSTANTS["max_title_length"], padding='post')
y_test_padded_seq = pad_sequences(y_test_seq, maxlen=CONSTANTS["max_title_length"], padding='post')

## Teacher Forcing

Now we will convert our sequences to appropriate format for teacher forcing

In [326]:
y_train_input = np.array([seq[:-1] for seq in y_train_padded_seq])
y_train_target = np.array([seq[1:] for seq in y_train_pad_seq_extended])
y_val_input = np.array([seq[:-1] for seq in y_val_padded_seq])
y_val_target = np.array([seq[1:] for seq in y_val_pad_seq_extended])
y_test_input = np.array([seq[:-1] for seq in y_test_padded_seq])
y_test_target = np.array([seq[1:] for seq in y_test_pad_seq_extended])

In [327]:
# y_train_input = np.array([seq[:-1] for seq in y_train_padded_seq])
# y_train_target = np.array([seq[1:] for seq in y_train_padded_seq])
# y_val_input = np.array([seq[:-1] for seq in y_val_padded_seq])
# y_val_target = np.array([seq[1:] for seq in y_val_padded_seq])
# y_test_input = np.array([seq[:-1] for seq in y_test_padded_seq])
# y_test_target = np.array([seq[1:] for seq in y_test_padded_seq])

In [328]:
y_train_input.shape

(732, 18)

In [329]:
y_train_target.shape

(732, 18)

In [330]:
y_train_target.reshape(y_train_target.shape[0],y_train_target.shape[1], 1).shape

(732, 18, 1)

In [331]:
y_train_target = y_train_target.reshape(y_train_target.shape[0],y_train_target.shape[1], 1)
y_test_target = y_test_target.reshape(y_test_target.shape[0],y_test_target.shape[1], 1)
y_val_target = y_val_target.reshape(y_val_target.shape[0],y_val_target.shape[1], 1)

In [332]:
y_train_target.shape

(732, 18, 1)

Now, lets save our tokenizers as a JSON file

In [333]:
import io, json

with io.open('X_tokenizer.json', 'w', encoding='utf-8') as tok:
    tok.write(json.dumps(X_tokenizer.to_json(), ensure_ascii=False))

with io.open('y_tokenizer.json', 'w', encoding='utf-8') as tok:
    tok.write(json.dumps(y_tokenizer.to_json(), ensure_ascii=False))

### Attentive Seq2Seq Model
- Encoder:
    - Embedding layer
    - Bidirectional LSTM
- Attention:
    - Bahdanau / Additive Attention
- Decoder:
    - Embedding layer
    - LSTM
    - Dense Softmax Layer

In [334]:
CONSTANTS["latent_dim_encoder"] = 256
CONSTANTS["latent_dim_decoder"] = 512
CONSTANTS["embedding_dim"] = 50
CONSTANTS["dropout"] = 0.3
CONSTANTS["epochs"] = 16
CONSTANTS["batch_size"] = 32

In [335]:
from tensorflow.keras.layers import Bidirectional, LSTM, Input, Dense, TimeDistributed, Embedding, Concatenate
from tensorflow.keras.models import Model

### <a href="https://arxiv.org/abs/1409.0473">Bahdanau Attention</a> also known as Additive Attention.

Working of our attention layer:
- Let the encoder overall hidden states be $h_j$, if the encoder is bidirectional then $h_j$ is the combination of both forward and backward hidden states.
- Also let the previous decoder hidden state be $s_{i-1}$
- First of all, these inputs are passed through a feed forward network also referred to as the alignment model in the original <a href="https://arxiv.org/abs/1409.0473">paper</a>
- $$ \tilde{\alpha_t} = align(h_j, s_{t-1}) $$
- $$ \tilde{\alpha_t} = V^T. tanh(W.s_{t-1} + U.h_j) $$
   - $\tilde{\alpha_t}$ is a number between $0$ and $1$
   - Large $\tilde{\alpha_t}$ means $h_j$ and $s_{i-1}$ are closely relevant
   - $W, U$ are simple dense layers whose size is equal to the latent dimension
   - $V$ is also a single unit layer used to reduce the dimension of the attention scores, converting it into a scalar $\tilde{\alpha_t}$
- Now these attention socres are converted into probability distribution by running it through a $Softmax$ layer
- $$ [\alpha_1, \alpha_2, ..., \alpha_m] = Softmax([\tilde{\alpha_1}, \tilde{\alpha_2}, ..., \tilde{\alpha_m}]) $$
   - $m =$ Size of enoder inputs
- Finally, the weighted sum of the $e_t$ vectors are computed to determine the context vector
 $$ context \ vector(c) =  \sum_{i=1}^{m}{\alpha_ie_i}$$

In [336]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(BahdanauAttention, self).__init__(**kwargs)
        self.W = tf.keras.layers.Dense(units)
        self.U = tf.keras.layers.Dense(units)
        self.Wc = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, inputs):
        encoder_outputs, decoder_hidden_state, coverage_vector = inputs  # Select the output for the current time step
        
    
        # Calculate attention scores
#         W_s = self.W(tf.expand_dims(decoder_hidden_state, 1))
        
#         print("Ws>", W_s.shape)
        
#         U_h = self.U(encoder_outputs)
        
#         print("Uh>", U_h.shape)
        
#         Wc_ct = self.Wc(coverage_vector)
        
#         print("Ws_ct>", Wc_ct.shape)
        
#         Ws_plus_Uh_plus_Wcct = tf.nn.tanh(U_h + W_s + Wc_ct)
        
#         print("Ws+Uh+Ws_ct>", Ws_plus_Uh_plus_Wcct.shape)
        
        score = self.V(tf.nn.tanh(self.W(tf.expand_dims(decoder_hidden_state, 1)) + self.U(encoder_outputs) + self.Wc(coverage_vector)))
#         score = self.V(Ws_plus_Uh_plus_Wcct)
    
        # Calculate attention weights
        attention_weights = tf.nn.softmax(score, axis=1)
        
#         print("attention_weights>", attention_weights.shape)

        # Calculate the context vector
        context_vector = attention_weights * encoder_outputs
        context_vector = tf.reduce_sum(context_vector, axis=1)
#         print("context_vector>", context_vector.shape)
        
#         print("coverage_vector before adding>", coverage_vector.shape)
        
        coverage_vector += attention_weights 
        
#         print("coverage_vector after adding>", coverage_vector.shape)
        
        return context_vector, attention_weights, coverage_vector

In [337]:
batch = CONSTANTS["batch_size"]
enc_input_len = 300
latent_dim = 256
dec_input_len = 21

con_v, a_w, cov_v = BahdanauAttention(256)([np.random.randn(batch, enc_input_len, latent_dim), np.random.randn(batch, latent_dim), np.zeros((batch, enc_input_len, 1))])

## Now we will define our Attentive Seq2Seq model

In [338]:
from keras import backend as K 
K.clear_session()

In [339]:
class Encoder(tf.keras.Model):
    def __init__(self, X_voc_size, embedding_dim, latent_dim, dropout_rate):
        super(Encoder, self).__init__()
                
        self.encoder_embedding = Embedding(X_voc_size, embedding_dim, trainable=True, name='News_Embedding')
        
        self.encoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, dropout=dropout_rate, recurrent_dropout=dropout_rate, name='Encoder_BiLSTM'))
        
    def call(self, encoder_input):
        encoder_embedding = self.encoder_embedding(encoder_input)
        
        encoder_output, state_h_fwd, state_c_fwd, state_h_bwd, state_c_bwd = self.encoder_lstm(encoder_embedding)
        
        state_h = tf.concat([state_h_fwd, state_h_bwd], -1)
        state_c = tf.concat([state_c_fwd, state_c_bwd], -1)
        
        return encoder_output, state_h, state_c
        

In [340]:
batch = 1
enc_input_len = 6
dec_input_len = 3
embedding_dim = 4
dropout_rate = 0.4
X_voc_len = 17
latent_dim = 2
dropout_rate = 0.4
y_voc_len = 6

test_encoder = Encoder(X_voc_len, embedding_dim, latent_dim, dropout_rate)

In [341]:
tf.random.uniform((batch, enc_input_len)).shape

TensorShape([1, 6])

In [342]:
test_encoder_input = np.random.randint(1, X_voc_len, (batch, enc_input_len))

In [343]:
test_encoder_outputs, test_enc_state_h, test_enc_state_c = test_encoder(test_encoder_input)

In [344]:
print(test_encoder_outputs.shape)
print(test_enc_state_h.shape)
print(test_enc_state_c.shape)

(1, 6, 4)
(1, 4)
(1, 4)


In [345]:
class Decoder(tf.keras.Model):
    def __init__(self, y_voc_size, embedding_dim, latent_dim, dropout_rate):
        super(Decoder, self).__init__()
                
        self.decoder_embedding = Embedding(y_voc_size, embedding_dim, trainable=True, name='Title_Embedding')
        
        self.decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=dropout_rate, recurrent_dropout=dropout_rate, name='Decoder_LSTM')
        
        self.bahdanau_attention = BahdanauAttention(units=latent_dim, name="Bahdanau_Attention")
                
        self.decoder_dense = Dense(y_voc_size, activation='softmax', name="Softmax_Layer")
        
    def call(self, inputs):
        decoder_input, encoder_output, previous_states, coverage_vector = inputs
        
        decoder_embedding = self.decoder_embedding(decoder_input)
        
#         print("deco_emb>", decoder_embedding.shape)
        
        context_vector, attention_weights, coverage_vector = self.bahdanau_attention([encoder_output, previous_states[0], coverage_vector])
        
        decoder_input_with_context = tf.concat([tf.expand_dims(context_vector, 1), decoder_embedding], -1)
        
        decoder_output, state_h, state_c, = self.decoder_lstm(decoder_input_with_context, initial_state=previous_states)
        
        final_decoder_output = self.decoder_dense(tf.squeeze(decoder_output, 1))
        
        return decoder_embedding, final_decoder_output, state_h, state_c, context_vector, attention_weights, coverage_vector
        

In [346]:
test_decoder = Decoder(y_voc_len, embedding_dim, latent_dim*2, dropout_rate)

In [347]:
tf.random.uniform((batch, dec_input_len)).shape

TensorShape([1, 3])

In [348]:
tf.random.uniform((batch, dec_input_len))[:,1]

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.4441538], dtype=float32)>

In [349]:
next_decoder_inputs = tf.expand_dims(tf.random.uniform((batch, dec_input_len))[:,1], 1)
next_decoder_inputs

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.01996183]], dtype=float32)>

In [350]:
test_decoder_embedding, test_decoder_outputs, test_dec_state_h, test_dec_state_c, context_vec, attn_wts, coverage_v = test_decoder([next_decoder_inputs, test_encoder_outputs, [test_enc_state_h, test_enc_state_c], tf.zeros([batch, enc_input_len, 1])])

In [351]:
test_decoder_outputs.shape

TensorShape([1, 6])

In [352]:
test_enc_state_h.shape

TensorShape([1, 4])

In [353]:
print(test_decoder_outputs.shape)
print(attn_wts.shape)
print(coverage_v.shape)

(1, 6)
(1, 6, 1)
(1, 6, 1)


In [354]:
test_decoder_outputs[0]

<tf.Tensor: shape=(6,), dtype=float32, numpy=
array([0.16824193, 0.16571072, 0.16452573, 0.16756766, 0.16841315,
       0.16554074], dtype=float32)>

In [355]:
class PointerGenerator(tf.keras.Model):
    def __init__(self):
        super(PointerGenerator, self).__init__()
                
        self.p_gen_layer = Dense(1, activation='sigmoid', name="Pointer_Generator_Layer")
        
    def call(self, inputs):
        encoder_input, encoder_input_extended, decoder_embedding, decoder_output, decoder_hidden_state, context_vector, attention_weights = inputs
        
        y_voc_size = len(y_tokenizer.word_index) + 1
        
        combined = tf.concat([context_vector, decoder_hidden_state, tf.squeeze(decoder_embedding, 1)], -1)
        
        p_gen = tf.nn.sigmoid(self.p_gen_layer(combined))
        
#         print("p_gen>", p_gen.shape)
        
        p_gen = tf.clip_by_value(p_gen, 0.0001, 0.9999)
        
        P_vocab_weighted = p_gen * decoder_output
        
#         print("P_vocab_weighted>", P_vocab_weighted.shape)
        
        attention_weighted = (1 - p_gen) * tf.squeeze(attention_weights, 2)
        
#         attention_weighted_numpy = attention_weighted.numpy()
#         print("Attention weighted: ", attention_weighted)
        
#         print("attention_weighted>", attention_weighted.shape)
        
        extension = tf.zeros((encoder_input.shape[0], encoder_input.shape[1]))

        P_vocab_extended = tf.concat([P_vocab_weighted, extension], 1)
        
#         print("P_vocab_extended weighted: ", P_vocab_extended)
        
        batch_nums = tf.range(0, limit=encoder_input.shape[0]) # shape (batch_size)
        batch_nums = tf.expand_dims(batch_nums, 1) # shape (batch_size, 1)
        attn_len = tf.shape(attention_weighted)[1] # number of states we attend over
        batch_nums = tf.tile(batch_nums, [1, attn_len]) # shape (batch_size, attn_len)
        indices = tf.stack((batch_nums, encoder_input_extended), axis=2) # shape (batch_size, enc_t, 2)
        shape = [encoder_input.shape[0], P_vocab_extended.shape[1]]
        P_copy_extended = tf.scatter_nd(indices, attention_weighted, shape) # list length max_dec_steps (batch_size, extended_vsize)
        
        final_distribution = P_copy_extended + P_vocab_extended

        return final_distribution

In [356]:
X_tok_word_index = {
    "a": 1,
    "b": 2,
    "c": 3,
    "d": 4,
    "e": 5,
    "f": 6,
    "g": 7,
    "h": 8,
    "i": 9,
    "j": 10,
    "k": 11,
    "l": 12,
    "m": 13,
    "n": 14,
    "o": 15,
    "p": 16
}

X_tok_index_word = {
    1:"a",
    2:"b",
    3:"c",
    4:"d",
    5:"e",
    6:"f",
    7:"g",
    8:"h",
    9:"i",
    10:"j",
    11:"k",
    12:"l",
    13:"m",
    14:"n",
    15:"o",
    16:"p"
}

y_tok = {
    "c": 1,
    "j": 2,
    "l": 3,
    "s": 4,
    "x": 5
}

In [357]:
test_pointer_generator = PointerGenerator()

In [358]:
encoder_input_extended = np.random.randint(1, y_voc_len + enc_input_len, (batch, enc_input_len), dtype=np.int32)

In [359]:
test_encoder_input

array([[ 1, 15,  7,  9, 10, 13]])

In [360]:
encoder_input_extended

array([[6, 3, 1, 6, 4, 6]], dtype=int32)

In [361]:
test_decoder_embedding

<tf.Tensor: shape=(1, 1, 4), dtype=float32, numpy=
array([[[-0.0277988 , -0.0141735 , -0.03498377,  0.04928545]]],
      dtype=float32)>

In [362]:
test_decoder_outputs

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[0.16824193, 0.16571072, 0.16452573, 0.16756766, 0.16841315,
        0.16554074]], dtype=float32)>

In [363]:
attn_wts

<tf.Tensor: shape=(1, 6, 1), dtype=float32, numpy=
array([[[0.16823252],
        [0.16465549],
        [0.16665629],
        [0.16629013],
        [0.16635872],
        [0.1678069 ]]], dtype=float32)>

In [364]:
print("Final: ", test_pointer_generator([test_encoder_input, encoder_input_extended,  test_decoder_embedding, test_decoder_outputs, test_dec_state_h, context_vec, attn_wts]))

Final:  tf.Tensor(
[[0.10450219 0.16606894 0.10219391 0.16646436 0.16763481 0.10282437
  0.19031137 0.         0.         0.         0.         0.        ]], shape=(1, 12), dtype=float32)


In [365]:
test_a = np.random.randint(1, 20, (2, 8))
test_b = np.random.randint(1, 20, (2, 8))

In [366]:
tf.reduce_sum(tf.minimum(test_a, test_b), [1])

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([46, 47])>

In [367]:
test_res = tf.reduce_sum(tf.minimum(test_a, test_b))
test_res

<tf.Tensor: shape=(), dtype=int64, numpy=93>

In [368]:
test_target = np.random.randint(3, size=(10, 1))
test_target

array([[1],
       [0],
       [0],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [1]])

In [369]:
mask = tf.cast(tf.math.not_equal(test_target, 0), tf.float32)
mask

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]], dtype=float32)>

In [370]:
test_coverage_vec = np.random.randn(4, 5)
test_coverage_vec

array([[-1.37808842,  1.62636994, -2.43422622,  0.42757858,  0.11868462],
       [ 0.73697859, -0.81789797,  0.92324399, -0.14057225,  1.26533728],
       [-0.08133002,  1.06554592, -0.34249313,  0.35464897,  0.48775159],
       [-0.91787735, -0.18196523,  0.99520668,  1.82849323, -0.7784003 ]])

In [371]:
test_target = np.random.randint(3, size=(4, 1))
test_target

array([[2],
       [2],
       [0],
       [2]])

In [372]:
test_coverage_mask = tf.cast(tf.math.not_equal(test_target, 0), tf.float32)
test_coverage_mask

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[1.],
       [1.],
       [0.],
       [1.]], dtype=float32)>

In [373]:
test_coverage_mask * test_coverage_vec

<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
array([[-1.3780885 ,  1.62637   , -2.4342263 ,  0.42757857,  0.11868462],
       [ 0.7369786 , -0.817898  ,  0.923244  , -0.14057225,  1.2653372 ],
       [-0.        ,  0.        , -0.        ,  0.        ,  0.        ],
       [-0.9178774 , -0.18196523,  0.9952067 ,  1.8284932 , -0.7784003 ]],
      dtype=float32)>

In [374]:
def sparse_categorical_and_coverage_loss(targets, output_dist, attn_wts, coverage, _lambda=1):
    sccel_loss = tf.keras.losses.SparseCategoricalCrossentropy()
    mask = tf.cast(tf.math.not_equal(targets, 0), tf.float32)
    sccel = sccel_loss(targets, output_dist, sample_weight=mask)
    
    attn_wts = tf.squeeze(attn_wts, 2)
    coverage = tf.squeeze(coverage, 2)
    
    coverage = coverage * mask
    attn_wts = attn_wts * mask
    
    coverage_loss = tf.reduce_sum(tf.minimum(attn_wts, coverage)) / output_dist.shape[0]
    
    return sccel + _lambda * coverage_loss, coverage_loss

In [375]:
class HeadlineGeneratorTrainer(tf.keras.Model):
    def __init__(self, encoder, decoder, pointer_generator, _lambda=1):
        super(HeadlineGeneratorTrainer, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.pointer_generator = pointer_generator
        self._lambda = _lambda

    # This method will be called by model.fit for each batch.
    @tf.function
    def train_step(self, inputs):
        loss = 0.
        coverage_loss = 0.
        
        encoder_input_seq, encoder_input_seq_extended, decoder_input_seq, decoder_target_seq = inputs

        with tf.GradientTape() as tape:
            encoder_output_seq, state_h, state_c = self.encoder(encoder_input_seq, training=False)
            
            coverage_vector = tf.zeros([encoder_input_seq.shape[0], encoder_input_seq.shape[1], 1])
            
            attention_weights_over_all_timesteps = None
            context_vectors_over_all_timesteps = None
            
            # We need to create a loop to iterate through the target sequences
            for i in range(decoder_target_seq.shape[1]):

                # Input to the decoder must have shape of (batch_size, length)
                # so we need to expand one dimension (just like in the previous example).
                next_decoder_input = tf.expand_dims(decoder_input_seq[:, i], 1)
                decoder_embedding, decoder_dist, state_h, state_c, context_vector, attention_weights, coverage_vector_next_t = self.decoder(
                    [next_decoder_input, encoder_output_seq, [state_h, state_c], coverage_vector], training=False)
                if attention_weights_over_all_timesteps == None:
                    attention_weights_over_all_timesteps = tf.expand_dims(tf.squeeze(attention_weights, 2), 1)
                else:
                    tf.concat([attention_weights_over_all_timesteps, tf.expand_dims(tf.squeeze(attention_weights, 2), 1)], 1)
                    
                if context_vectors_over_all_timesteps == None:
                    context_vectors_over_all_timesteps = tf.expand_dims(context_vector, 1)
                else:
                    tf.concat([context_vectors_over_all_timesteps, tf.expand_dims(context_vector, 1)], 1)
                    
                final_dist = pointer_generator([encoder_input_seq, encoder_input_seq_extended, decoder_embedding, decoder_dist, state_h, context_vector, attention_weights], training=True)
                
                final_dist = tf.nn.softmax(final_dist, -1)

                # The loss is now accumulated through the whole batch
                overall_loss, cov_loss = self.loss(decoder_target_seq[:, i], final_dist, attention_weights, coverage_vector, self._lambda)
                loss += overall_loss
                coverage_loss += cov_loss
                coverage_vector = coverage_vector_next_t

                # Update the parameters and the optimizer
        variables = encoder.trainable_variables + decoder.trainable_variables + pointer_generator.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        return {'loss': loss / decoder_target_seq.shape[1], 'coverage_loss': coverage_loss /  decoder_target_seq.shape[1]}
    
    @tf.function
    def test_step(self, inputs):
        loss = 0.
        coverage_loss = 0.
        
        encoder_input_seq, encoder_input_seq_extended, decoder_input_seq, decoder_target_seq = inputs

        encoder_output_seq, state_h, state_c = self.encoder(encoder_input_seq, training=False)
        
        coverage_vector = tf.zeros([encoder_input_seq.shape[0], encoder_input_seq.shape[1], 1])

        attention_weights_over_all_timesteps = None
        context_vectors_over_all_timesteps = None

        # We need to create a loop to iterate through the target sequences
        for i in range(decoder_target_seq.shape[1]):

            # Input to the decoder must have shape of (batch_size, length)
            # so we need to expand one dimension (just like in the previous example).
            next_decoder_input = tf.expand_dims(decoder_input_seq[:, i], 1)
            decoder_embedding, decoder_dist, state_h, state_c, context_vector, attention_weights, coverage_vector_next_t = self.decoder(
                [next_decoder_input, encoder_output_seq, [state_h, state_c], coverage_vector], training=False)
            if attention_weights_over_all_timesteps == None:
                attention_weights_over_all_timesteps = tf.expand_dims(tf.squeeze(attention_weights, 2), 1)
            else:
                tf.concat([attention_weights_over_all_timesteps, tf.expand_dims(tf.squeeze(attention_weights, 2), 1)], 1)

            if context_vectors_over_all_timesteps == None:
                context_vectors_over_all_timesteps = tf.expand_dims(context_vector, 1)
            else:
                tf.concat([context_vectors_over_all_timesteps, tf.expand_dims(context_vector, 1)], 1)

            final_dist = pointer_generator([encoder_input_seq, encoder_input_seq_extended, decoder_embedding, decoder_dist, state_h, context_vector, attention_weights], training=False)

            final_dist = tf.nn.softmax(final_dist, -1) 

            # The loss is now accumulated through the whole batch
            overall_loss, cov_loss = self.loss(decoder_target_seq[:, i], final_dist, attention_weights, coverage_vector, self._lambda)
            loss += overall_loss
            coverage_loss += cov_loss
            coverage_vector = coverage_vector_next_t


        return {'loss': loss / decoder_target_seq.shape[1], 'coverage_loss': coverage_loss /  decoder_target_seq.shape[1]}

In [376]:
CONSTANTS["lambda"] = 1
CONSTANTS

{'max_title_length': 19,
 'max_news_length': 512,
 'news_voc_size': 26454,
 'title_voc_size': 3320,
 'latent_dim_encoder': 256,
 'latent_dim_decoder': 512,
 'embedding_dim': 50,
 'dropout': 0.3,
 'epochs': 16,
 'batch_size': 32,
 'lambda': 1}

In [378]:
encoder = Encoder(CONSTANTS["news_voc_size"], CONSTANTS["embedding_dim"], CONSTANTS["latent_dim_encoder"], CONSTANTS["dropout"])
decoder = Decoder(CONSTANTS["title_voc_size"], CONSTANTS["embedding_dim"], CONSTANTS["latent_dim_decoder"], CONSTANTS["dropout"])
pointer_generator = PointerGenerator()
optimizer = tf.keras.optimizers.Adam()

headline_generator_trainer = HeadlineGeneratorTrainer(encoder, decoder, pointer_generator)
headline_generator_trainer.compile(optimizer=optimizer, loss=sparse_categorical_and_coverage_loss)

In [379]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_pad_seq, 
                                              X_train_pad_seq_extended,
                                              y_train_input, 
                                              y_train_target)).batch(CONSTANTS["batch_size"], drop_remainder=True)

In [380]:
X_train_pad_seq.shape

(732, 512)

In [381]:
X_train_pad_seq_extended.shape

(732, 512)

In [382]:
val_dataset = tf.data.Dataset.from_tensor_slices((X_val_pad_seq, 
                                              X_val_pad_seq_extended,
                                              y_val_input, 
                                              y_val_target)).batch(1, drop_remainder=True)

In [384]:
# import time
# for epoch in range(CONSTANTS["epochs"]):
#     print("\nStart of epoch %d" % (epoch,))
#     start_time = time.time()

#     num_batches = 0
#     train_loss = 0.
#     train_coverage_loss = 0.
    
#     # Iterate over the batches of the dataset.
#     for step, (X_train_seq, X_train_seq_extended, y_train_input, y_train_target) in enumerate(train_dataset):
#         losses = headline_generator_trainer.train_step([X_train_seq, X_train_seq_extended, y_train_input, y_train_target])
#         train_loss += losses["loss"]
#         train_coverage_loss += losses["coverage_loss"]
#         num_batches += 1

#     print(f"train_loss: {train_loss / num_batches} train_coverage_loss: {train_coverage_loss / num_batches}")

#     val_loss = 0.
#     val_coverage_loss = 0.
#     num_batches = 0
#     # Run a validation loop at the end of each epoch.
#     for X_train_seq, X_train_seq_extended, y_train_input, y_train_target in val_dataset:
#         losses = headline_generator_trainer.test_step([X_train_seq, X_train_seq_extended, y_train_input, y_train_target])
#         train_loss += losses["loss"]
#         train_coverage_loss += losses["coverage_loss"]
#         num_batches += 1

#     print(f"val_loss: {val_loss / num_batches} val_coverage_loss: {val_coverage_loss / num_batches}")


In [None]:
CONSTANTS["patience"] = 3

In [None]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=CONSTANTS["patience"])
cp = tf.keras.callbacks.ModelCheckpoint("/kaggle/working/NepaliNewsClassifier", monitor="val_loss", mode="min", save_best_only=True)

In [None]:
# headline_generator_trainer.fit(dataset, epochs=CONSTANTS["epochs"])
history = headline_generator_trainer.fit(train_dataset, epochs=CONSTANTS["epochs"], validation_data=val_dataset, callbacks=[es, cp])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
 4/22 [====>.........................] - ETA: 2:57 - loss: 3.8363 - coverage_loss: 0.2233

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Training and Validation Loss vs. Epochs')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['val_loss'])
plt.plot(history.history['val_coverage_loss'])
plt.title('Coverage loss')
plt.ylabel('Training and Validation Coverage Loss vs. Epochs')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## SEPERATE

In [None]:
# def summarize_with_beam_search(news_sequence, X_tokenizer, y_tokenizer, beam_width=3, alpha=0.8, sos_token='sos', eos_token='eos', max_title_length=21):

#     encoder_output, h_t_minus_1, c_t_minus_1 = inf_encoder_model.predict(news_sequence)
    
#     # Initialize the beam of sequences.
#     # Initially we start with the SOS_TOKEN
#     beam = [[0.0, [y_tokenizer.word_index[sos_token]]]]
    
#     # Decode the sequence one token at a time.
#     for i in range(max_title_length):

#         # Expand the beam.
#         expanded_beam = []
#         for log_probability_score, sequence in beam:
#             if (sequence[-1] != y_tokenizer.word_index[eos_token]):
                
#                 # Generate empty target sequence of length 1.
#                 word_n_minus_1 = np.zeros((1,1))
    
#                 # Populate the first word of target sequence with the start word.
#                 word_n_minus_1[0, 0] = sequence[-1]
                
#                 decoder_output, h_t, c_t = inf_decoder_model.predict([word_n_minus_1, encoder_output, h_t_minus_1, c_t_minus_1])

#                 prob_dist = decoder_output[0, -1, :]

#                 # Generate all possible next tokens for the sequence.
#                 for word_n_index in range(len(prob_dist)):
#                     expanded_beam.append([log_probability_score + np.log(prob_dist[word_n_index]), sequence + [word_n_index]])

#         # Prune the beam to get the top-K
#         beam = sorted(expanded_beam, key=lambda x: x[0], reverse=True)[:beam_width]
        
#         # Check if all of the top-K sequences have encountered the EOS token.
#         # Or all of the top-K sequences have length > max_title_length
#         if all(sequence[-1] == y_tokenizer.word_index[eos_token] for prob, sequence in beam):
#             ''' This section indicates the top-K sequences has been generated '''
#             ''' Finally, we perform length normalization on the log proability score of each sequence before exiting '''
#             for i in range(len(beam)):
#                 beam[i][1] = beam[i][1][1:] # Remove the SOS_TOKEN from the start
#                 beam[i][0] /= (len(beam[i][1])**alpha) # Perform length normalization       
#             beam = sorted(beam, key=lambda x: x[0], reverse=True)
#             break

#         # Update the internal states for the next time step t+1.
#         h_t_minus_1, c_t_minus_1 = h_t, c_t

#     # Return the sequence with the highest score from the beam as sentence.
#     return sequence_to_words(beam[0][1], y_tokenizer)

### Training using Early Stopping

In [None]:
# from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# CONSTANTS["patience"] = 5
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=CONSTANTS["patience"])

In [None]:
# import pickle

# with open('/kaggle/working/train_history.pkl', 'wb') as hist:
#     pickle.dump(history.history, hist)

In [None]:
# model.evaluate([X_test_pad_seq, y_test_input], y_test_target)

Finally, we save our model and constant variables for future usage

In [None]:
# # Convert and write JSON object to file
# with open("/kaggle/working/constants.json", "w") as const: 
#     json.dump(CONSTANTS, const)

In [None]:
# model.save('Nepali_News_Headline_Gen_Model')

In [None]:
# !zip -r model_ouput.zip /kaggle/working