# Importing the basic libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Accessing the dataset from the Kaggle input directory

In [None]:
# Location of the cleaned news/title CSV inside the Kaggle input directory.
path = "/kaggle/input/summarization-set/full_cleaned.csv"

# Reading the dataset

In [None]:
# Load the cleaned dataset into memory.
df = pd.read_csv(path)
# Remember the untouched row count so the final cell can report how much data
# the filtering steps removed.
original_dataset_size = len(df["news"])
# Keep the preview as the last expression, otherwise the notebook never shows it.
df.head()

- The dataset has already been made free of english words, arabic numerals, emojis, and special characters and null values.<br>
- Further, stopwords have been removed from the news articles, while they have been kept as they are in the headlines, since headlines are already very short and the stopwords are integral in keeping the headline concise.

In [None]:
# Number of missing values per column.
missing_per_column = df.isna().sum()
print(missing_per_column)

#### Now let's remove any duplicate news articles

In [None]:
# Rows are duplicates only when both the title and the article body match.
duplicate_keys = ['title', 'news']

print("Size of dataset:")
print(f"Before removing duplicates: {df.shape[0]}")
df = df.drop_duplicates(subset=duplicate_keys)
print(f"After removing duplicates: {df.shape[0]}")

#### Check for any data with empty title or news

In [None]:
# Rows whose article body is the empty string.
df.loc[df["news"].eq("")]

In [None]:
# Rows whose title is the empty string.
df.loc[df["title"].eq("")]

Now that we have completed our data preprocessing part, lets analyze the length of news and titles

This is necessary because both news and titles are of variable length. Some are too big while some are too small. <br>

Our network only accepts fixed sized inputs. So we need to create a threshold of how many words to take from each news and heading to train our model.<br>

#### Computing the length of news and titles

In [None]:
def _word_count(text):
    """Number of whitespace-separated words in `text`."""
    return len(text.split())


# Adds `title_length` and `news_length` (in that order) to the frame.
for text_column in ("title", "news"):
    df[f"{text_column}_length"] = df[text_column].map(_word_count)

#### Analyzing titles

In [None]:
# Summary statistics of the number of words per title.
title_length_stats = df["title_length"].describe()
title_length_stats

In [None]:
# Histogram of title lengths using 20 bins.
title_lengths = df["title_length"]
title_lengths.hist(bins=20)

The distribution of the length of titles is similar to the Normal Distribution.

Lets look at the % of titles whose length is $<=x $,  where $x \in [8, 15]$

In [None]:
# Percentage of titles that are at most `limit` words long, for limit = 8..15.
total_data = len(df["title"])
title_limits = range(8, 16)
x = [f"<={limit}" for limit in title_limits]
y = [
    int((df["title_length"] <= limit).sum()) / total_data * 100
    for limit in title_limits
]

In [None]:
# Bar chart of the cumulative title-length percentages computed above.
sns.barplot(y=y, x=x)

In [None]:
# (threshold label, percentage) pairs, in threshold order.
[(label, percentage) for label, percentage in zip(x, y)]

In [None]:
# How many articles have a one-word title.
int((df["title_length"] == 1).sum())

In [None]:
# How many articles have a two-word title.
int((df["title_length"] == 2).sum())

Let's remove the articles whose title consists of only one or two words

In [None]:
# Keep only articles whose title has more than two words. The filter drops
# one- AND two-word titles, so the printed messages say so explicitly.
print("Size of dataset: ")
print("Before removing articles with a title of at most two words: ", len(df["news"]))
df = df[df["title_length"] > 2]
print("After removing articles with a title of at most two words: ", len(df["news"]))

So 12 or 13 seems like a good choice for the maximum length of titles to train our model on since more than 95% of our data has title of length less than or equal to 12, 13.<br>

However, we will choose a more generous limit: the maximum title length is set to 21 further below.


#### Analyzing news

In [None]:
# Summary statistics of the number of words per news article.
news_length_stats = df["news_length"].describe()
news_length_stats

Now let's look at the news articles whose length is less than 30 words, which we will remove below

In [None]:
# Current number of rows.
df.shape[0]

In [None]:
# Preview a few of the shortest articles.
# NOTE(review): this preview uses 36 words while the filter cells use 30 — confirm which is intended.
df.loc[df["news_length"].lt(36)].head()

In [None]:
# Number of articles shorter than 30 words.
short_news_count = int((df["news_length"] < 30).sum())
print(short_news_count)

In [None]:
# Share (in %) of articles below / at-or-above the 30-word mark.
row_total = len(df["news_length"])
below_30 = len(df[df["news_length"] < 30])
at_least_30 = len(df[df["news_length"] >= 30])
print(below_30 / row_total * 100)
print(at_least_30 / row_total * 100)

Our model will be able to learn better if we remove small news articles

Let's remove the articles that are shorter than 30 words and check how many instances are dropped

In [None]:
# Drop articles with fewer than 30 words and report the effect on the dataset size.
is_too_short = df["news_length"] < 30

print("Dataset size:")
print("Before removing short news: ", len(df))
print("Instances to be removed: ", int(is_too_short.sum()))
df = df.loc[df["news_length"] >= 30]
print("After removing short news: ", len(df))

In [None]:
# Number of rows left after dropping the short articles.
len(df.index)

In [None]:
# Percentage of articles that are at most `limit` words long, for limit = 200, 210, ..., 700.
total_data = len(df["news"])
news_limits = range(200, 701, 10)
x = [f"<={limit}" for limit in news_limits]
y = [
    int((df["news_length"] <= limit).sum()) / total_data * 100
    for limit in news_limits
]

In [None]:
# (threshold label, percentage) pairs, in threshold order.
[(label, percentage) for label, percentage in zip(x, y)]

Since we have limited resources to train our model, we will use only the first 300 words of each news article (`max_news_length`) to predict the title

In [None]:
# Number of (non-null) articles longer than 400 words.
df.loc[df["news_length"] > 400, "news"].count()

In [None]:
# Number of (non-null) articles longer than 350 words.
df.loc[df["news_length"] > 350, "news"].count()

In [None]:
# Drop articles longer than 400 words and report the effect on the dataset size.
is_too_long = df["news_length"] > 400

print("Dataset size:")
print("Before removing long news: ", len(df))
print("Instances to be removed: ", int(is_too_long.sum()))
df = df.loc[df["news_length"] <= 400]
print("After removing long news: ", len(df))

In [None]:
# Compare the current dataset size with the size recorded right after loading.
remaining_rows = len(df["news"])
removed_rows = original_dataset_size - remaining_rows

print("Size of original dataset:", original_dataset_size)
print("Size of dataset after removing cleaning:", remaining_rows)
print("% of dataset removed:", removed_rows / original_dataset_size * 100)
print("% of datset remaining:", remaining_rows / original_dataset_size * 100)

In [None]:
# Hyper-parameters and preprocessing limits are collected in this dictionary.
CONSTANTS = dict()

In [None]:
# Maximum number of words kept per news article / per title.
CONSTANTS.update({
    "max_news_length": 300,
    "max_title_length": 21,
})

#### Now we remove the extra part of the news and headlines so that all of them have length of $<=$ their corresponding allowed length

In [None]:
def _first_words(text, limit):
    """Return `text` reduced to its first `limit` whitespace-separated words."""
    return " ".join(text.split()[:limit])


# The titles later receive a leading 'sos' and a trailing 'eos' token and are
# padded/truncated to `max_title_length`. Reserve two positions for those
# tokens: otherwise a full-length title grows to max_title_length + 2 tokens
# and pad_sequences (which truncates from the front by default) would cut off
# the start token.
title_word_budget = max(CONSTANTS["max_title_length"] - 2, 0)

df["title_cut"] = df["title"].apply(lambda x: _first_words(x, title_word_budget))
df["news_cut"] = df["news"].apply(lambda x: _first_words(x, CONSTANTS["max_news_length"]))

##### Now lets add the start and end token to our headlines

In [None]:
# Wrap every truncated title in the start ('sos') and end ('eos') markers.
df["title_cut"] = "sos " + df["title_cut"] + " eos"

In [None]:
# First five rows, now including the truncated text columns.
df.head(5)

Now lets split our dataset into train, test and validation set.<br>
- 80% of the data will be used to train our model.
- 10% will be used for validation
- The final 10% will be used to test our model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Print the column names, dtypes and non-null counts of the frame that is about to be split.
df.info()

In [None]:
# Columns that belong to neither the model input nor the target.
_helper_columns = ['title_length', 'news_length', 'category']

# Inputs keep the news columns, targets keep the title columns.
news_frame = df.drop(columns=['title', 'title_cut'] + _helper_columns)
title_frame = df.drop(columns=['news', 'news_cut'] + _helper_columns)

# 80% for training; the remaining 20% is split again in a later cell.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    news_frame,
    title_frame,
    test_size=0.2,
    random_state=21,
    shuffle=True,
)

In [None]:
# First five training inputs.
X_train.head(5)

In [None]:
# First five training targets.
y_train.head(5)

In [None]:
# Halve the held-out 20% into equally sized validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=0.5,
    random_state=21,
    shuffle=True,
)

In [None]:
# First five validation inputs.
X_val.head(5)

In [None]:
# Report the absolute and relative size of the full dataset and of each split.
total_rows = len(df['title'])
split_sizes = (
    ("dataset", total_rows),
    ("training set", len(X_train['news'])),
    ("validation set", len(X_val['news'])),
    ("test set", len(X_test['news'])),
)
for split_name, split_rows in split_sizes:
    print(f"Length of {split_name}: {split_rows}, in %: {split_rows / total_rows * 100}")

#### Now lets build our vocabulary  and convert our words to integers

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#### First we build the vocabulary for the news

In [None]:
# The news vocabulary is learned from the training split only.
X_tokenizer = Tokenizer()
X_tokenizer.fit_on_texts(X_train["news_cut"].tolist())

In [None]:
# Number of distinct news words plus one.
1 + len(X_tokenizer.word_index)

In [None]:
# Number of news words that occur fewer than two times in the training texts.
count = sum(1 for occurrences in X_tokenizer.word_counts.values() if occurrences < 2)
print(count)

In [None]:
# X_tokenizer.word_index

Now we convert our words to integers

In [None]:
# Map every news text of each split to its sequence of word ids.
X_train_seq, X_val_seq, X_test_seq = (
    X_tokenizer.texts_to_sequences(split["news_cut"])
    for split in (X_train, X_val, X_test)
)

Finally we will pad our sequences so that all the inputs are of same length

In [None]:
def _pad_news(sequences):
    """Zero-pad news id sequences at the end up to `max_news_length`."""
    return pad_sequences(sequences, maxlen=CONSTANTS["max_news_length"], padding='post')


# News shorter than the maximum length get zeros appended at the end.
X_train_pad_seq = _pad_news(X_train_seq)
X_val_pad_seq = _pad_news(X_val_seq)
X_test_pad_seq = _pad_news(X_test_seq)

In [None]:
# First five padded news sequences.
X_train_pad_seq[0:5]

In [None]:
# Shape of the padded training inputs.
np.asarray(X_train_pad_seq).shape

In [None]:
# Shape of one padded news sequence.
first_news = X_train_pad_seq[0]
first_news.shape

In [None]:
# Shape of one news sequence reshaped into a batch of size 1. The width comes
# from CONSTANTS instead of a hard-coded 300 so it follows `max_news_length`.
X_train_pad_seq[0].reshape(1, CONSTANTS["max_news_length"]).shape

In [None]:
# One news sequence reshaped into a batch of size 1. The width comes from
# CONSTANTS instead of a hard-coded 300 so it follows `max_news_length`.
X_train_pad_seq[0].reshape(1, CONSTANTS["max_news_length"])

We compute the size of our news vocabulary and store it

In [None]:
# News vocabulary size: number of known words plus one.
X_voc_size = 1 + len(X_tokenizer.word_index)
X_voc_size

#### Now for the headlines

We will perform the same operations as the news. However, we will convert our sequences to appropriate format for teacher forcing before padding them.

In [None]:
# The headline vocabulary is learned from the training split only.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(y_train["title_cut"].tolist())

In [None]:
# Map every title of each split to its sequence of word ids.
y_train_seq, y_val_seq, y_test_seq = (
    y_tokenizer.texts_to_sequences(split["title_cut"])
    for split in (y_train, y_val, y_test)
)

## Teacher Forcing

Now we will convert our sequences to appropriate format for teacher forcing

In [None]:
# Titles shorter than `max_title_length` tokens get zeros appended at the end.
# Over-long titles are truncated at the END (truncating='post'): the default
# truncates from the front and would cut off the leading 'sos' token that the
# decoder input must start with.
y_train_padded_seq = pad_sequences(y_train_seq, maxlen=CONSTANTS["max_title_length"], padding='post', truncating='post')
y_val_padded_seq = pad_sequences(y_val_seq, maxlen=CONSTANTS["max_title_length"], padding='post', truncating='post')
y_test_padded_seq = pad_sequences(y_test_seq, maxlen=CONSTANTS["max_title_length"], padding='post', truncating='post')

In [None]:
def _teacher_forcing_pair(padded):
    """Split padded title ids into (decoder input, decoder target).

    The input drops the last position and the target drops the first one, so
    the target at step t is the token that follows the input at step t.
    """
    return padded[:, :-1].copy(), padded[:, 1:].copy()


y_train_input, y_train_target = _teacher_forcing_pair(y_train_padded_seq)
y_val_input, y_val_target = _teacher_forcing_pair(y_val_padded_seq)
y_test_input, y_test_target = _teacher_forcing_pair(y_test_padded_seq)

In [None]:
# Shape of the decoder inputs.
np.shape(y_train_input)

In [None]:
# Shape of the decoder targets.
np.shape(y_train_target)

In [None]:
# Shape the targets would have with an extra trailing axis of size 1.
y_train_target.reshape(*y_train_target.shape[:2], 1).shape

In [None]:
def _with_trailing_axis(targets):
    """Reshape targets to (samples, steps, 1) using their first two dimensions."""
    n_samples, n_steps = targets.shape[0], targets.shape[1]
    return targets.reshape(n_samples, n_steps, 1)


y_train_target = _with_trailing_axis(y_train_target)
y_test_target = _with_trailing_axis(y_test_target)
y_val_target = _with_trailing_axis(y_val_target)

In [None]:
# Shape of the training targets after the reshape.
np.shape(y_train_target)

We compute the size of our headline vocabulary and store it

In [None]:
# Headline vocabulary size: number of known words plus one.
y_voc_size = 1 + len(y_tokenizer.word_index)
y_voc_size

In [None]:
# First two decoder input sequences.
y_train_input[0:2]

In [None]:
# First two decoder target sequences.
y_train_target[0:2]

In [None]:
# First training target, viewed as (steps, 1).
y_train_target.reshape(*y_train_target.shape[:2], 1)[0]

In [None]:
# Shape of the training targets.
np.shape(y_train_target)

In [None]:
# First training target, viewed as (steps, 1).
y_train_target.reshape(*y_train_target.shape[:2], 1)[0]

Now, lets save our tokenizers as a JSON file

In [None]:
import io, json

# Persist both tokenizers. `to_json()` already yields a JSON string, and that
# string is JSON-encoded once more before it is written to the file.
tokenizer_files = (
    ('X_tokenizer.json', X_tokenizer),
    ('y_tokenizer.json', y_tokenizer),
)
for tokenizer_file, tokenizer in tokenizer_files:
    with io.open(tokenizer_file, 'w', encoding='utf-8') as tok:
        tok.write(json.dumps(tokenizer.to_json(), ensure_ascii=False))

### Attentive Seq2Seq Model
- Encoder:
    - Embedding layer
    - LSTM
    - LSTM
    - LSTM (three stacked LSTM layers; the implementation below uses plain, unidirectional `LSTM` layers)
- Attention:
    - Bahdanau / Additive Attention
- Decoder:
    - Embedding layer
    - LSTM
    - Dense Softmax Layer

In [None]:
# Model and training hyper-parameters.
CONSTANTS.update({
    "latent_dim": 300,
    "embedding_dim": 100,
    "dropout": 0.3,
    "epochs": 10,
    "batch_size": 64,
})

In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM, Input, Dense, TimeDistributed, Embedding, Concatenate
from tensorflow.keras.models import Model

### <a href="https://arxiv.org/abs/1409.0473">Bahdanau Attention</a> also known as Additive Attention.

Working of our attention layer:
- Let the encoder and decoder hidden states be $e_t$ and $d_t$ at timestep $t$
- First of all, these inputs are passed through a feed forward network also referred to as the alignment model in the original <a href="https://arxiv.org/abs/1409.0473">paper</a>
- $$ \tilde{\alpha_t} = align(e_t, d_t) $$
- $$ \tilde{\alpha_t} = V^T. tanh(W.e_t + U.d_t) $$
   - $\tilde{\alpha_t}$ is an unnormalised alignment score; it is only mapped to a value between $0$ and $1$ by the $Softmax$ applied afterwards
   - Large $\tilde{\alpha_t}$ means $e_t$ and $d_t$ are closely relevant
   - $W, U$ are simple dense layers whose size is equal to the latent dimension
   - $V$ is also a single unit layer used to reduce the dimension of the attention scores, converting it into a scalar $\tilde{\alpha_t}$
- Now these attention scores are converted into a probability distribution by running them through a $Softmax$ layer
- $$ [\alpha_1, \alpha_2, ..., \alpha_m] = Softmax([\tilde{\alpha_1}, \tilde{\alpha_2}, ..., \tilde{\alpha_m}]) $$
   - $m =$ Size of encoder inputs
- Finally, the weighted sum of the $e_t$ vectors are computed to determine the context vector
 $$ context \ vector(c) =  \sum_{i=1}^{m}{\alpha_ie_i}$$

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer

class AttentionLayer(Layer):
    """
    Bahdanau (additive) attention layer (https://arxiv.org/pdf/1409.0473.pdf).

    Three trainable weight matrices are created in `build`:
      * W_a: (en_hidden, en_hidden), multiplied with every encoder output.
      * U_a: (de_hidden, en_hidden), multiplied with one decoder output at a time.
      * V_a: (en_hidden, 1), reduces the tanh activation to a single score.

    Input:  [encoder_out_seq, decoder_out_seq]; `build` indexes both shapes up
            to position 2, i.e. it expects (batch, time, hidden) tensors.
    Output: (context_vectors, attention_weights).
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable weights; `input_shape` is [encoder_shape, decoder_shape]."""
        assert isinstance(input_shape, list)
        en_hidden = input_shape[0][2]
        de_hidden = input_shape[1][2]

        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((en_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((de_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((en_hidden, 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        Compute one context vector per decoder time step.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        verbose: when True, print the shapes of the intermediate tensors.

        Returns (c_outputs, e_outputs):
            c_outputs -- (batch_size, de_seq_len, en_hidden) context vectors
            e_outputs -- (batch_size, de_seq_len, en_seq_len) attention weights
        """
        assert isinstance(inputs, list)
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        # S.Wa does not depend on the decoder step, so it is computed once here
        # rather than inside every energy_step call.
        # <= batch_size, en_seq_len, en_hidden
        W_a_dot_s = K.dot(encoder_out_seq, self.W_a)

        def energy_step(inputs, states):
            """ Attention weights for a single decoder state.
            inputs: decoder output of one time step, (batch_size, de_hidden)
            states: placeholder state required by K.rnn; it is never read
            """
            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, (list, tuple)), assert_msg

            # hj.Ua  <= batch_size, 1, en_hidden
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)
            if verbose:
                print('Ua.h>', U_a_dot_h.shape)

            # tanh(S.Wa + hj.Ua)  <= batch_size, en_seq_len, en_hidden
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)
            if verbose:
                print('Ws+Uh>', Ws_plus_Uh.shape)

            # softmax(va.tanh(S.Wa + hj.Ua))  <= batch_size, en_seq_len
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Context vector ci for one decoder step.
            inputs: attention weights of that step, (batch_size, en_seq_len)
            states: placeholder state required by K.rnn; it is never read
            """
            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, (list, tuple)), assert_msg

            # Weighted sum of the encoder outputs  <= batch_size, en_hidden
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        # K.rnn needs initial states; their values are never read by the step
        # functions, only their shapes have to match what the steps return.
        fake_state_c = K.sum(encoder_out_seq, axis=1)  # <= batch_size, en_hidden
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # <= batch_size, en_seq_len

        # Attention weights for every decoder step
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Context vectors for every decoder step
        # c_outputs => (batch_size, de_seq_len, en_hidden)
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Shapes of (context vectors, attention weights).

        The context vectors are weighted sums of ENCODER outputs, so their last
        dimension is the encoder hidden size (input_shape[0][2]).
        """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau) attention built from three Dense layers.

    score = V(tanh(W(encoder_outputs) + U(decoder_outputs))), followed by a
    softmax over the encoder time axis.

    Args:
        units: output size of the W and U projections.
    """

    def __init__(self, units, **kwargs):
        super(BahdanauAttention, self).__init__(**kwargs)
        self.units = units
        self.W = tf.keras.layers.Dense(units)
        self.U = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, encoder_outputs, decoder_outputs):
        """Attend over the encoder outputs for every decoder time step.

        Args:
            encoder_outputs: (batch, en_seq_len, en_hidden)
            decoder_outputs: (batch, de_seq_len, de_hidden)

        Returns:
            context vectors   (batch, de_seq_len, en_hidden)
            attention weights (batch, de_seq_len, en_seq_len)

        All decoder steps are handled at once through broadcasting. A Python
        loop over `decoder_outputs.shape[1]` cannot run when that dimension is
        unknown (None), and it recomputes W(encoder_outputs) on every step.
        """
        # (batch, 1, en_seq_len, units)
        W_s = tf.expand_dims(self.W(encoder_outputs), axis=1)
        # (batch, de_seq_len, 1, units)
        U_h = tf.expand_dims(self.U(decoder_outputs), axis=2)

        # (batch, de_seq_len, en_seq_len): one score per decoder/encoder position pair
        score = tf.squeeze(self.V(tf.nn.tanh(W_s + U_h)), axis=-1)

        # Normalise over the encoder positions.
        attention_weights = tf.nn.softmax(score, axis=-1)

        # Weighted sum of the encoder outputs for every decoder step.
        context_vectors = tf.matmul(attention_weights, encoder_outputs)

        return context_vectors, attention_weights

    def get_config(self):
        """Include `units` so the layer can be re-created from its config."""
        config = super(BahdanauAttention, self).get_config()
        config.update({"units": self.units})
        return config

## Now we will define our Attentive Seq2Seq model

In [None]:
# Reset Keras' global state before building the model. This goes through
# tf.keras so that the notebook-level alias `K` (tensorflow.keras.backend,
# which AttentionLayer uses at call time) is not rebound to the backend of the
# standalone `keras` package.
tf.keras.backend.clear_session()

In [None]:
## ENCODER ##

encoder_inputs = Input(shape=(None,), name='Encoder_Input')

# Embedding layer
encoder_embedding = Embedding(X_voc_size, CONSTANTS["embedding_dim"], trainable=True, name='News_Embedding')(encoder_inputs)

# Encoder Bidirectional LSTM 1
encoder_lstm1 = LSTM(CONSTANTS["latent_dim"], return_sequences=True, return_state=True, dropout=CONSTANTS["dropout"], name='Encoder_LSTM_1')
encoder_output1, _, _, = encoder_lstm1(encoder_embedding)

# Encoder Bidirectional LSTM 2
encoder_lstm2 = LSTM(CONSTANTS["latent_dim"], return_sequences=True, return_state=True, dropout=CONSTANTS["dropout"], name='Encoder_LSTM_2')
encoder_output2, _, _, = encoder_lstm2(encoder_output1)

# Encoder Bidirectional LSTM 3
encoder_lstm3 = LSTM(CONSTANTS["latent_dim"], return_sequences=True, return_state=True, dropout=CONSTANTS["dropout"], name='Encoder_LSTM_3')
encoder_output3, state_h3, state_c3 = encoder_lstm3(encoder_output2)

# # Concatenate the forward and backward hidden and cell states
# state_h3 = Concatenate()([forw_state_h3, back_state_h3])
# state_c3 = Concatenate()([forw_state_c3, back_state_c3])

# Set up the encoder final states which will be used to initialize the decoder
encoder_final_states = [state_h3, state_c3]

## DECODER ##

decoder_inputs = Input(shape=(None,), name='Decoder_Input')

# Embedding layer
decoder_embedding = Embedding(y_voc_size, CONSTANTS["embedding_dim"], trainable=True, name='Title_Embedding')(decoder_inputs)

# Decoder LSTM 1
decoder_lstm = LSTM(CONSTANTS["latent_dim"], return_sequences=True, return_state=True, dropout=CONSTANTS["dropout"], name='Decoder_LSTM')
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_final_states)

## ATTENTION LAYER ##

bahdanau_attention = AttentionLayer(name='Bahdanau_Attention')
context_vectors, _ = bahdanau_attention([encoder_output3, decoder_output])

# Concatenate the Attention and Decoder LSTM output to feed it into the Dense Softmax Layer
decoder_concat_input = Concatenate(name='Concatenate_Layer', axis=-1)([decoder_output, context_vectors])

# Dense Softmax Layer
decoder_dense =  TimeDistributed(Dense(y_voc_size, activation='softmax', name='Softmax_Layer'), name='Softmax_Layer')
decoder_output = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_output)

model.summary()

Visualizing our model

In [None]:
from tensorflow.keras.utils import plot_model
# Render the architecture (with tensor shapes and layer names) to a PNG file.
plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
    to_file='nepali_news_headline_generation_model.png',
)

### Model Compilation

In [None]:
# Sparse categorical cross-entropy: the targets are integer word ids.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Training using Early Stopping

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop when the validation loss has not improved for `patience` epochs.
CONSTANTS["patience"] = 5
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=CONSTANTS["patience"],
    verbose=1,
)

No early stopping

In [None]:
# Train on (news, shifted title) -> next title word, validating after each epoch.
training_inputs = [X_train_pad_seq, y_train_input]
validation_inputs = [X_val_pad_seq, y_val_input]

history = model.fit(
    training_inputs,
    y_train_target,
    validation_data=(validation_inputs, y_val_target),
    epochs=CONSTANTS["epochs"],
    batch_size=CONSTANTS["batch_size"],
)

With Early Stopping

history = model.fit([X_train_pad_seq, y_train_input], y_train_target, epochs=CONSTANTS["epochs"], callbacks=[es], batch_size=CONSTANTS["batch_size"], validation_data=([X_val_pad_seq, y_val_input], y_val_target))

In [None]:
import pickle

# Store the per-epoch metrics so they can be inspected without retraining.
history_path = '/kaggle/working/train_history.pkl'
with open(history_path, 'wb') as history_file:
    pickle.dump(history.history, history_file)

In [None]:
# Loss and accuracy on the held-out test split.
test_inputs = [X_test_pad_seq, y_test_input]
model.evaluate(test_inputs, y_test_target)

Finally, we save our model and constant variables for future usage

In [None]:
# Serialise the CONSTANTS dictionary to a JSON file.
constants_path = "/kaggle/working/constants.json"
with open(constants_path, "w") as const:
    const.write(json.dumps(CONSTANTS))

In [None]:
# Write the trained model to disk under this name.
model.save(filepath='Nepali_News_Headline_Gen_Model')

In [None]:
# Bundle everything under /kaggle/working into one archive (-r recurses into sub-directories).
!zip -r model_ouput.zip /kaggle/working