# Text Summarization

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
# from wordcloud import WordCloud, STOPWORDS

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from attention import AttentionLayer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Load at most the first 100,000 rows of the reviews CSV
# (presumably to keep memory use and preprocessing time manageable).
df = pd.read_csv('Reviews.csv', nrows=100000)
# Preview the first rows; as the last expression it is rendered as the cell output.
df.head()

In [None]:
# Display the (row count, column count) of the loaded frame.
df.shape

### Preprocess

In [None]:
# Keep only the first row for each distinct review text.
df.drop_duplicates(subset=['Text'], inplace=True)
# Drop every row that has a missing value in ANY column (no subset is given),
# so rows are removed even when only a column unrelated to Text/Summary is NaN.
df.dropna(axis=0, inplace=True) 

In [None]:
# Lazily-built cache of the English stop words; filled on first use so the
# NLTK word list is not rebuilt for every single row that gets cleaned.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def remove_stopwords(text):
    """Strip English stop words and very short tokens from ``text``.

    Args:
        text: A string; it is split on whitespace.

    Returns:
        The remaining tokens joined by single spaces. A token is kept only if
        it is not an NLTK English stop word and has at least 3 characters.
        Returns an empty string when nothing survives.
    """
    stop_words = _english_stop_words()
    kept = [
        word
        for word in text.split()
        if word not in stop_words and len(word) >= 3
    ]
    # Tokens produced by split() contain no whitespace, so the joined string
    # needs no further stripping.
    return " ".join(kept)

def clean_text(text):
    """Normalise a raw review or summary string.

    Steps, in order: lower-case, strip HTML markup with BeautifulSoup, apply
    the regex substitutions listed below, then hand the result to
    ``remove_stopwords``.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string as returned by ``remove_stopwords``.
    """
    cleaned = BeautifulSoup(text.lower(), "lxml").text

    # (pattern, replacement) pairs applied sequentially; order matters.
    substitutions = (
        (r'\([^)]*\)', ''),    # parenthesised fragments
        ('"', ''),             # double quotes
        (r"'s\b", ''),         # possessive 's
        ("[^a-zA-Z]", ' '),    # anything that is not an ASCII letter
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return remove_stopwords(cleaned)

In [None]:
# Run the same cleaning routine over both text columns, replacing them in place.
for column in ('Text', 'Summary'):
    df[column] = df[column].apply(clean_text)

In [None]:
# Cleaning can reduce a review or summary to an empty string. Turn empty
# strings into NaN so the dropna below removes those rows.
# Note: the replacement applies to every column of df, not just Text/Summary.
df.replace('', np.nan, inplace=True)
# Drop rows containing NaN in any column.
df.dropna(axis=0, inplace=True)

In [None]:
def _word_count(value):
    """Count whitespace-separated tokens in the string form of ``value``."""
    return len(str(value).split())


# Per-row token counts for reviews and summaries, indexed like df.
len_df = pd.DataFrame({
    'Text Length': df['Text'].apply(_word_count),
    'Summary Length': df['Summary'].apply(_word_count),
})

In [None]:
# Two side-by-side axes: review lengths on the left, summary lengths on the right.
_, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))
# 30-bin histograms of the token counts computed in len_df.
sns.histplot(len_df['Text Length'], label='Count', bins=30, ax=ax1)
sns.histplot(len_df['Summary Length'], label='Count', bins=30, ax=ax2)

In [None]:
# Wrap every summary in start/end sentinel tokens for the decoder.
# NOTE(review): later cells look these up as 'start' and 'end', which relies on
# the Keras Tokenizer defaults (lower-casing and filtering '_') — confirm if
# the tokenizer settings ever change.
df['Summary'] = df['Summary'].apply(lambda x : '_START_ '+ x + ' _END_')

### Tokenizers

In [None]:
# Model input: cleaned review text. Target: sentinel-wrapped summary.
X = df['Text']
y = df['Summary']
# Hold out 20% of the rows for validation/testing; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

In [None]:
# Fixed sequence lengths (in tokens) for reviews and summaries, and the side
# on which shorter sequences are zero-padded.
max_text_len, max_summary_len = 80, 10
padding_type = 'post'

# The review vocabulary is learned from the training split only.
X_tokenizer = Tokenizer()
X_tokenizer.fit_on_texts(X_train)

# Map each review to a list of integer word ids ...
X_train_seq, X_test_seq = (
    X_tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# ... then pad/cut every list to exactly max_text_len ids.
# NOTE(review): no `truncating` argument is passed, so the pad_sequences
# default applies to reviews longer than max_text_len — confirm that is intended.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_text_len, padding=padding_type)
    for sequences in (X_train_seq, X_test_seq)
)

In [None]:
# The summary vocabulary is learned from the training split only.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(y_train)

# Summaries longer than max_summary_len must be cut from the END.
# pad_sequences truncates from the front by default, which would drop the
# leading start token that the decoder input (teacher forcing) and
# decode_sequence both rely on.
y_train_seq = y_tokenizer.texts_to_sequences(y_train)
y_train_padded = pad_sequences(
    y_train_seq,
    maxlen=max_summary_len,
    padding=padding_type,
    truncating='post',
)

y_test_seq = y_tokenizer.texts_to_sequences(y_test)
y_test_padded = pad_sequences(
    y_test_seq,
    maxlen=max_summary_len,
    padding=padding_type,
    truncating='post',
)

In [None]:
# From here on the split names refer to the padded integer arrays rather than
# the raw text Series.
X_train, y_train, X_test, y_test = (
    X_train_padded,
    y_train_padded,
    X_test_padded,
    y_test_padded,
)

### Training

In [None]:
# One width shared by both embedding layers and by every LSTM below.
embedding_dim = 500 
# Vocabulary sizes; +1 because index 0 is not assigned to any word (the helper
# functions further down treat 0 as padding).
x_voc_size = len(X_tokenizer.word_index) + 1
y_voc_size = len(y_tokenizer.word_index) +1

# Encoder: embedding followed by three stacked LSTMs over the padded review.
encoder_inputs = Input(shape=(max_text_len,)) 
enc_emb = Embedding(x_voc_size, embedding_dim, trainable=True)(encoder_inputs) 

# Each encoder LSTM returns its full output sequence (fed to the next layer)
# plus its final states.
encoder_lstm1 = LSTM(embedding_dim, return_sequences=True, return_state=True) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb) 

encoder_lstm2 = LSTM(embedding_dim, return_sequences=True, return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1) 

# Only the third layer's sequence output and states (state_h, state_c) are
# used further down: the sequence by attention, the states by the decoder.
encoder_lstm3=LSTM(embedding_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2) 

# Decoder: variable-length input (shape=(None,)) so the same layers can be
# reused for one-token-at-a-time inference.
decoder_inputs = Input(shape=(None,)) 
dec_emb_layer = Embedding(y_voc_size, embedding_dim,trainable=True) 
dec_emb = dec_emb_layer(decoder_inputs) 

# The decoder LSTM starts from the final encoder states.
# NOTE: the two extra return values are the LSTM's returned states; despite
# the fwd/back names they are not referenced again in the code shown here.
decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True) 
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h, state_c]) 

# Attention layer from the local `attention` module, called with the encoder
# and decoder output sequences; it returns two values.
# NOTE(review): exact semantics of attn_out / attn_states depend on that module — confirm.
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) 

# Concatenate decoder outputs with the attention output on the last axis, then
# apply a per-timestep softmax over the summary vocabulary.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
decoder_dense = TimeDistributed(Dense(y_voc_size, activation='softmax')) 
# decoder_outputs is rebound here: from now on it is the softmax output.
decoder_outputs = decoder_dense(decoder_concat_input) 

# Training model: (review ids, decoder input ids) -> per-step word distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
model.summary()

In [None]:
# The sparse loss variant is used because the targets passed to fit() are
# integer word ids rather than one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [None]:
# Stop training once the validation loss has failed to decrease (mode='min')
# for 2 epochs (patience=2); verbose=1 reports when that happens.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [None]:
def _teacher_forcing_pair(summaries):
    """Split padded summaries into (decoder input, decoder target).

    The input is every position except the last; the target is every position
    except the first, with a trailing axis of size 1 added by the reshape.
    """
    decoder_input = summaries[:, :-1]
    decoder_target = summaries.reshape(summaries.shape[0], summaries.shape[1], 1)[:, 1:]
    return decoder_input, decoder_target


train_decoder_input, train_decoder_target = _teacher_forcing_pair(y_train)
val_decoder_input, val_decoder_target = _teacher_forcing_pair(y_test)

# Train for up to 30 epochs; the early-stopping callback may end the run sooner.
history = model.fit(
    [X_train, train_decoder_input],
    train_decoder_target,
    validation_data=([X_test, val_decoder_input], val_decoder_target),
    epochs=30,
    callbacks=[es],
    batch_size=512,
)


### Evaluation

In [None]:
# Draw the training curve first, then the validation curve, so the legend
# entries below line up with them.
for metric_name in ('loss', 'val_loss'):
    plt.plot(history.history[metric_name])

plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()

In [None]:
# Lookup tables used when turning id sequences back into text:
# id -> word for reviews and summaries, and word -> id for summaries
# (needed to find the ids of the 'start' and 'end' sentinels).
X_index_word = X_tokenizer.index_word 
y_index_word = y_tokenizer.index_word 
y_word_index = y_tokenizer.word_index

In [None]:
# Inference encoder: review ids -> (encoder output sequence, final h, final c).
# It reuses the tensors of the training graph.
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# Inference decoder inputs: the LSTM states carried over from the previous
# step and the encoder output sequence that the attention layer reads.
decoder_state_input_h = Input(shape=(embedding_dim,))
decoder_state_input_c = Input(shape=(embedding_dim,))
decoder_hidden_state_input = Input(shape=(max_text_len,embedding_dim))
# Reuse the decoder layers defined for the training model.
dec_emb2= dec_emb_layer(decoder_inputs)
# NOTE: state_h2 / state_c2 are rebound here; earlier they held the second
# encoder LSTM's states, from now on they are the decoder's updated states.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])
# Per-step softmax over the summary vocabulary, same dense layer as in training.
decoder_outputs2 = decoder_dense(decoder_inf_concat)
# Inference decoder: (token ids, encoder outputs, h, c) -> (word distribution, new h, new c).
decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2] + [state_h2, state_c2])

In [None]:
def seq2summary(input_seq):
    """Turn a sequence of summary word ids back into text.

    Padding (id 0) and the 'start' / 'end' sentinel ids are skipped.

    Args:
        input_seq: Iterable of integer ids from the summary tokenizer.

    Returns:
        The decoded words, each followed by a single space (so a non-empty
        result ends with a trailing space).
    """
    pieces = []
    for token_id in input_seq:
        if (token_id == 0
                or token_id == y_word_index['start']
                or token_id == y_word_index['end']):
            continue
        pieces.append(y_index_word[token_id] + ' ')
    return ''.join(pieces)

def seq2text(input_seq):
    """Turn a sequence of review word ids back into text.

    Args:
        input_seq: Iterable of integer ids from the review tokenizer;
            id 0 (padding) is skipped.

    Returns:
        The decoded words, each followed by a single space.
    """
    return ''.join(
        X_index_word[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

In [None]:
def decode_sequence(input_seq):
    """Greedily generate a summary for one encoded review.

    The review is run through ``encoder_model`` once. ``decoder_model`` is
    then called one token at a time, starting from the 'start' token and
    always feeding back the highest-scoring word together with the updated
    LSTM states.

    Decoding stops when the 'end' token is produced, when the padding index
    (0, which has no entry in ``y_index_word``) is produced, or once
    ``max_summary_len - 1`` words have been generated.

    Args:
        input_seq: Array of review word ids accepted by ``encoder_model``;
            the caller in this notebook passes shape ``(1, max_text_len)``.

    Returns:
        The generated words, each preceded by a single space (so a non-empty
        result starts with a leading space); an empty string if the first
        prediction already ends the summary.
    """
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Seed the decoder with the 'start' token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = y_word_index['start']

    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is padding and is absent from y_index_word; .get() avoids a
        # KeyError and lets a padding prediction end the summary.
        sampled_token = y_index_word.get(sampled_token_index)
        if sampled_token is None or sampled_token == 'end':
            break

        decoded_words.append(sampled_token)
        if len(decoded_words) >= (max_summary_len - 1):
            break

        # Feed the chosen word and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + word for word in decoded_words)

In [None]:
# Show the first 10 held-out examples: the review, its reference summary and
# the model's prediction. All three must come from the SAME test sample, so
# the prediction is made on X_test[i] (not X_train[i]).
for i in range(10):
    review_ids = X_test[i]
    print("Review:", seq2text(review_ids))
    print("Original summary:", seq2summary(y_test[i]))
    print("Predicted summary:", decode_sequence(review_ids.reshape(1, max_text_len)))
    print("\n")

In [None]:

'''
Inspiration
1. https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/
'''