In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
data = []
with open('gigaword/train.jsonl','r') as file:
    for line in file:
        data.append(json.loads(line))

In [3]:
data = pd.DataFrame(data)
data

Unnamed: 0,id,text,summary
0,gigaword-train-0,australia 's current account deficit shrunk by...,australian current account deficit narrows sha...
1,gigaword-train-1,at least two people were killed in a suspected...,at least two dead in southern philippines blast
2,gigaword-train-2,australian shares closed down #.# percent mond...,australian stocks close down #.# percent
3,gigaword-train-3,south korea 's nuclear envoy kim sook urged no...,envoy urges north korea to restart nuclear dis...
4,gigaword-train-4,south korea on monday announced sweeping tax r...,skorea announces tax cuts to stimulate economy
...,...,...,...
999995,gigaword-train-999995,after proclaiming a special relationship with ...,indian leader vajpayee to meet with bush to di...
999996,gigaword-train-999996,a group of people expelled by the british from...,former residents of indian ocean island demand...
999997,gigaword-train-999997,a mix of profit-taking and cautiousness guided...,stocks lower in early trading
999998,gigaword-train-999998,"hungary 's air carrier , malev , has grounded ...",hungarian air carrier grounds flights to bosnia


In [4]:
data['text']

0         australia 's current account deficit shrunk by...
1         at least two people were killed in a suspected...
2         australian shares closed down #.# percent mond...
3         south korea 's nuclear envoy kim sook urged no...
4         south korea on monday announced sweeping tax r...
                                ...                        
999995    after proclaiming a special relationship with ...
999996    a group of people expelled by the british from...
999997    a mix of profit-taking and cautiousness guided...
999998    hungary 's air carrier , malev , has grounded ...
999999    a ##-year-old-girl who struck prince charles i...
Name: text, Length: 1000000, dtype: object

In [5]:
data = data.applymap(lambda x: str(x).replace('#', ''))

In [6]:
data['text'][0]

"australia 's current account deficit shrunk by a record . billion dollars -lrb- . billion us -rrb- in the june quarter due to soaring commodity prices , figures released monday showed ."

In [7]:
data.head()

Unnamed: 0,id,text,summary
0,gigaword-train-0,australia 's current account deficit shrunk by...,australian current account deficit narrows sha...
1,gigaword-train-1,at least two people were killed in a suspected...,at least two dead in southern philippines blast
2,gigaword-train-2,australian shares closed down . percent monday...,australian stocks close down . percent
3,gigaword-train-3,south korea 's nuclear envoy kim sook urged no...,envoy urges north korea to restart nuclear dis...
4,gigaword-train-4,south korea on monday announced sweeping tax r...,skorea announces tax cuts to stimulate economy


In [8]:
texts = data['text'].tolist()[:1000]
summaries = data['summary'].tolist()[:1000]

In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [10]:
tokenizer_texts = Tokenizer()

Updates internal vocabulary based on a list of texts.

        In the case where texts contains lists,
        we assume each entry of the lists to be a token.

In [11]:
tokenizer_texts.fit_on_texts(texts)

Transforms each text in texts to a sequence of integers.

In [12]:
text_sequences = tokenizer_texts.texts_to_sequences(texts)

In [13]:
len(tokenizer_texts.word_index)

4176

In [14]:
print(text_sequences[0])

[813, 9, 998, 2394, 591, 2395, 15, 2, 999, 123, 49, 191, 123, 14, 192, 5, 1, 592, 686, 265, 4, 2396, 687, 31, 1000, 542, 75, 336]


In [15]:
max_text_len = max([len(seq) for seq in text_sequences])

In [16]:
max_text_len

49

In [17]:
padded_text_sequences = pad_sequences(text_sequences, maxlen=max_text_len, padding='post')

In [18]:
padded_text_sequences

array([[ 813,    9,  998, ...,    0,    0,    0],
       [  21,  131,   41, ...,    0,    0,    0],
       [ 266,   87,   74, ...,    0,    0,    0],
       ...,
       [ 690,   32, 4158, ...,    0,    0,    0],
       [  86,   85,   47, ...,    0,    0,    0],
       [  29,    9,    1, ...,    0,    0,    0]])

In [19]:
tokenizer_summary = Tokenizer()
tokenizer_summary.fit_on_texts(summaries)
summary_sequences = tokenizer_summary.texts_to_sequences(summaries)
max_summary_len = max([len(seq) for seq in summary_sequences])

In [20]:
padded_summary_sequences = pad_sequences(summary_sequences, maxlen=max_summary_len, padding='post')

In [21]:
X_train, X_val, y_train, y_val = train_test_split(padded_text_sequences, padded_summary_sequences, test_size=0.2, random_state=42)

In [22]:
X_train.shape

(800, 49)

In [23]:
latent_dim = 256

In [24]:
encoder_inputs = Input(shape=(max_text_len, ), dtype='int32',)

In [25]:
encoder_LSTM = LSTM(latent_dim, return_state=True)

In [26]:
import keras

In [27]:
embedding_layer = keras.layers.Embedding(
    input_dim = 78852,
    output_dim = 50,
    embeddings_initializer="uniform")

In [28]:
encoder_embedding = embedding_layer(encoder_inputs)

In [29]:
encoder_embedding

<KerasTensor: shape=(None, 49, 50) dtype=float32 (created by layer 'embedding')>

In [30]:
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)

In [31]:
decoder_inputs = Input(shape=(max_summary_len, ), dtype='int32',)

In [32]:
decoder_embedding = embedding_layer(decoder_inputs)

In [33]:
decoder_embedding

<KerasTensor: shape=(None, 13, 50) dtype=float32 (created by layer 'embedding')>

In [34]:
decoder_LSTM = LSTM(latent_dim, return_state=True, return_sequences=True)

In [35]:
decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])

In [36]:
decoder_dense = Dense(len(tokenizer_summary.word_index) + 1, activation='softmax')

In [37]:
decoder_outputs = decoder_dense(decoder_outputs)

In [38]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [39]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 13)]                 0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 49)]                 0         []                            
                                                                                                  
 embedding (Embedding)       multiple                     3942600   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 lstm (LSTM)                 [(None, 256),                314368    ['embedding[0][0]']       

In [40]:
encoder_input_data = np.zeros((len(text_sequences), max_text_len, len(tokenizer_texts.word_index) + 1), dtype='float32')

In [41]:
for i, seq in enumerate(text_sequences):
    for j, word_idx in enumerate(seq):
        encoder_input_data[i, j, word_idx] = 1.0

In [43]:
encoder_input_data.shape

(1000, 49, 4177)

In [44]:
decoder_input_data = np.zeros((len(summary_sequences), max_summary_len, len(tokenizer_summary.word_index) + 1), dtype='float32')
decoder_target_data = np.zeros((len(summary_sequences), max_summary_len, len(tokenizer_summary.word_index) + 1), dtype='float32')

In [45]:
for i, seq in enumerate(summary_sequences):
    for j, word_idx in enumerate(seq):
        decoder_input_data[i, j, word_idx] = 1.0
        if j > 0:
            decoder_target_data[i, j - 1, word_idx] = 1.0

In [47]:
decoder_input_data.shape

(1000, 13, 2025)

In [49]:
decoder_target_data.shape

(1000, 13, 2025)

In [51]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [54]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 13)]                 0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 49)]                 0         []                            
                                                                                                  
 embedding (Embedding)       multiple                     3942600   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 lstm (LSTM)                 [(None, 256),                314368    ['embedding[0][0]']       

In [59]:
# Prepare encoder_input_data
encoder_input_data = np.zeros((len(text_sequences), max_text_len, len(tokenizer_texts.word_index) + 1), dtype='float32')
for i, seq in enumerate(text_sequences):
    for j, word_idx in enumerate(seq):
        encoder_input_data[i, j, word_idx - 1] = 1.0  # Adjusting index by -1 because tokenizer index starts from 1

# Prepare decoder_input_data and decoder_target_data
decoder_input_data = np.zeros((len(summary_sequences), max_summary_len, len(tokenizer_summary.word_index) + 1), dtype='float32')
decoder_target_data = np.zeros((len(summary_sequences), max_summary_len, len(tokenizer_summary.word_index) + 1), dtype='float32')
for i, seq in enumerate(summary_sequences):
    for j, word_idx in enumerate(seq):
        decoder_input_data[i, j, word_idx - 1] = 1.0  # Adjusting index by -1 because tokenizer index starts from 1
        if j > 0:
            decoder_target_data[i, j - 1, word_idx - 1] = 1.0  # Adjusting index by -1 because tokenizer index starts from 1

# Define RNN model
latent_dim = 256

# Define encoder model
encoder_inputs = Input(shape=(max_text_len, len(tokenizer_texts.word_index) + 1))
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Define decoder model
decoder_inputs = Input(shape=(max_summary_len, len(tokenizer_summary.word_index) + 1))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(len(tokenizer_summary.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=64,
          epochs=50,
          validation_split=0.2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x2607b4ae0d0>