In [None]:
! pip install datasets tensorflow



In [None]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense
from datasets import load_dataset

In [None]:
# CNN/DailyMail news summarisation corpus, configuration "3.0.0".
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def preprocess(example):
    """Return the record's article and highlights with outer whitespace removed.

    Args:
        example: Mapping holding string values under ``"article"`` and
            ``"highlights"``.

    Returns:
        A new dict with exactly those two keys, each mapped to the stripped text.
    """
    return {field: example[field].strip() for field in ("article", "highlights")}

In [None]:
# Run the whitespace clean-up over the two splits used below (train first, then test).
train_data, test_data = (
    dataset[split].map(preprocess) for split in ("train", "test")
)

In [None]:
# Display the training split's summary (feature names and row count).
train_data

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})

In [None]:
# Display the test split's summary (feature names and row count).
test_data

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 11490
})

In [None]:
# One shared vocabulary for articles and summaries. filters='' turns off the
# tokenizer's default character filtering, so punctuation stays part of the tokens.
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
for column in ('article', 'highlights'):
    tokenizer.fit_on_texts(train_data[column])

In [None]:
# Longest article / summary in the training split, counted in whitespace-separated tokens.
max_article_length = max(map(len, map(str.split, train_data['article'])))
max_summary_length = max(map(len, map(str.split, train_data['highlights'])))

In [None]:
# Longest training summary, in whitespace-separated tokens.
max_summary_length

1296

In [None]:
# Longest training article, in whitespace-separated tokens.
max_article_length

2347

In [None]:
# Fixed sequence widths for the model, deliberately overriding the measured
# maxima (2347 / 1296 tokens) with much smaller caps.
max_article_length = 500
max_summary_length = 200

# Articles: texts -> token-id lists -> fixed-width array, padded at the end.
train_article_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_data['article']),
    maxlen=max_article_length,
    padding='post',
)

# Summaries: same treatment with the summary width.
train_summary_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_data['highlights']),
    maxlen=max_summary_length,
    padding='post',
)

## Encoder and Decoder Using LSTM

In [None]:
embedding_dim = 64
hidden_units = 128

# Every word index plus one extra slot for id 0, which is used as padding.
vocab_size = len(tokenizer.word_index) + 1

# --- Encoder: article ids -> embeddings -> final LSTM state -----------------
encoder_inputs = Input(shape=(max_article_length,))
encoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_article_length,
    mask_zero=True,  # id 0 is masked out
)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# Only the final hidden/cell states are handed to the decoder.
encoder_states = [state_h, state_c]

# --- Decoder: shifted summary ids -> embeddings -> LSTM seeded by encoder ---
# One position shorter than the padded summary, matching the shifted input/target pair.
decoder_inputs = Input(shape=(max_summary_length - 1,))
decoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_summary_length - 1,
    mask_zero=True,
)(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding,
    initial_state=encoder_states,
)

# Per-position softmax over the whole vocabulary.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (article ids, decoder input ids) -> per-position vocabulary distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    loss='sparse_categorical_crossentropy',  # targets are integer token ids
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
# NOTE(review): the recorded output shows an encoder input width of 200, while
# max_article_length is set to 500 earlier; presumably the output is from an
# older run. Re-run to confirm.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 200)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 199)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 200, 64)              1343057   ['input_1[0][0]']             
                                                          92                                      
                                                                                                  
 embedding_1 (Embedding)     (None, 199, 64)              1343057   ['input_2[0][0]']         

In [None]:
# Shifted pair from the padded summaries: the decoder input drops the last
# position and the target drops the first, so position t is paired with t + 1.
input_seqs, target_seqs = train_summary_seqs[:, :-1], train_summary_seqs[:, 1:]

In [None]:
def data_generator(data, batch_size):
    """Yield padded ``(inputs, targets)`` batches for the seq2seq model, forever.

    Walks ``data`` in consecutive slices of ``batch_size`` rows and starts over
    once the end is reached, so the generator never finishes on its own; the
    consumer has to bound it (e.g. with ``steps_per_epoch``).

    Uses the notebook-level ``tokenizer``, ``max_article_length`` and
    ``max_summary_length``.

    Args:
        data: Object exposing ``data['article']`` and ``data['highlights']`` as
            equally long, sliceable sequences of texts.
        batch_size: Maximum number of rows per batch. The last slice of a pass
            may be shorter, and filtering can shrink any batch.

    Yields:
        ``([article_ids, decoder_input_ids], decoder_target_ids)``, where the
        decoder input is the padded summary without its last position and the
        target is the padded summary without its first position.

    Raises:
        ValueError: If ``data`` has no rows (raised on the first ``next()``);
            otherwise the endless loop would spin without ever yielding.
    """
    # Look the columns up once. Doing it inside the batch loop repeats the
    # lookup for every batch, which can mean re-reading the entire column each
    # time when column access materialises it.
    articles = data['article']
    highlights = data['highlights']
    num_rows = len(articles)

    if num_rows == 0:
        raise ValueError("data_generator received an empty dataset")

    while True:
        for start in range(0, num_rows, batch_size):
            stop = start + batch_size

            # Keep only rows where both the article and the summary are strings.
            pairs = [
                (article, summary)
                for article, summary in zip(articles[start:stop], highlights[start:stop])
                if isinstance(article, str) and isinstance(summary, str)
            ]
            if not pairs:
                continue

            batch_articles = [article for article, _ in pairs]
            batch_highlights = [summary for _, summary in pairs]

            # Convert text to token-id sequences
            batch_article_seqs = tokenizer.texts_to_sequences(batch_articles)
            batch_summary_seqs = tokenizer.texts_to_sequences(batch_highlights)

            # Pad to the fixed widths the model was built with
            batch_article_seqs = tf.keras.preprocessing.sequence.pad_sequences(
                batch_article_seqs, maxlen=max_article_length, padding='post')
            batch_summary_seqs = tf.keras.preprocessing.sequence.pad_sequences(
                batch_summary_seqs, maxlen=max_summary_length, padding='post')

            # Decoder input = summary[:-1], target = summary[1:]
            yield [batch_article_seqs, batch_summary_seqs[:, :-1]], batch_summary_seqs[:, 1:]


In [None]:
# Training the model
# Train from the streaming generators. Both generators are endless, so the
# step counts (rows // 32) are what delimit an epoch and a validation pass.
model.fit(
    data_generator(train_data, batch_size=32),
    steps_per_epoch=len(train_data) // 32,
    epochs=10,
    validation_data=data_generator(test_data, batch_size=32),
    validation_steps=len(test_data) // 32,
)

In [4]:
# Fresh generator so evaluation starts at the first test row.
test_generator = data_generator(test_data, batch_size=32)

# Score the model over rows // 32 batches of the test split.
evaluation_result = model.evaluate(
    test_generator,
    steps=len(test_data) // 32,
)

# Index 1 is read as the accuracy value (index 0 would be the loss).
accuracy = evaluation_result[1]

print(f"{accuracy}")

.9372349383126


## The model reaches about 93.7% accuracy on the test set

In [None]:
# Persist the trained model to disk.
# NOTE(review): the .h5 suffix selects the legacy HDF5 format; recent Keras
# releases recommend the native .keras format. Confirm what consumers expect.
model.save("text_summarization.h5")