In [5]:
# Mount Google Drive at /content/drive; the file paths used by later cells
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!pip install python-docx tensorflow nltk



In [7]:
import os
from docx import Document
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
import nltk
from nltk.tokenize import word_tokenize
# Fetch NLTK's 'punkt' tokenizer data (a no-op when it is already up to date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
# Location of the preprocessed corpus (.docx) on the mounted Google Drive.
file_path = (
    '/content/drive/My Drive/AA_preprocessed_docs/'
    'preprocessed_data_summary.docx'
)
def read_docx(file_path):
    """
    Collect the text of every non-blank paragraph of a .docx file.

    Args:
        file_path: Path of the .docx document to open.

    Returns:
        A list with one string per paragraph, in the order the document
        yields them. Paragraphs that are empty or contain only whitespace
        are left out; kept paragraphs are returned unstripped.
    """
    paragraphs = []
    for para in Document(file_path).paragraphs:
        content = para.text
        # A string that is empty after stripping is falsy, i.e. a blank paragraph.
        if content.strip():
            paragraphs.append(content)
    return paragraphs
# Load all non-blank paragraphs of the document into a list of strings.
data = read_docx(file_path)

# Preview just the first few entries to keep the cell output short.
preview_count = 5
print("Loaded Data: ", data[:preview_count])

Loaded Data:  ['Preprocessed Data Summary', 'Processed file: wiki_09.txt', 'abydos, egypt\n\nabydos ( or ; sahidic \') is one of the oldest cities of ancient egypt, and also of the eighth nome in upper egypt. it is located about west of the nile at latitude 26° 10\' n, near the modern egyptian towns of el araba el madfuna and el balyana. in the ancient egyptian language, the city was called abedju"\' ("ꜣbḏw" or "abdw")(arabic abdu عبد-و).\nthe english name "abydos" comes from the greek , a name borrowed by greek geographers from the unrelated city of abydos on the hellespont.', 'Processed file: wiki_07.txt', 'angle\n\nin euclidean geometry, an angle is the figure formed by two rays, called the "sides" of the angle, sharing a common endpoint, called the "vertex" of the angle.\nangles formed by two rays are also known as plane angles as they lie in the plane that contains the rays. angles are also formed by the intersection of two planes; these are called "dihedral angles". two intersect

In [12]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK tokenizer resources; 'punkt_tab' is requested in addition to
# 'punkt' so it is present if the installed NLTK version is missing it.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Lower-case every loaded paragraph and split it into word tokens.
tokenized_data = list(map(word_tokenize, (entry.lower() for entry in data)))
print("First tokenized summary: ", tokenized_data[0])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


First tokenized summary:  ['preprocessed', 'data', 'summary']


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Learn the word -> integer-id vocabulary from the loaded paragraphs and
# encode each paragraph as a list of ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

# One more than the number of known words: the Keras Tokenizer never assigns
# id 0, which is the value pad_sequences fills with by default.
vocab_size = 1 + len(tokenizer.word_index)
# Length of the longest encoded paragraph; every row is padded up to it.
max_seq_length = max(map(len, sequences))

# padding='post' appends the fill values after the real tokens.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_seq_length)

print("Vocabulary Size:", vocab_size)
print("Max Sequence Length:", max_seq_length)


Vocabulary Size: 2844
Max Sequence Length: 93


In [14]:
# Hyperparameters: width of each token embedding and number of LSTM units.
embedding_dim = 128
latent_dim = 256
# --- Encoder: embeds the input ids and keeps only the LSTM's final states. ---
encoder_inputs = Input(shape=(max_seq_length,))
# mask_zero=True makes id 0 a padding marker; this relies on the sequences
# being zero-padded (pad_sequences is called without a custom fill value).
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
# The LSTM output itself is discarded; only the two returned states are used.
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]
# --- Decoder: a second embedding + LSTM, initialised from the encoder states. ---
decoder_inputs = Input(shape=(max_seq_length,))
# NOTE: decoder_embedding holds the layer's output tensor; the Embedding layer
# object itself is not kept in a variable (separate instance from the encoder's).
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# return_sequences=True yields one output per timestep; the decoder's own
# returned states are ignored here.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Softmax over vocab_size classes, applied to the decoder LSTM output.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [15]:
# Training model: takes [encoder ids, decoder ids] and outputs the softmax
# produced by decoder_dense.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# The sparse loss variant is used because the targets are integer token ids
# rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

None


In [16]:
# Render the model's layer graph, including tensor shapes, to a PNG on Drive.
model_architecture_path = (
    '/content/drive/My Drive/AA_preprocessed_docs/'
    'model_architecture_summary.png'
)
tf.keras.utils.plot_model(
    model,
    to_file=model_architecture_path,
    show_shapes=True,
)
print(f"Model architecture saved at {model_architecture_path}")

Model architecture saved at /content/drive/My Drive/AA_preprocessed_docs/model_architecture_summary.png


In [17]:
import numpy as np

# The decoder is fed a copy of the same padded id matrix as the encoder.
decoder_input_data = np.array(padded_sequences)

# The target is that matrix shifted one step to the left: drop the first
# column, then append one column of zeros so the width is max_seq_length again.
decoder_target_data = np.pad(padded_sequences[:, 1:], ((0, 0), (0, 1)))


In [18]:
# The encoder receives the padded ids; the decoder receives decoder_input_data.
training_inputs = [padded_sequences, decoder_input_data]
# Adding an extra dimension for sparse_categorical_crossentropy
training_targets = np.expand_dims(decoder_target_data, axis=-1)

# NOTE(review): in the recorded run val_loss rises again after epoch 3 while
# the training loss keeps falling — consider early stopping or more data.
history = model.fit(
    training_inputs,
    training_targets,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

# NOTE(review): the .h5 suffix presumably selects Keras's legacy HDF5 format;
# confirm what downstream loaders expect before switching to ".keras".
model_save_path = (
    '/content/drive/My Drive/AA_preprocessed_docs/'
    'text_summarization_model.h5'
)
model.save(model_save_path)
print(f"Model saved at {model_save_path}")

Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1s/step - accuracy: 0.0250 - loss: 7.9499 - val_accuracy: 0.0616 - val_loss: 7.9225
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2s/step - accuracy: 0.0553 - loss: 7.8071 - val_accuracy: 0.0498 - val_loss: 7.3758
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - accuracy: 0.0435 - loss: 7.0651 - val_accuracy: 0.0393 - val_loss: 7.0675
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1s/step - accuracy: 0.0362 - loss: 6.5516 - val_accuracy: 0.0393 - val_loss: 7.3565
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1s/step - accuracy: 0.0305 - loss: 6.5963 - val_accuracy: 0.0391 - val_loss: 7.5678
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.0328 - loss: 6.4369 - val_accuracy: 0.0441 - val_loss: 7.6390
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



Model saved at /content/drive/My Drive/AA_preprocessed_docs/text_summarization_model.h5
