In [1]:
import os
import re
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Embedding, Dense, Attention
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split


In [21]:
# Load and preprocess data
# Location of the tokenized CNN ``.story`` files. Set the CNN_STORIES_DIR
# environment variable to point at a different copy of the dataset; when it is
# unset the original local path is used, so existing runs behave as before.
data_dir = os.environ.get(
    "CNN_STORIES_DIR",
    r"C:\Users\HP\Suvidha ML internship\cnn_stories_tokenized",
)
def load_data(data_dir):
    """Read every ``.story`` file in ``data_dir`` into parallel text/summary lists.

    Each file is split on the literal ``@highlight`` delimiter. The part before
    the first delimiter is kept verbatim as the article text; every later part
    is stripped of surrounding whitespace and the parts are joined with
    newlines to form a single summary string.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, which would make any seeded shuffle or split of the
    result differ from one machine to the next.

    Args:
        data_dir: Path of the directory that contains the ``.story`` files.

    Returns:
        A ``(texts, summaries)`` tuple of two equally long lists of strings.
        A file without any ``@highlight`` marker contributes an empty summary.
    """
    texts = []
    summaries = []
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.story'):
            continue
        # Read inside the context manager, parse after the handle is closed.
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
            content = f.read()
        # First chunk is the article body; each remaining chunk is one highlight.
        text, *highlights = content.split('@highlight')
        texts.append(text)
        summaries.append('\n'.join(h.strip() for h in highlights))
    return texts, summaries

def preprocess_data(texts, summaries):
    """Lowercase every document and break it into word tokens.

    A token is a maximal run of word characters (letters, digits, underscore);
    punctuation and whitespace act as separators and are discarded.

    Args:
        texts: Iterable of raw article strings.
        summaries: Iterable of raw summary strings.

    Returns:
        A ``(processed_texts, processed_summaries)`` tuple; each element is a
        list holding one list of lowercase tokens per input string.
    """
    def tokenize(document):
        return re.findall(r'\w+', document.lower())

    processed_texts = list(map(tokenize, texts))
    processed_summaries = list(map(tokenize, summaries))
    return processed_texts, processed_summaries

def create_vocabulary(texts):
    """Collect the distinct tokens that occur anywhere in ``texts``.

    Args:
        texts: Iterable of tokenized documents (each an iterable of tokens).

    Returns:
        A set containing every unique token found across all documents.
    """
    return {token for document in texts for token in document}


def split_data(texts, summaries, test_size=0.2, val_size=0.2, random_state=42):
    """
    Splits preprocessed text and summary data into training, validation, and test sets.

    The test set is carved off first; the validation set is then taken from
    what remains, so ``val_size`` is a fraction of the non-test data, not of
    the whole dataset. With the defaults this gives roughly
    64% train / 16% validation / 20% test.

    Args:
        texts: A list of preprocessed text documents.
        summaries: A list of corresponding preprocessed summaries.
        test_size: Share of the full dataset held out for testing
            (forwarded to ``train_test_split``).
        val_size: Share of the remaining (non-test) data held out for
            validation (forwarded to ``train_test_split``).
        random_state: Seed used for both splits so results are repeatable.

    Returns:
        A tuple containing six lists:
        - train_texts: Preprocessed texts for training.
        - train_summaries: Preprocessed summaries for training.
        - val_texts: Preprocessed texts for validation.
        - val_summaries: Preprocessed summaries for validation.
        - test_texts: Preprocessed texts for testing.
        - test_summaries: Preprocessed summaries for testing.
    """
    # First split: hold out the test set.
    remaining_texts, test_texts, remaining_summaries, test_summaries = train_test_split(
        texts, summaries, test_size=test_size, random_state=random_state
    )

    # Second split: divide what is left into training and validation sets.
    train_texts, val_texts, train_summaries, val_summaries = train_test_split(
        remaining_texts, remaining_summaries, test_size=val_size, random_state=random_state
    )

    return train_texts, train_summaries, val_texts, val_summaries, test_texts, test_summaries





In [3]:

# Read every .story file under data_dir into parallel lists of article texts and summaries.
texts, summaries = load_data(data_dir)

In [4]:
# Number of summaries loaded (load_data appends one summary per article text).
len(summaries)

92579

In [5]:
# Number of article texts loaded.
len(texts)

92579

In [6]:

# Lowercase and tokenize the raw articles and summaries into lists of word tokens.
texts_preprocessed, summaries_preprocessed = preprocess_data(texts, summaries)

In [7]:
# Sanity check: preprocessing yields one token list per loaded article.
len(texts_preprocessed)

92579

In [8]:


# Create vocabulary
# NOTE(review): built from the article texts only, and from the full dataset
# rather than just a training subset — words that occur solely in summaries
# are not included.
vocab = create_vocabulary(texts_preprocessed)


In [20]:
# Show the vocabulary size and confirm the container type.
print(len(vocab))
type(vocab)

236890


set

In [10]:

# Split the tokenized texts and summaries into train / validation / test sets.
train_texts, train_summaries, val_texts, val_summaries, test_texts, test_summaries = split_data(texts_preprocessed, summaries_preprocessed)


In [22]:
# Size the embedding and output layers from the vocabulary itself instead of a
# hard-coded count, so the model cannot drift out of step with the data.
vocab_size = len(vocab)
embedding_dim = 100

# Define input layers: variable-length sequences of token ids.
encoder_inputs = tf.keras.Input(shape=(None,))
decoder_inputs = tf.keras.Input(shape=(None,))

# Encoder
encoder_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)(encoder_inputs)
# return_sequences=True keeps one output vector per source position. Without
# it the LSTM emits only its last output, a 2-D (batch, units) tensor, and the
# attention layer below has no source-position axis to attend over.
encoder_lstm = tf.keras.layers.LSTM(units=256, return_sequences=True, return_state=True)(encoder_embedding)
encoder_outputs, state_h, state_c = encoder_lstm
encoder_states = [state_h, state_c]

# Decoder, initialised with the encoder's final hidden and cell states.
decoder_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(units=256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)


# Attention: decoder steps are the queries, encoder positions the values.
attention = tf.keras.layers.Attention()([decoder_outputs, encoder_outputs])

# Concatenate attention with decoder output
decoder_concat_input = tf.keras.layers.Concatenate()([decoder_outputs, attention])

# Project to one unnormalised score (logit) per vocabulary entry.
decoder_dense = tf.keras.layers.Dense(units=vocab_size)
decoder_outputs = decoder_dense(decoder_concat_input)


# Pointer-generator mechanism
# NOTE(review): this is not a full pointer-generator — there is no generation
# gate and no copy distribution scattered over the source token ids.
# vocab_dist_pg is the softmax-normalised view of the logits. It is kept for
# inspection but is no longer fed into the model output (see final_dist).
vocab_dist_pg = tf.keras.layers.Activation('softmax', name='vocab_dist')(decoder_outputs)
# NOTE(review): this softmax runs over the feature axis of the attention
# context vector, not over source positions, and is not connected to the
# model output.
attn_dist_pg = tf.keras.layers.Activation('softmax', name='attn_dist')(attention)
attention_projection = tf.keras.layers.Dense(units=vocab_size)(attention)
# Sum the two branches in logit space. The notebook compiles the model with a
# from_logits=True loss, which applies softmax itself; adding already
# softmax-ed probabilities (bounded to [0, 1]) to an unbounded projection would
# squash the decoder branch and normalise it twice. Despite its name,
# final_dist therefore holds unnormalised scores.
final_dist = tf.keras.layers.Add()([decoder_outputs, attention_projection])


# Build the model: (encoder token ids, decoder token ids) -> per-step vocabulary logits.
model = tf.keras.Model([encoder_inputs, decoder_inputs], final_dist)



In [13]:
# Convert set to dictionary (word -> integer id).
# Sort first: the iteration order of a set of strings changes between
# interpreter runs because of hash randomisation, which would silently assign
# different ids after every kernel restart and invalidate any saved weights or
# encoded data.
vocab_dict = {word: i for i, word in enumerate(sorted(vocab))}


In [14]:
# Map every training token to its integer id from vocab_dict.
# Tokens missing from the vocabulary fall back to id 1.
# NOTE(review): vocab_dict numbers words from 0, so id 1 also belongs to a real
# word — unknown tokens are indistinguishable from that word.
train_texts_int = [
    [vocab_dict.get(word, 1) for word in document]
    for document in train_texts
]
train_summaries_int = [
    [vocab_dict.get(word, 1) for word in highlight_tokens]
    for highlight_tokens in train_summaries
]

In [23]:
# Define optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)  # Adam with learning rate 1e-3
# from_logits=True makes the loss apply softmax itself, so it expects the model
# to output raw, unnormalised scores.
# NOTE(review): confirm the model's final output is not already softmax-normalised.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn)

In [24]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, None, 100)            2368900   ['input_3[0][0]']             
                                                          0                                       
                                                                                                  
 embedding_3 (Embedding)     (None, None, 100)            2368900   ['input_4[0][0]']       