In [None]:
#data cleaning 
import re
import string

def clean_text(text):
    """Normalize text for downstream tokenization.

    Lower-cases ``text`` and removes every character listed in
    ``string.punctuation`` (ASCII punctuation only). Whitespace is left
    untouched.

    Args:
        text: The raw input string.

    Returns:
        The lower-cased string with ASCII punctuation stripped.
    """
    # A translation table that maps each punctuation character to "deleted".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)

def tokenize_text(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: The (already cleaned) string to tokenize.

    Returns:
        A list of whitespace-delimited tokens; empty when ``text`` has no
        non-whitespace characters.
    """
    tokens = text.split()
    return tokens

# Sample document / reference-summary pair used throughout the notebook.
input_text = "This is a sample input text for summarization."
summary_text = "This is a summary of the input text."

# Normalize both texts (lower-case, strip punctuation) ...
clean_input_text, clean_summary_text = (
    clean_text(raw_text) for raw_text in (input_text, summary_text)
)

# ... then split each cleaned text into whitespace-delimited tokens.
tokenized_input_text, tokenized_summary_text = (
    tokenize_text(cleaned) for cleaned in (clean_input_text, clean_summary_text)
)



In [None]:
# Word Sense Disambiguation (WSD)
# For WSD, we can combine a lexical resource such as WordNet with a disambiguation algorithm such as Lesk. Here is example code using NLTK's Lesk implementation for WSD:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

def wsd_sentence(sentence):
    """Replace each word of ``sentence`` with the name of its Lesk-chosen synset.

    The sentence is tokenized with ``word_tokenize`` and every token is
    disambiguated with ``lesk`` using the full token list as context.
    Tokens for which ``lesk`` returns ``None`` are dropped from the result.

    Args:
        sentence: The raw sentence to disambiguate.

    Returns:
        A single string of synset names separated by spaces.
    """
    words = word_tokenize(sentence)
    # Disambiguate lazily, one token at a time, against the whole sentence.
    senses = (lesk(words, word) for word in words)
    return " ".join(sense.name() for sense in senses if sense is not None)

# Disambiguate the raw (uncleaned) sample input and summary, in that order.
disambiguated_input_text, disambiguated_summary_text = (
    wsd_sentence(raw_text) for raw_text in (input_text, summary_text)
)


In [None]:
# Sequence-to-Sequence (Seq2Seq) Model
# To implement a Seq2Seq model, we can use libraries such as TensorFlow, PyTorch, or Keras. Here is example code for a simple Seq2Seq model in Keras. It consumes integer token ids, so WSD output must first be mapped to ids before it can be fed to the model:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

# Define input and output sequences.
# Both are variable-length sequences of integer token ids (they feed an Embedding).
input_seq = Input(shape=(None,))
output_seq = Input(shape=(None,))

# Define embedding layer (one instance, shared by encoder and decoder)
vocab_size = 10000
embedding_size = 100
embed_layer = Embedding(vocab_size, embedding_size)

# Define encoder and decoder layers.
# The encoder's final state initializes the decoder, so both LSTMs must use
# the same state size; keep it in one place.
latent_dim = 256
encoder_lstm = LSTM(latent_dim, return_state=True)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dense_layer = Dense(vocab_size, activation='softmax')

# Define encoder model: token ids -> [hidden state, cell state]
embed_input = embed_layer(input_seq)
encoder_output, state_h, state_c = encoder_lstm(embed_input)
encoder_states = [state_h, state_c]
encoder_model = Model(input_seq, encoder_states)

# Define decoder model: [token ids, hidden state, cell state] -> token probabilities.
# The initial state enters through dedicated Input placeholders instead of the
# encoder's intermediate tensors, so the decoder is a self-contained model that
# can be driven by any state (e.g. step-by-step at inference time) and does not
# depend on Keras accepting non-Input tensors as model inputs.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
embed_output = embed_layer(output_seq)
decoder_output, _, _ = decoder_lstm(embed_output, initial_state=decoder_state_inputs)
decoder_output = dense_layer(decoder_output)
decoder_model = Model([output_seq] + decoder_state_inputs, [decoder_output])

# Define Seq2Seq model: run the encoder, then hand its states to the decoder.
seq2seq_output = decoder_model([output_seq] + list(encoder_model(input_seq)))
seq2seq_model = Model([input_seq, output_seq], seq2seq_output)


In [None]:
# In this code, we first load the pre-trained BERT model and tokenizer using the AutoTokenizer and TFAutoModel classes from the transformers library. 
# Then, we define the input and target texts and encode them using the tokenizer, producing input and target encodings in the form of tensors.
#  We pass these tensors to the BERT model to obtain the encoded representations of the input and target texts.
# Next, we use the encoded representations to generalize the content of the input and target texts by taking the mean of the embeddings 
# along the sequence length dimension. We then use the top_k function to select the 10 largest components of each mean vector, and use the gather 
# function to extract the corresponding values from the input and target encoded representations.
# Overall, this code demonstrates how to use a pre-trained language model like BERT to encode and generalize the 
# content of input and target texts, and how to use the encoded representations to filter out irrelevant information.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert = TFAutoModel.from_pretrained(model_name)

# Define input and target texts
input_texts = ['This is a sample input text.', 'Another input text.']
target_texts = ['This is a summary of the first input text.', 'Summary of the second input text.']

# Encode the input and target texts using BERT
input_encodings = tokenizer(input_texts, truncation=True, padding=True, return_tensors='tf')
target_encodings = tokenizer(target_texts, truncation=True, padding=True, return_tensors='tf')

# The batches are padded (padding=True), so the attention mask must be passed
# along; otherwise the padding tokens are attended to like real tokens.
input_embeddings = bert(
    input_encodings['input_ids'],
    attention_mask=input_encodings['attention_mask'],
)[0]
target_embeddings = bert(
    target_encodings['input_ids'],
    attention_mask=target_encodings['attention_mask'],
)[0]


def _masked_mean(embeddings, attention_mask):
    """Average token embeddings over the sequence axis, ignoring padding.

    Args:
        embeddings: Tensor indexed as (batch, sequence, hidden).
        attention_mask: Tensor indexed as (batch, sequence); 1 marks a real
            token and 0 marks padding.

    Returns:
        Tensor indexed as (batch, hidden) with one mean vector per example.
    """
    mask = tf.cast(attention_mask, embeddings.dtype)[:, :, tf.newaxis]
    return tf.reduce_sum(embeddings * mask, axis=1) / tf.reduce_sum(mask, axis=1)


# Use the encoded representations to generalize the content
input_mean = _masked_mean(input_embeddings, input_encodings['attention_mask'])
target_mean = _masked_mean(target_embeddings, target_encodings['attention_mask'])

# Filter out irrelevant information: keep the 10 strongest hidden features.
input_filter = tf.math.top_k(input_mean, k=10)
target_filter = tf.math.top_k(target_mean, k=10)

# The top_k indices address the hidden dimension (last axis of the mean
# vectors), so they must be gathered along axis=2 of the embeddings. Gathering
# along the sequence axis would index far past the sequence length.
# Result is indexed as (batch, sequence, 10).
input_summary = tf.gather(input_embeddings, input_filter.indices, axis=2, batch_dims=1)
target_summary = tf.gather(target_embeddings, target_filter.indices, axis=2, batch_dims=1)


In [None]:
# To fine-tune a Sequence-to-Sequence model on a large dataset of input-output text pairs and evaluate the quality of the generated summaries, 
# we can use a loss function that combines metrics such as ROUGE and BLEU. Here is example code in Python using the tensorflow, nltk, and rouge libraries:

import tensorflow as tf
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Define the loss function
def combined_loss(y_true, y_pred):
    """Score a batch of predicted summaries with a ROUGE-L / BLEU mix.

    ``y_true`` and ``y_pred`` are expected to be aligned position by position,
    which is how the ``fit`` / ``evaluate`` calls in this notebook pass them
    (the targets are already shifted by one step there), so no further
    shifting is applied here.

    NOTE: the score is computed from arg-maxed token ids that are decoded to
    strings in plain Python. It therefore needs eager tensors (``.numpy()``)
    and carries no gradient with respect to the model weights.

    Args:
        y_true: Target token ids, indexed as (batch, sequence).
        y_pred: Per-token vocabulary scores, indexed as
            (batch, sequence, vocabulary).

    Returns:
        A Python float: ``0.5 * (1 - ROUGE-L F1) + 0.5 * (1 - BLEU)``,
        averaged over the batch. Lower is better.
    """
    # Keras may hand y_true over as floats; cast so ids are integers again,
    # then drop to plain Python lists for the tokenizer.
    target_ids = tf.cast(y_true, tf.int64).numpy().tolist()
    pred_ids = tf.argmax(y_pred, axis=-1).numpy().tolist()

    # `tokenizer` is the Keras Tokenizer fitted in this cell; it converts ids
    # back to text with sequences_to_texts (it has no `decode` method).
    target_text = tokenizer.sequences_to_texts(target_ids)
    pred_text = tokenizer.sequences_to_texts(pred_ids)

    # Calculate the ROUGE and BLEU scores pair by pair
    rouge = Rouge()
    rouge_scores = []
    bleu_scores = []
    for ref, hyp in zip(target_text, pred_text):
        if not ref.strip() or not hyp.strip():
            # An empty side cannot overlap with anything; score it as the
            # worst case instead of letting the rouge package raise on it.
            rouge_scores.append(0.0)
            bleu_scores.append(0.0)
            continue
        rouge_scores.append(rouge.get_scores(hyp, ref)[0]['rouge-l']['f'])
        bleu_scores.append(sentence_bleu([ref.split()], hyp.split()))

    rouge_score = sum(rouge_scores) / len(rouge_scores)
    bleu_score = sum(bleu_scores) / len(bleu_scores)

    # Combine the ROUGE and BLEU scores to obtain the loss
    alpha = 0.5
    loss = alpha * (1 - rouge_score) + (1 - alpha) * (1 - bleu_score)

    return loss

# Load the dataset of input-output text pairs
dataset = tf.data.Dataset.from_tensor_slices((input_texts, target_texts))

# Tokenize the input and output sequences.
# filters='' keeps punctuation attached to the words it follows.
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
tokenizer.fit_on_texts(input_texts + target_texts)

input_seqs = tokenizer.texts_to_sequences(input_texts)
target_seqs = tokenizer.texts_to_sequences(target_texts)

# Pad the input and output sequences to a fixed length
max_seq_len = 128
input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, maxlen=max_seq_len, padding='post')
target_seqs = tf.keras.preprocessing.sequence.pad_sequences(target_seqs, maxlen=max_seq_len, padding='post')

# Define the Seq2Seq model architecture
embedding_dim = 128
hidden_dim = 256
# Index 0 is reserved for padding, hence the +1.
num_tokens = len(tokenizer.word_index) + 1

encoder_inputs = tf.keras.layers.Input(shape=(max_seq_len,))
encoder_embedding = tf.keras.layers.Embedding(input_dim=num_tokens, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# The decoder is fed target_seqs[:, :-1], i.e. max_seq_len - 1 steps, so its
# input length must not be pinned to max_seq_len; leave the time axis open.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(input_dim=num_tokens, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the model and fit it to the dataset.
# Training needs a differentiable objective. ROUGE/BLEU are computed from
# arg-maxed token ids decoded to strings, which yields no gradients, so the
# model is trained with token-level cross-entropy; ROUGE/BLEU remain useful
# for scoring decoded predictions after training.
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Teacher forcing: the decoder sees tokens [0, n-1) and predicts tokens [1, n).
model.fit([input_seqs, target_seqs[:, :-1]], target_seqs[:, 1:], batch_size=32, epochs=10)



In [None]:
# To test the trained Sequence-to-Sequence model on a held-out dataset and fine-tune the model as needed to improve its performance, we can use the following code in Python:
# Load the held-out dataset of input-output text pairs
# NOTE(review): test_input_texts / test_target_texts are not defined in any
# cell shown in this notebook; they must be provided (lists of strings, paired
# by position) before this cell can run.
test_dataset = tf.data.Dataset.from_tensor_slices((test_input_texts, test_target_texts))

# Tokenize the input and output sequences
test_input_seqs = tokenizer.texts_to_sequences(test_input_texts)
test_target_seqs = tokenizer.texts_to_sequences(test_target_texts)

# Pad the input and output sequences to a fixed length
test_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_input_seqs, maxlen=max_seq_len, padding='post')
test_target_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_target_seqs, maxlen=max_seq_len, padding='post')

# Evaluate the model on the held-out dataset
scores = model.evaluate([test_input_seqs, test_target_seqs[:, :-1]], test_target_seqs[:, 1:], verbose=0)
print('Test loss:', scores)

# Generate summaries for a few held-out input texts and compare with the target summaries.
# Never ask for more examples than the held-out set contains.
num_examples = min(10, len(test_input_texts))
for i in range(num_examples):
    input_seq = test_input_seqs[i:i+1]
    # Same split as in evaluate(): the decoder is fed tokens [0, n-1) and its
    # output is compared against tokens [1, n).
    decoder_input_seq = test_target_seqs[i:i+1, :-1]
    target_seq = test_target_seqs[i:i+1, 1:]
    pred_seq = model.predict([input_seq, decoder_input_seq], verbose=0)
    # The Keras Tokenizer maps ids back to text with sequences_to_texts, which
    # takes a batch of id lists and returns one string per row.
    pred_ids = tf.argmax(pred_seq, axis=-1).numpy().tolist()
    pred_text = tokenizer.sequences_to_texts(pred_ids)[0]
    target_text = tokenizer.sequences_to_texts(target_seq.tolist())[0]
    print('Input text:', test_input_texts[i])
    print('Predicted summary:', pred_text)
    print('Target summary:', target_text)
    print()
# In this code, we load the held-out dataset of input-output text pairs and tokenize the input and output sequences. We then pad the sequences to a fixed length and 
# evaluate the model on the held-out dataset using the evaluate method. We print the test loss to see how well the model performs on the held-out dataset.
# We also generate summaries for a few input texts using the predict method of the model and compare the predicted summaries with the target summaries. 
# We print the input text, predicted summary, and target summary to see how well the model performs on these examples.
# Based on the test loss and the quality of the generated summaries, we can fine-tune the model as needed to improve its performance. 
# This may involve adjusting the model architecture, hyperparameters, or loss function, or collecting more training data.