In [None]:
!pip install tensorflow pandas




In [None]:
from google.colab import files
uploaded = files.upload()

for file_name in uploaded.keys():
    file_path = '/content/' + file_name


Saving Keyphrases_Dataset.xlsx to Keyphrases_Dataset (1).xlsx


In [None]:
import tensorflow as tf
print(tf.__version__)

2.15.0


In [None]:
from tensorflow import keras

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Concatenate, AdditiveAttention
from sklearn.model_selection import train_test_split
from collections import Counter

class DataLoader:
    def __init__(self, file_path=None):
        if file_path:
            self.data = pd.read_excel(file_path)
            self.encoder_input_col = 'Paragraph'
            self.decoder_output_col = 'Keyphrases'
            self.data[self.encoder_input_col].fillna('', inplace=True)
            self.data[self.decoder_output_col].fillna('', inplace=True)
            self.data = self.data.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset
            self.train_data, self.test_data = train_test_split(self.data, test_size=0.2, random_state=42)
        else:
            self.data = None
            self.train_data = None
            self.test_data = None

    def build_vocab(self, data, num_words=10000):
        counter = Counter()
        for text in data:
            for word in text.split():
                counter[word] += 1
        most_common_words = counter.most_common(num_words - 4)  # reserve 4 for special tokens
        word_to_id = {word: idx + 4 for idx, (word, _) in enumerate(most_common_words)}
        word_to_id['<SOS>'] = 1
        word_to_id['<EOS>'] = 2
        word_to_id['<UNK>'] = 3
        word_to_id['<PAD>'] = 0
        id_to_word = {idx: word for word, idx in word_to_id.items()}
        return word_to_id, id_to_word

    def text_to_sequence(self, text, word_to_id):
        return [word_to_id.get(word, word_to_id['<UNK>']) for word in text.split()]

    def get_data(self, word_to_id):
        train_encoder_inputs = self.train_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_outputs = self.train_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in train_decoder_outputs]

        test_encoder_inputs = self.test_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_outputs = self.test_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in test_decoder_outputs]

        return (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs)

    def index_to_sent(self, indices, id_to_word):
        return ' '.join([id_to_word.get(idx, '<UNK>') for idx in indices])

class Config:
    num_words = 10000
    embed_dim = 128
    num_units = 256
    SOS_TOKEN = 1
    EOS_TOKEN = 2
    UNK_TOKEN = 3
    PAD_TOKEN = 0
    vocab_size = num_words
    batch_size = 8
    num_steps = 1000
    num_rounds = 10
    word_to_id = {}
    id_to_word = {}

cfg = Config()

def build_model(vocab_size, embed_dim, num_units):
    # Encoder
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    encoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(encoder_inputs)
    encoder_outputs, state_h = GRU(num_units, return_sequences=True, return_state=True, name='encoder_gru')(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    decoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(decoder_inputs)
    decoder_gru = GRU(num_units, return_sequences=True, return_state=True, name='decoder_gru')
    decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=state_h)

    # Bahdanau Attention
    attention = AdditiveAttention(name='attention_layer')
    context_vector = attention([decoder_outputs, encoder_outputs])
    concat_outputs = Concatenate(axis=-1)([decoder_outputs, context_vector])

    # Dense layer
    dense_outputs = Dense(vocab_size, activation='softmax', name='dense_layer')(concat_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], dense_outputs)
    return model

def train_and_evaluate(file_path, model_dir):
    data_loader = DataLoader(file_path)
    paragraphs = data_loader.data['Paragraph'].tolist()
    keyphrases = data_loader.data['Keyphrases'].tolist()

    cfg.word_to_id, cfg.id_to_word = data_loader.build_vocab(paragraphs + keyphrases, cfg.num_words)

    (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs) = data_loader.get_data(cfg.word_to_id)

    max_encoder_len = max(map(len, train_encoder_inputs + test_encoder_inputs))
    max_decoder_len = max(map(len, train_decoder_outputs + test_decoder_outputs))

    train_encoder_inputs = pad_sequences(train_encoder_inputs, maxlen=max_encoder_len, padding='post')
    train_decoder_outputs = pad_sequences(train_decoder_outputs, maxlen=max_decoder_len, padding='post')
    train_decoder_inputs = pad_sequences(train_decoder_inputs, maxlen=max_decoder_len, padding='post')

    test_encoder_inputs = pad_sequences(test_encoder_inputs, maxlen=max_encoder_len, padding='post')
    test_decoder_outputs = pad_sequences(test_decoder_outputs, maxlen=max_decoder_len, padding='post')
    test_decoder_inputs = pad_sequences(test_decoder_inputs, maxlen=max_decoder_len, padding='post')

    model = build_model(cfg.vocab_size, cfg.embed_dim, cfg.num_units)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    model.fit([train_encoder_inputs, train_decoder_inputs], train_decoder_outputs,
              batch_size=cfg.batch_size, epochs=cfg.num_rounds, validation_data=([test_encoder_inputs, test_decoder_inputs], test_decoder_outputs))

    model.save(model_dir)

def predict_paragraph(paragraph, model_dir):
    data_loader = DataLoader()  # No file needed for single prediction
    paragraph_sequence = data_loader.text_to_sequence(paragraph, cfg.word_to_id)
    encoder_inputs = pad_sequences([paragraph_sequence], padding='post', maxlen=100)  # Adjust maxlen if necessary

    model = tf.keras.models.load_model(model_dir, custom_objects={'AdditiveAttention': AdditiveAttention})

    decoder_input = np.array([[cfg.SOS_TOKEN]])
    decoded_sentence = []

    for _ in range(100):  # assuming max length of 100
        output_tokens = model.predict([encoder_inputs, decoder_input])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = cfg.id_to_word.get(sampled_token_index, '<UNK>')

        if sampled_token == '<EOS>':
            break

        decoded_sentence.append(sampled_token)
        decoder_input = np.hstack([decoder_input, np.array([[sampled_token_index]])])

    return ' '.join(decoded_sentence)

def main():
    from google.colab import files
    uploaded = files.upload()

    for file_name in uploaded.keys():
        file_path = file_name

    model_dir = '/content/seq2seq_model'
    train_and_evaluate(file_path, model_dir)

    # Example prediction
    # paragraph = "Wildlife is essential for nature. Conservation is essential for nature. Forest is essential for nature. Dolphins are intelligent creatures. Ecosystem is essential for nature. Biodiversity is essential for nature."
    paragraph = "Chennai, formerly known as Madras, is the capital of Tamil Nadu. It is a major cultural and economic center in South India. The Marina Beach in Chennai is the second longest urban beach in the world. Chennai is known for its classical dance, music, and rich history. The city has a vibrant theatre scene. Tomatoes are grown in many parts of India."
    keyphrases = predict_paragraph(paragraph, model_dir)
    print("Predicted keyphrases:", keyphrases)

if __name__ == '__main__':
    main()


Saving chennai_keyphrases.xlsx to chennai_keyphrases.xlsx
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predicted keyphrases: <PAD> Chennai <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Concatenate, AdditiveAttention
from sklearn.model_selection import train_test_split
from collections import Counter

class DataLoader:
    def __init__(self, file_path=None):
        if file_path:
            self.data = pd.read_excel(file_path)
            self.encoder_input_col = 'Paragraph'
            self.decoder_output_col = 'Keyphrases'
            self.data[self.encoder_input_col].fillna('', inplace=True)
            self.data[self.decoder_output_col].fillna('', inplace=True)
            self.data = self.data.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset, returns the whole dataset again
            self.train_data, self.test_data = train_test_split(self.data, test_size=0.2, random_state=42)
        else:
            self.data = None
            self.train_data = None
            self.test_data = None

    def build_vocab(self, data, num_words=10000):
        counter = Counter()
        for text in data:
            for word in text.split():
                counter[word] += 1
        most_common_words = counter.most_common(num_words - 4)  # reserve 4 for special tokens
        word_to_id = {word: idx + 4 for idx, (word, _) in enumerate(most_common_words)}
        word_to_id['<SOS>'] = 1
        word_to_id['<EOS>'] = 2
        word_to_id['<UNK>'] = 3
        word_to_id['<PAD>'] = 0
        id_to_word = {idx: word for word, idx in word_to_id.items()}
        return word_to_id, id_to_word

    def text_to_sequence(self, text, word_to_id):
        return [word_to_id.get(word, word_to_id['<UNK>']) for word in text.split()]

    def get_data(self, word_to_id):
        train_encoder_inputs = self.train_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_outputs = self.train_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in train_decoder_outputs]

        test_encoder_inputs = self.test_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_outputs = self.test_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in test_decoder_outputs]

        return (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs)

    def index_to_sent(self, indices, id_to_word):
        return ' '.join([id_to_word.get(idx, '<UNK>') for idx in indices])

class Config:
    num_words = 10000  #no of words in vocab
    embed_dim = 128    #word embeddings represent words as vectors in a high-dimensional space, here dimension is 128
    num_units = 256    #LSTM layer with a specified number of hidden units (256 in this case)
    SOS_TOKEN = 1      #ID of SOS in dict
    EOS_TOKEN = 2
    UNK_TOKEN = 3
    PAD_TOKEN = 0
    vocab_size = num_words #assigning 10000 to the vocab size
    batch_size = 8    #The number of samples processed together in one pass of the model.
    num_steps = 1000  #the number of steps or iterations to be performed during training, Steps are the finer unit of training progress. Multiple steps are taken within each epoch.
    #  Each step involves processing a batch of data through the model and updating the model parameters
    num_rounds = 20   #Epochs, number of rounds
    # number of times the entire dataset is passed through the model during training.
    word_to_id = {}   #intializing the dict for word to id and vice versa
    id_to_word = {}

cfg = Config()

def build_model(vocab_size, embed_dim, num_units):
    # Encoder
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    encoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(encoder_inputs)  #2D vector
    encoder_outputs, state_h = GRU(num_units, return_sequences=True, return_state=True, name='encoder_gru')(encoder_embedding) #initial state is zero
    # return_sequences:
    # If True, the GRU layer will return the full sequence of hidden states for each timestep.
    # If False, it returns only the last hidden state. In this case, it is set to True because we need the hidden states for each timestep for the attention mechanism.
    # return_state
    # If True, the GRU layer will return the last hidden state in addition to the full sequence of hidden states. This is useful for initializing the state of the decoder.
    # encoder_outputs: A 3D tensor of shape (batch_size, sequence_length, num_units) containing the hidden states for all timesteps in the input sequence.
    # state_h: A 2D tensor of shape (batch_size, num_units) containing the hidden state at the last timestep. This hidden state will be used to initialize the decoder.

    # Decoder
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    decoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(decoder_inputs)
    decoder_gru = GRU(num_units, return_sequences=True, return_state=True, name='decoder_gru')
    decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=state_h)
    # decoder_outputs, _ = GRU(num_units, return_sequences=True, return_state=True, name='decoder_gru')(decoder_embedding, initial_state=state_h)

    # Initial State: The initial hidden state for the GRU, set to the last hidden state of the encoder (state_h).
    # decoder_outputs: A 3D tensor of shape (batch_size, sequence_length, num_units) containing the hidden states for all timesteps in the input sequence.
    # he hidden state (decoder_state_h) at the last timestep of the decoder is often not used further in simple models
    #  because the focus is on generating the entire sequence rather than what comes after the final token.

    # Bahdanau Attention
    attention = AdditiveAttention(name='attention_layer') #computes a(sj-1,hj)
    context_vector = attention([decoder_outputs, encoder_outputs])  #refer the paper for the formulae ci, alpha ij
    concat_outputs = Concatenate(axis=-1)([decoder_outputs, context_vector])
    # concat_outputs is a tensor where, for each timestep, the decoder's hidden state is concatenated with the corresponding context vector.
    # This combined information is then used for generating the final output sequence.

    # Dense layer
    dense_outputs = Dense(vocab_size, activation='softmax', name='dense_layer')(concat_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], dense_outputs)  # list of input layers and the output layer
    return model

def train_and_evaluate(file_path, model_dir):
    data_loader = DataLoader(file_path)
    paragraphs = data_loader.data['Paragraph'].tolist()
    keyphrases = data_loader.data['Keyphrases'].tolist()

    cfg.word_to_id, cfg.id_to_word = data_loader.build_vocab(paragraphs + keyphrases, cfg.num_words)

    (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs) = data_loader.get_data(cfg.word_to_id)

    max_encoder_len = max(map(len, train_encoder_inputs + test_encoder_inputs))
    max_decoder_len = max(map(len, train_decoder_outputs + test_decoder_outputs))

    train_encoder_inputs = pad_sequences(train_encoder_inputs, maxlen=max_encoder_len, padding='post')
    train_decoder_outputs = pad_sequences(train_decoder_outputs, maxlen=max_decoder_len, padding='post')
    train_decoder_inputs = pad_sequences(train_decoder_inputs, maxlen=max_decoder_len, padding='post')

    test_encoder_inputs = pad_sequences(test_encoder_inputs, maxlen=max_encoder_len, padding='post')
    test_decoder_outputs = pad_sequences(test_decoder_outputs, maxlen=max_decoder_len, padding='post')
    test_decoder_inputs = pad_sequences(test_decoder_inputs, maxlen=max_decoder_len, padding='post')

    model = build_model(cfg.vocab_size, cfg.embed_dim, cfg.num_units)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    model.fit([train_encoder_inputs, train_decoder_inputs], train_decoder_outputs,
              batch_size=cfg.batch_size, epochs=cfg.num_rounds, validation_data=([test_encoder_inputs, test_decoder_inputs], test_decoder_outputs))

    model.save(model_dir)

def predict_paragraph(paragraph, model_dir):
    data_loader = DataLoader()  # No file needed for single prediction
    paragraph_sequence = data_loader.text_to_sequence(paragraph, cfg.word_to_id)
    encoder_inputs = pad_sequences([paragraph_sequence], padding='post', maxlen=125)  # Adjust maxlen if necessary

    model = tf.keras.models.load_model(model_dir, custom_objects={'AdditiveAttention': AdditiveAttention})

    decoder_input = np.array([[cfg.SOS_TOKEN]])
    decoded_sentence = []

    for _ in range(100):  # assuming max length of 100
        output_tokens = model.predict([encoder_inputs, decoder_input])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = cfg.id_to_word.get(sampled_token_index, '<UNK>')

        if sampled_token in ['<EOS>', '<SOS>', '<UNK>', '<PAD>']:
            break

        decoded_sentence.append(sampled_token)
        decoder_input = np.hstack([decoder_input, np.array([[sampled_token_index]])])

    return ' '.join(decoded_sentence)

def main():
    from google.colab import files
    uploaded = files.upload()

    for file_name in uploaded.keys():
        file_path = file_name

    model_dir = '/content/seq2seq_model'
    train_and_evaluate(file_path, model_dir)

    # Example prediction
    # paragraph = "Understanding the effects of microgravity on the human body is crucial for long-duration space missions, such as a potential journey to Mars. The James Webb Space Telescope is expected to revolutionize our knowledge of the universe with its advanced capabilities. Space debris poses a significant risk to satellites and space missions, necessitating international cooperation for mitigation strategies. Climate change impacts agricultural productivity and food security. The digital revolution has changed the way we work and interact."
    paragraph = "Cosmic microwave background radiation provides a snapshot of the early universe, offering clues about its origin and evolution. Space probes like Voyager 1 and Voyager 2 have traveled beyond our solar system, sending back valuable data. The discovery of exoplanets has sparked interest in the possibility of extraterrestrial life. Mental health awareness has increased significantly, with more people seeking help and reducing stigma. E-commerce has transformed the retail landscape, making shopping more convenient."
    # paragraph = "Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments. The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations. Space tourism is becoming a reality, with private companies planning to send civilians to space. Meanwhile, advancements in artificial intelligence are transforming industries across the globe. The importance of renewable energy cannot be overstated."
    # paragraph = "Space, with its infinite expanse and celestial wonders, has always intrigued humanity. From the rings of Saturn to the mysteries of black holes, our quest to understand the cosmos continues to drive exploration and discovery."
    keyphrases = predict_paragraph(paragraph, model_dir)
    print("Predicted keyphrases:", keyphrases)

if __name__ == '__main__':
    main()


Saving filtered_space_exploration_keyphrases.xlsx to filtered_space_exploration_keyphrases (9).xlsx
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Predicted keyphrases: Cosmic microwave background radiation, exoplanets, Space Space Space


In [None]:
from collections import Counter
import itertools

class DataLoader:
    def __init__(self, text):
        self.text = text
        self.word_to_id, self.id_to_word = self.build_vocab(text)

    def build_vocab(self, text):
        # Tokenize the text
        words = text.split()

        # Count the frequency of each word
        word_counts = Counter(words)

        # Assign an ID to each word based on frequency
        # Start indexing from 1 because 0 is reserved for <UNK>
        word_to_id = {word: idx for idx, (word, _) in enumerate(word_counts.items(), start=1)}

        # Add special tokens
        word_to_id['<UNK>'] = 0
        word_to_id['<SOS>'] = len(word_to_id)
        word_to_id['<EOS>'] = len(word_to_id) + 1

        # Create reverse dictionary
        id_to_word = {idx: word for word, idx in word_to_id.items()}

        return word_to_id, id_to_word

    def text_to_sequence(self, text, word_to_id):
        return [word_to_id.get(word, word_to_id['<UNK>']) for word in text.split()]

    def get_data(self, text):
        return self.text_to_sequence(text, self.word_to_id), self.word_to_id, self.id_to_word

# Example input text
input_text = "Wildlife is essential for nature. Conservation is essential for nature. Forest is essential for nature. Dolphins are intelligent creatures. Ecosystem is essential for nature. Biodiversity is essential for nature."
# paragraph = "Chennai, formerly known as Madras, is the capital of Tamil Nadu. It is a major cultural and economic center in South India. The Marina Beach in Chennai is the second longest urban beach in the world. Chennai is known for its classical dance, music, and rich history. The city has a vibrant theatre scene. Tomatoes are grown in many parts of India"

# Create DataLoader instance
data_loader = DataLoader(input_text)

# Get the dictionary and sequences
sequences, word_to_id, id_to_word = data_loader.get_data(input_text)

# Print the results
print("Word to ID dictionary:", word_to_id)
print("ID to Word dictionary:", id_to_word)
print("Text to Sequence:", sequences)


Word to ID dictionary: {'Wildlife': 1, 'is': 2, 'essential': 3, 'for': 4, 'nature.': 5, 'Conservation': 6, 'Forest': 7, 'Dolphins': 8, 'are': 9, 'intelligent': 10, 'creatures.': 11, 'Ecosystem': 12, 'Biodiversity': 13, '<UNK>': 0, '<SOS>': 14, '<EOS>': 16}
ID to Word dictionary: {1: 'Wildlife', 2: 'is', 3: 'essential', 4: 'for', 5: 'nature.', 6: 'Conservation', 7: 'Forest', 8: 'Dolphins', 9: 'are', 10: 'intelligent', 11: 'creatures.', 12: 'Ecosystem', 13: 'Biodiversity', 0: '<UNK>', 14: '<SOS>', 16: '<EOS>'}
Text to Sequence: [1, 2, 3, 4, 5, 6, 2, 3, 4, 5, 7, 2, 3, 4, 5, 8, 9, 10, 11, 12, 2, 3, 4, 5, 13, 2, 3, 4, 5]


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Concatenate, AdditiveAttention, Dropout
from sklearn.model_selection import train_test_split
from collections import Counter

class DataLoader:
    def __init__(self, file_path=None):
        if file_path:
            self.data = pd.read_excel(file_path)
            self.encoder_input_col = 'Paragraph'
            self.decoder_output_col = 'Keyphrases'
            self.data[self.encoder_input_col].fillna('', inplace=True)
            self.data[self.decoder_output_col].fillna('', inplace=True)
            self.data = self.data.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset
            self.train_data, self.test_data = train_test_split(self.data, test_size=0.2, random_state=42)
        else:
            self.data = None
            self.train_data = None
            self.test_data = None

    def build_vocab(self, data, num_words=10000):
        counter = Counter()
        for text in data:
            for word in text.split():
                counter[word] += 1
        most_common_words = counter.most_common(num_words - 4)  # Reserve 4 for special tokens
        word_to_id = {word: idx + 4 for idx, (word, _) in enumerate(most_common_words)}
        word_to_id['<SOS>'] = 1
        word_to_id['<EOS>'] = 2
        word_to_id['<UNK>'] = 3
        word_to_id['<PAD>'] = 0
        id_to_word = {idx: word for word, idx in word_to_id.items()}
        return word_to_id, id_to_word

    def text_to_sequence(self, text, word_to_id):
        return [word_to_id.get(word, word_to_id['<UNK>']) for word in text.split()]

    def get_data(self, word_to_id):
        train_encoder_inputs = self.train_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_outputs = self.train_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in train_decoder_outputs]

        test_encoder_inputs = self.test_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_outputs = self.test_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in test_decoder_outputs]

        return (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs)

    def index_to_sent(self, indices, id_to_word):
        return ' '.join([id_to_word.get(idx, '<UNK>') for idx in indices])

class Config:
    num_words = 10000
    embed_dim = 128
    num_units = 256
    SOS_TOKEN = 1
    EOS_TOKEN = 2
    UNK_TOKEN = 3
    PAD_TOKEN = 0
    vocab_size = num_words
    batch_size = 8
    num_steps = 1000
    num_rounds = 20
    learning_rate = 1e-3
    gradient_clip_value = 0.1
    dropout_rate = 0.5
    beam_width = 10
    beam_depth = 2
    word_to_id = {}
    id_to_word = {}

cfg = Config()

def build_model(vocab_size, embed_dim, num_units, dropout_rate):
    # Encoder
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    encoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(encoder_inputs)
    encoder_embedding = Dropout(dropout_rate)(encoder_embedding)
    encoder_outputs, state_h = GRU(num_units, return_sequences=True, return_state=True, name='encoder_gru')(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    decoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(decoder_inputs)
    decoder_embedding = Dropout(dropout_rate)(decoder_embedding)
    decoder_gru = GRU(num_units, return_sequences=True, return_state=True, name='decoder_gru')
    decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=state_h)

    # Bahdanau Attention
    attention = AdditiveAttention(name='attention_layer')
    context_vector = attention([decoder_outputs, encoder_outputs])
    concat_outputs = Concatenate(axis=-1)([decoder_outputs, context_vector])

    # Dense layer
    dense_outputs = Dense(vocab_size, activation='softmax', name='dense_layer')(concat_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], dense_outputs)
    return model

def train_and_evaluate(file_path, model_dir):
    data_loader = DataLoader(file_path)
    paragraphs = data_loader.data['Paragraph'].tolist()
    keyphrases = data_loader.data['Keyphrases'].tolist()

    cfg.word_to_id, cfg.id_to_word = data_loader.build_vocab(paragraphs + keyphrases, cfg.num_words)

    (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs) = data_loader.get_data(cfg.word_to_id)

    max_encoder_len = max(map(len, train_encoder_inputs + test_encoder_inputs))
    max_decoder_len = max(map(len, train_decoder_outputs + test_decoder_outputs))

    train_encoder_inputs = pad_sequences(train_encoder_inputs, maxlen=max_encoder_len, padding='post')
    train_decoder_outputs = pad_sequences(train_decoder_outputs, maxlen=max_decoder_len, padding='post')
    train_decoder_inputs = pad_sequences(train_decoder_inputs, maxlen=max_decoder_len, padding='post')

    test_encoder_inputs = pad_sequences(test_encoder_inputs, maxlen=max_encoder_len, padding='post')
    test_decoder_outputs = pad_sequences(test_decoder_outputs, maxlen=max_decoder_len, padding='post')
    test_decoder_inputs = pad_sequences(test_decoder_inputs, maxlen=max_decoder_len, padding='post')

    model = build_model(cfg.vocab_size, cfg.embed_dim, cfg.num_units, cfg.dropout_rate)

    optimizer = tf.keras.optimizers.Adam(learning_rate=cfg.learning_rate, clipnorm=cfg.gradient_clip_value)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.fit([train_encoder_inputs, train_decoder_inputs], train_decoder_outputs,
              batch_size=cfg.batch_size, epochs=cfg.num_rounds,
              validation_data=([test_encoder_inputs, test_decoder_inputs], test_decoder_outputs))

    model.save(model_dir)

def predict_paragraph(paragraph, model_dir):
    data_loader = DataLoader()
    paragraph_sequence = data_loader.text_to_sequence(paragraph, cfg.word_to_id)
    encoder_inputs = pad_sequences([paragraph_sequence], padding='post', maxlen=100)

    model = tf.keras.models.load_model(model_dir, custom_objects={'AdditiveAttention': AdditiveAttention})

    # Beam search implementation here
    def beam_search(model, encoder_inputs, beam_width, beam_depth, max_len):
        sequences = [[list(), 1.0]]
        decoder_input = np.array([[cfg.SOS_TOKEN]])

        for _ in range(max_len):
            all_candidates = []
            for seq, score in sequences:
                output_tokens = model.predict([encoder_inputs, decoder_input], verbose=0)
                for j in range(beam_width):
                    candidate = [seq + [j], score * -np.log(output_tokens[0, -1, j])]
                    all_candidates.append(candidate)
            ordered = sorted(all_candidates, key=lambda tup: tup[1])
            sequences = ordered[:beam_depth]

        return sequences[0][0]

    beam_width = cfg.beam_width
    beam_depth = cfg.beam_depth
    max_len = 100
    beam_result = beam_search(model, encoder_inputs, beam_width, beam_depth, max_len)

    keyphrases = data_loader.index_to_sent(beam_result, cfg.id_to_word)
    return keyphrases

def main():
    from google.colab import files
    uploaded = files.upload()

    for file_name in uploaded.keys():
        file_path = file_name

    model_dir = '/content/seq2seq_model'
    train_and_evaluate(file_path, model_dir)

    paragraph = "Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments. The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations. Space tourism is becoming a reality, with private companies planning to send civilians to space. Meanwhile, advancements in artificial intelligence are transforming industries across the globe. The importance of renewable energy cannot be overstated."
    keyphrases = predict_paragraph(paragraph, model_dir)
    print("Predicted keyphrases:", keyphrases)

if __name__ == '__main__':
    main()


Saving filtered_space_exploration_keyphrases.xlsx to filtered_space_exploration_keyphrases (1).xlsx
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Predicted keyphrases: Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space Space


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Concatenate, AdditiveAttention
from sklearn.model_selection import train_test_split
from collections import Counter

class DataLoader:
    def __init__(self, file_path=None):
        if file_path:
            self.data = pd.read_excel(file_path)
            self.encoder_input_col = 'Paragraph'
            self.decoder_output_col = 'Keyphrases'
            self.data[self.encoder_input_col].fillna('', inplace=True)
            self.data[self.decoder_output_col].fillna('', inplace=True)
            self.data = self.data.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset, returns the whole dataset again
            self.train_data, self.test_data = train_test_split(self.data, test_size=0.2, random_state=42)
        else:
            self.data = None
            self.train_data = None
            self.test_data = None

    def build_vocab(self, data, num_words=10000):
        counter = Counter()
        for text in data:
            for word in text.split():
                counter[word] += 1
        most_common_words = counter.most_common(num_words - 4)  # reserve 4 for special tokens
        word_to_id = {word: idx + 4 for idx, (word, _) in enumerate(most_common_words)}
        word_to_id['<SOS>'] = 1
        word_to_id['<EOS>'] = 2
        word_to_id['<UNK>'] = 3
        word_to_id['<PAD>'] = 0
        id_to_word = {idx: word for word, idx in word_to_id.items()}
        return word_to_id, id_to_word

    def text_to_sequence(self, text, word_to_id):
        return [word_to_id.get(word, word_to_id['<UNK>']) for word in text.split()]

    def get_data(self, word_to_id):
        train_encoder_inputs = self.train_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_outputs = self.train_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in train_decoder_outputs]

        test_encoder_inputs = self.test_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_outputs = self.test_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in test_decoder_outputs]

        return (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs)

    def index_to_sent(self, indices, id_to_word):
        return ' '.join([id_to_word.get(idx, '<UNK>') for idx in indices])

class Config:
    num_words = 10000  #no of words in vocab
    embed_dim = 128    #word embeddings represent words as vectors in a high-dimensional space, here dimension is 128
    num_units = 256    #LSTM layer with a specified number of hidden units (256 in this case)
    SOS_TOKEN = 1      #ID of SOS in dict
    EOS_TOKEN = 2
    UNK_TOKEN = 3
    PAD_TOKEN = 0
    vocab_size = num_words #assigning 10000 to the vocab size
    batch_size = 8    #The number of samples processed together in one pass of the model.
    num_steps = 1000  #the number of steps or iterations to be performed during training, Steps are the finer unit of training progress. Multiple steps are taken within each epoch.
    #  Each step involves processing a batch of data through the model and updating the model parameters
    num_rounds = 20   #Epochs, number of rounds
    # number of times the entire dataset is passed through the model during training.
    word_to_id = {}   #intializing the dict for word to id and vice versa
    id_to_word = {}

cfg = Config()

def build_model(vocab_size, embed_dim, num_units):
    # Encoder
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    encoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(encoder_inputs)  #2D vector
    encoder_outputs, state_h = GRU(num_units, return_sequences=True, return_state=True, name='encoder_gru')(encoder_embedding) #initial state is zero
    # return_sequences:
    # If True, the GRU layer will return the full sequence of hidden states for each timestep.
    # If False, it returns only the last hidden state. In this case, it is set to True because we need the hidden states for each timestep for the attention mechanism.
    # return_state
    # If True, the GRU layer will return the last hidden state in addition to the full sequence of hidden states. This is useful for initializing the state of the decoder.
    # encoder_outputs: A 3D tensor of shape (batch_size, sequence_length, num_units) containing the hidden states for all timesteps in the input sequence.
    # state_h: A 2D tensor of shape (batch_size, num_units) containing the hidden state at the last timestep. This hidden state will be used to initialize the decoder.

    # Decoder
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    decoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(decoder_inputs)
    decoder_gru = GRU(num_units, return_sequences=True, return_state=True, name='decoder_gru')
    decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=state_h)
    # decoder_outputs, _ = GRU(num_units, return_sequences=True, return_state=True, name='decoder_gru')(decoder_embedding, initial_state=state_h)

    # Initial State: The initial hidden state for the GRU, set to the last hidden state of the encoder (state_h).
    # decoder_outputs: A 3D tensor of shape (batch_size, sequence_length, num_units) containing the hidden states for all timesteps in the input sequence.
    # he hidden state (decoder_state_h) at the last timestep of the decoder is often not used further in simple models
    #  because the focus is on generating the entire sequence rather than what comes after the final token.

    # Bahdanau Attention
    attention = AdditiveAttention(name='attention_layer') #computes a(sj-1,hj)
    context_vector = attention([decoder_outputs, encoder_outputs])  #refer the paper for the formulae ci, alpha ij
    concat_outputs = Concatenate(axis=-1)([decoder_outputs, context_vector])
    # concat_outputs is a tensor where, for each timestep, the decoder's hidden state is concatenated with the corresponding context vector.
    # This combined information is then used for generating the final output sequence.

    # Dense layer
    dense_outputs = Dense(vocab_size, activation='softmax', name='dense_layer')(concat_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], dense_outputs)  # list of input layers and the foutput layer
    return model

def train_and_evaluate(file_path, model_dir):
    data_loader = DataLoader(file_path)
    paragraphs = data_loader.data['Paragraph'].tolist()
    keyphrases = data_loader.data['Keyphrases'].tolist()

    cfg.word_to_id, cfg.id_to_word = data_loader.build_vocab(paragraphs + keyphrases, cfg.num_words)

    (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs) = data_loader.get_data(cfg.word_to_id)

    max_encoder_len = max(map(len, train_encoder_inputs + test_encoder_inputs))
    max_decoder_len = max(map(len, train_decoder_outputs + test_decoder_outputs))

    train_encoder_inputs = pad_sequences(train_encoder_inputs, maxlen=max_encoder_len, padding='post')
    train_decoder_outputs = pad_sequences(train_decoder_outputs, maxlen=max_decoder_len, padding='post')
    train_decoder_inputs = pad_sequences(train_decoder_inputs, maxlen=max_decoder_len, padding='post')

    test_encoder_inputs = pad_sequences(test_encoder_inputs, maxlen=max_encoder_len, padding='post')
    test_decoder_outputs = pad_sequences(test_decoder_outputs, maxlen=max_decoder_len, padding='post')
    test_decoder_inputs = pad_sequences(test_decoder_inputs, maxlen=max_decoder_len, padding='post')

    model = build_model(cfg.vocab_size, cfg.embed_dim, cfg.num_units)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    model.fit([train_encoder_inputs, train_decoder_inputs], train_decoder_outputs,
              batch_size=cfg.batch_size, epochs=cfg.num_rounds, validation_data=([test_encoder_inputs, test_decoder_inputs], test_decoder_outputs))

    model.save(model_dir)

def predict_paragraph(paragraph, model_dir):
    data_loader = DataLoader()  # No file needed for single prediction
    paragraph_sequence = data_loader.text_to_sequence(paragraph, cfg.word_to_id)
    encoder_inputs = pad_sequences([paragraph_sequence], padding='post', maxlen=125)  # Adjust maxlen if necessary

    model = tf.keras.models.load_model(model_dir, custom_objects={'AdditiveAttention': AdditiveAttention})

    decoder_input = np.array([[cfg.SOS_TOKEN]])
    decoded_sentence = []

    for _ in range(100):  # assuming max length of 100
        output_tokens = model.predict([encoder_inputs, decoder_input])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = cfg.id_to_word.get(sampled_token_index, '<UNK>')

        if sampled_token in ['<EOS>', '<SOS>', '<UNK>', '<PAD>']:
            break

        decoded_sentence.append(sampled_token)
        decoder_input = np.hstack([decoder_input, np.array([[sampled_token_index]])])

    keyphrases = set(decoded_sentence)

    print("Generated Keyphrases:", keyphrases)

    # Find sentences in the paragraph containing keyphrases
    sentences = paragraph.split('.')
    sentences_with_keyphrases = []

    for sentence in sentences:
        for keyphrase in keyphrases:
            if keyphrase in sentence:
                sentences_with_keyphrases.append(sentence.strip())
                break

    print("\nSentences containing keyphrases:")
    for sentence in sentences_with_keyphrases:
        print(sentence)

    return ' '.join(decoded_sentence)

def main():
    from google.colab import files
    uploaded = files.upload()

    for file_name in uploaded.keys():
        file_path = file_name

    model_dir = '/content/seq2seq_model'
    train_and_evaluate(file_path, model_dir)

    # Example prediction
    # paragraph = "Cosmic microwave background radiation provides a snapshot of the early universe, offering clues about its origin and evolution. Space probes like Voyager 1 and Voyager 2 have traveled beyond our solar system, sending back valuable data. The discovery of exoplanets has sparked interest in the possibility of extraterrestrial life. Mental health awareness has increased significantly, with more people seeking help and reducing stigma. E-commerce has transformed the retail landscape, making shopping more convenient."
    paragraph = "Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments. The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations. Space tourism is becoming a reality, with private companies planning to send civilians to space. Meanwhile, advancements in artificial intelligence are transforming industries across the globe. The importance of renewable energy cannot be overstated."
    keyphrases = predict_paragraph(paragraph, model_dir)
    # print("Predicted Keyphrases:", keyphrases)

if __name__ == '__main__':
    main()


Saving filtered_space_exploration_keyphrases.xlsx to filtered_space_exploration_keyphrases (2).xlsx
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Generated Keyphrases: {'black', 'Space', 'holes,'}

Sentences containing keyphrases:
Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments
Space tourism is becoming a reality, with private companies planning to send civilians to space


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Concatenate, AdditiveAttention
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch

class DataLoader:
    def __init__(self, file_path=None):
        if file_path:
            self.data = pd.read_excel(file_path)
            self.encoder_input_col = 'Paragraph'
            self.decoder_output_col = 'Keyphrases'
            self.data[self.encoder_input_col].fillna('', inplace=True)
            self.data[self.decoder_output_col].fillna('', inplace=True)
            self.data = self.data.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset, returns the whole dataset again
            self.train_data, self.test_data = train_test_split(self.data, test_size=0.2, random_state=42)
        else:
            self.data = None
            self.train_data = None
            self.test_data = None

    def build_vocab(self, data, num_words=10000):
        counter = Counter()
        for text in data:
            for word in text.split():
                counter[word] += 1
        most_common_words = counter.most_common(num_words - 4)  # reserve 4 for special tokens
        word_to_id = {word: idx + 4 for idx, (word, _) in enumerate(most_common_words)}
        word_to_id['<SOS>'] = 1
        word_to_id['<EOS>'] = 2
        word_to_id['<UNK>'] = 3
        word_to_id['<PAD>'] = 0
        id_to_word = {idx: word for word, idx in word_to_id.items()}
        return word_to_id, id_to_word

    def text_to_sequence(self, text, word_to_id):
        return [word_to_id.get(word, word_to_id['<UNK>']) for word in text.split()]

    def get_data(self, word_to_id):
        train_encoder_inputs = self.train_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_outputs = self.train_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        train_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in train_decoder_outputs]

        test_encoder_inputs = self.test_data[self.encoder_input_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_outputs = self.test_data[self.decoder_output_col].apply(lambda x: self.text_to_sequence(x, word_to_id)).tolist()
        test_decoder_inputs = [[word_to_id['<SOS>']] + seq for seq in test_decoder_outputs]

        return (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs)

    def index_to_sent(self, indices, id_to_word):
        return ' '.join([id_to_word.get(idx, '<UNK>') for idx in indices])

class Config:
    num_words = 10000  #no of words in vocab
    embed_dim = 128    #word embeddings represent words as vectors in a high-dimensional space, here dimension is 128
    num_units = 256    #LSTM layer with a specified number of hidden units (256 in this case)
    SOS_TOKEN = 1      #ID of SOS in dict
    EOS_TOKEN = 2
    UNK_TOKEN = 3
    PAD_TOKEN = 0
    vocab_size = num_words #assigning 10000 to the vocab size
    batch_size = 8    #The number of samples processed together in one pass of the model.
    num_steps = 1000  #the number of steps or iterations to be performed during training, Steps are the finer unit of training progress. Multiple steps are taken within each epoch.
    #  Each step involves processing a batch of data through the model and updating the model parameters
    num_rounds = 20   #Epochs, number of rounds
    # number of times the entire dataset is passed through the model during training.
    word_to_id = {}   #intializing the dict for word to id and vice versa
    id_to_word = {}

cfg = Config()

def build_model(vocab_size, embed_dim, num_units):
    # Encoder
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    encoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(encoder_inputs)  #2D vector
    encoder_outputs, state_h = GRU(num_units, return_sequences=True, return_state=True, name='encoder_gru')(encoder_embedding) #initial state is zero
    # return_sequences:
    # If True, the GRU layer will return the full sequence of hidden states for each timestep.
    # If False, it returns only the last hidden state. In this case, it is set to True because we need the hidden states for each timestep for the attention mechanism.
    # return_state
    # If True, the GRU layer will return the last hidden state in addition to the full sequence of hidden states. This is useful for initializing the state of the decoder.
    # encoder_outputs: A 3D tensor of shape (batch_size, sequence_length, num_units) containing the hidden states for all timesteps in the input sequence.
    # state_h: A 2D tensor of shape (batch_size, num_units) containing the hidden state at the last timestep. This hidden state will be used to initialize the decoder.

    # Decoder
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    decoder_embedding = Embedding(vocab_size, embed_dim, mask_zero=True)(decoder_inputs)
    decoder_gru = GRU(num_units, return_sequences=True, return_state=True, name='decoder_gru')
    decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=state_h)
    # decoder_outputs, _ = GRU(num_units, return_sequences=True, return_state=True, name='decoder_gru')(decoder_embedding, initial_state=state_h)

    # Initial State: The initial hidden state for the GRU, set to the last hidden state of the encoder (state_h).
    # decoder_outputs: A 3D tensor of shape (batch_size, sequence_length, num_units) containing the hidden states for all timesteps in the input sequence.
    # he hidden state (decoder_state_h) at the last timestep of the decoder is often not used further in simple models
    #  because the focus is on generating the entire sequence rather than what comes after the final token.

    # Bahdanau Attention
    attention = AdditiveAttention(name='attention_layer') #computes a(sj-1,hj)
    context_vector = attention([decoder_outputs, encoder_outputs])  #refer the paper for the formulae ci, alpha ij
    concat_outputs = Concatenate(axis=-1)([decoder_outputs, context_vector])
    # concat_outputs is a tensor where, for each timestep, the decoder's hidden state is concatenated with the corresponding context vector.
    # This combined information is then used for generating the final output sequence.

    # Dense layer
    dense_outputs = Dense(vocab_size, activation='softmax', name='dense_layer')(concat_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], dense_outputs)  # list of input layers and the foutput layer
    return model

def train_and_evaluate(file_path, model_dir):
    data_loader = DataLoader(file_path)
    paragraphs = data_loader.data['Paragraph'].tolist()
    keyphrases = data_loader.data['Keyphrases'].tolist()

    cfg.word_to_id, cfg.id_to_word = data_loader.build_vocab(paragraphs + keyphrases, cfg.num_words)

    (train_encoder_inputs, train_decoder_outputs, train_decoder_inputs), (test_encoder_inputs, test_decoder_outputs, test_decoder_inputs) = data_loader.get_data(cfg.word_to_id)

    max_encoder_len = max(map(len, train_encoder_inputs + test_encoder_inputs))
    max_decoder_len = max(map(len, train_decoder_outputs + test_decoder_outputs))

    train_encoder_inputs = pad_sequences(train_encoder_inputs, maxlen=max_encoder_len, padding='post')
    train_decoder_outputs = pad_sequences(train_decoder_outputs, maxlen=max_decoder_len, padding='post')
    train_decoder_inputs = pad_sequences(train_decoder_inputs, maxlen=max_decoder_len, padding='post')

    test_encoder_inputs = pad_sequences(test_encoder_inputs, maxlen=max_encoder_len, padding='post')
    test_decoder_outputs = pad_sequences(test_decoder_outputs, maxlen=max_decoder_len, padding='post')
    test_decoder_inputs = pad_sequences(test_decoder_inputs, maxlen=max_decoder_len, padding='post')

    model = build_model(cfg.vocab_size, cfg.embed_dim, cfg.num_units)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    model.fit([train_encoder_inputs, train_decoder_inputs], train_decoder_outputs,
              batch_size=cfg.batch_size, epochs=cfg.num_rounds, validation_data=([test_encoder_inputs, test_decoder_inputs], test_decoder_outputs))

    model.save(model_dir)

def get_bert_embedding(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors='pt')
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

def predict_paragraph(paragraph, model_dir):
    data_loader = DataLoader()  # No file needed for single prediction
    paragraph_sequence = data_loader.text_to_sequence(paragraph, cfg.word_to_id)
    encoder_inputs = pad_sequences([paragraph_sequence], padding='post', maxlen=125)  # Adjust maxlen if necessary

    model = tf.keras.models.load_model(model_dir, custom_objects={'AdditiveAttention': AdditiveAttention})

    decoder_input = np.array([[cfg.SOS_TOKEN]])
    decoded_sentence = []

    for _ in range(100):  # assuming max length of 100
        output_tokens = model.predict([encoder_inputs, decoder_input])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = cfg.id_to_word.get(sampled_token_index, '<UNK>')

        if sampled_token in ['<EOS>', '<SOS>', '<UNK>', '<PAD>']:
            break

        decoded_sentence.append(sampled_token)
        decoder_input = np.hstack([decoder_input, np.array([[sampled_token_index]])])

    keyphrases = set(decoded_sentence)

    print("Generated Keyphrases:", keyphrases)

    # Load BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    # Find sentences in the paragraph containing keyphrases
    sentences = paragraph.split('.')
    sentences_with_keyphrases = []

    for sentence in sentences:
        for keyphrase in keyphrases:
            if keyphrase in sentence:
                sentences_with_keyphrases.append(sentence.strip())
                break

    print("\nSentences containing keyphrases:")
    context_embeddings = get_bert_embedding(paragraph, bert_model, tokenizer)
    for sentence in sentences_with_keyphrases:
        sentence_embedding = get_bert_embedding(sentence, bert_model, tokenizer)
        similarity = cosine_similarity(context_embeddings, sentence_embedding)
        if similarity > 0.7:  # Adjust threshold as needed
            print(sentence)

    return ' '.join(decoded_sentence)

def main():
    from google.colab import files
    uploaded = files.upload()

    for file_name in uploaded.keys():
        file_path = file_name

    model_dir = '/content/seq2seq_model'
    train_and_evaluate(file_path, model_dir)

    # Example prediction
    paragraph = "Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments. Astronauts spend months training for their missions to explore outer Space. The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations. Everyone needs their own personal Space to feel comfortable. Space tourism is becoming a reality, with private companies planning to send civilians to space. Meanwhile, advancements in artificial intelligence are transforming industries across the globe. The importance of renewable energy cannot be overstated. The Space in the room is congested. Cybersecurity is crucial to protect personal information in cyber Space."
    keyphrases = predict_paragraph(paragraph, model_dir)
    print("Predicted Keyphrases:", keyphrases)

if __name__ == '__main__':
    main()



Saving filtered_space_exploration_keyphrases.xlsx to filtered_space_exploration_keyphrases (8).xlsx
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Generated Keyphrases: {'black', 'Space'}

Sentences containing keyphrases:
Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments
Astronauts spend months training for their missions to explore outer Space
Space tourism is becoming a reality, with private companies planning to send civilians to space
Cybersecurity is crucial to protect personal information in cyber Space
Predicted Keyphrases: black Space Space Space Space Space Space


In [None]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from gensim.models import Word2Vec
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')
model_sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Sample document
doc1 = """Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments.
Astronauts spend months training for their missions to explore outer space.
Space tourism is becoming a reality, with private companies planning to send civilians to space.
Cybersecurity is crucial to protect personal information in cyberspace.
The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations.
Everyone needs their own personal Space to feel comfortable."""
  # Everyone needs their own personal Space to feel comfortable. Space tourism is becoming a reality, with private companies planning to send civilians to space. Meanwhile, advancements in artificial intelligence are transforming industries across the globe. The importance of renewable energy cannot be overstated. The Space in the room is congested. Cybersecurity is crucial to protect personal information in cyber Space."
# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    return jaccard_score(X.toarray()[0], X.toarray()[1])

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)

    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# BERT Similarity
def bert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt')
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)

    return np.dot(vec1, vec2.T) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# SBERT Similarity
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

def hybrid_similarity(doc1, doc2):
    weights = [0.1, 0.20, 0.20, 0.25, 0.25]
    high_threshold = 0.8
    low_threshold = 0.2

    jaccard = jaccard_similarity(doc1, doc2)
    tfidf = tfidf_similarity(doc1, doc2)
    w2v = word2vec_similarity(doc1, doc2)
    bert = bert_similarity(doc1, doc2)[0][0]
    sbert = sbert_similarity(doc1, doc2)

    scores = [jaccard, tfidf, w2v, bert, sbert]
    normalized_scores = normalize_scores(scores)

    if all(score > high_threshold for score in normalized_scores):
        highest_two_avg = np.mean(sorted(normalized_scores)[-2:])
        final_score = highest_two_avg
        weights = [0.1, 0.1, 0.1, 0.35, 0.35]
    elif all(score < low_threshold for score in normalized_scores):
        lowest_two_avg = np.mean(sorted(normalized_scores)[:2])
        final_score = lowest_two_avg
        weights = [0.3, 0.25, 0.25, 0.1, 0.1]
    else:
        weighted_scores = np.array(normalized_scores) * np.array(weights)
        final_score = np.sum(weighted_scores)

    return final_score

# Split doc1 into sentences
sentences = doc1.split('\n')

# Compute and print similarity scores for each pair of sentences
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        score = hybrid_similarity(sentences[i], sentences[j])
        print(f"Similarity between Sentence {i + 1} and Sentence {j + 1}: {score}")





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Similarity between Sentence 1 and Sentence 2: 0.6239437648301754
Similarity between Sentence 1 and Sentence 3: 0.5385404245222958
Similarity between Sentence 1 and Sentence 4: 0.4166949771709768
Similarity between Sentence 1 and Sentence 5: 0.4976379528650424
Similarity between Sentence 1 and Sentence 6: 0.4803276670193618
Similarity between Sentence 2 and Sentence 3: 0.5186347642744739
Similarity between Sentence 2 and Sentence 4: 0.37507193551578255
Similarity between Sentence 2 and Sentence 5: 0.4978667441238725
Similarity between Sentence 2 and Sentence 6: 0.43743267487392434
Similarity between Sentence 3 and Sentence 4: 0.453920696810722
Similarity between Sentence 3 and Sentence 5: 0.46172762771858533
Similarity between Sentence 3 and Sentence 6: 0.5638159321788667
Similarity between Sentence 4 and Sentence 5: 0.5102454902392545
Similarity between Sentence 4 and Sentence 6: 0.4675859682692919
Similarity between Sentence 5 and Sentence 6: 0.4296332835218064


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')
def calculate_threshold(similarity_scores, text_length):
    # Statistical approach
    mean_similarity = sum(similarity_scores) / len(similarity_scores)
    std_dev = (sum((x - mean_similarity) ** 2 for x in similarity_scores) / len(similarity_scores)) ** 0.5
    stat_threshold = mean_similarity - 1.5 * std_dev

    # Percentile approach
    percentile_threshold = np.percentile(similarity_scores, 10)

    # Adjust based on text length
    length_factor = min(1, text_length / 1000)  # Normalize for texts up to 1000 words

    # Combine methods
    threshold = (stat_threshold + percentile_threshold) / 2
    threshold = threshold * (1 - length_factor) + percentile_threshold * length_factor

    return threshold

data = [
    # "This is an example sentence about machine learning.",
    # "Bananas are yellow and tasty.",
    # "Machine learning models are used for predictions.",
    # "The weather today is sunny.",
    # "Deep learning is a subset of machine learning.",
    # "Cats are cute and playful."
    # "It is raining now",
    # "The weather looks good today",
    # "It might become hot at night today",
    # "My neck hurts today"
    # "I drank banana milkshake",
    # "The banana was very sweet",
    # "An apple was red",
    # "My neck hurts today"
    # "Two people are walking near the two cars parked on the two way trunk road.",
    # "Suddenly two more people joined and four are walking.",
    # "two plus two makes four"
    "Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments.",
    " Astronauts spend months training for their missions to explore outer Space.",
    " The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations.",
    "Everyone needs their own personal Space to feel comfortable.",
    "Space tourism is becoming a reality, with private companies planning to send civilians to space.",
    # "Meanwhile, advancements in artificial intelligence are transforming industries across the globe.",
    # "The importance of renewable energy cannot be overstated.",
    "The Space in the room is congested.",
    " Cybersecurity is crucial to protect personal information in cyber Space."

]

if len(data) < 3 :
  print("We need a minimum of 3 sentences to derive context from the given text")
  exit()
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    # tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens1 = []
    for i in tokens:
      if i not in stopwords.words('english'):
        tokens1.append(i)

    return ' '.join(tokens1)

cleaned_data = []
for sentence in data:
  cleaned_data.append(preprocess_text(sentence))

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

embeddings = model.encode(cleaned_data, convert_to_tensor=True)
central_theme_embedding = torch.mean(embeddings, dim=0)

similarity_scores = []
for embedding in embeddings:
  similarity_scores.append(util.pytorch_cos_sim(embedding, central_theme_embedding).item())
  print(util.pytorch_cos_sim(embedding, central_theme_embedding).item())


# Usage
text_length = sum(len(sentence.split()) for sentence in data)
threshold = calculate_threshold(similarity_scores, text_length)
print("Threshold:", threshold)
irrelevant_sentences = []

for i, score in enumerate(similarity_scores):
  if score < threshold:
    irrelevant_sentences.append(data[i])

print("Irrelevant Sentences in the given text are:")
print(irrelevant_sentences)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0.6156331300735474
0.648171067237854
0.5137473940849304
0.6060574054718018
0.6619881391525269
0.5701948404312134
0.5292788743972778
Threshold: 0.5186389994859476
Irrelevant Sentences in the given text are:
[' The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations.']


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')

data = [
    "Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments.",
    " Astronauts spend months training for their missions to explore outer Space."," The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations.",
    "Everyone needs their own personal Space to feel comfortable.",
    "Space tourism is becoming a reality, with private companies planning to send civilians to space.",
    # "Meanwhile, advancements in artificial intelligence are transforming industries across the globe.",
    # "The importance of renewable energy cannot be overstated.",
    "The Space in the room is congested.",
    " Cybersecurity is crucial to protect personal information in cyber Space."
]

if len(data) < 3:
    print("We need a minimum of 3 sentences to derive context from the given text")
    exit()
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens1 = [i for i in tokens if i not in stopwords.words('english')]
    return ' '.join(tokens1)

cleaned_data = [preprocess_text(sentence) for sentence in data]

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

embeddings = model.encode(cleaned_data, convert_to_tensor=True)

similarity_matrix = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()
# print(similarity_matrix)

avg_similarities = np.mean(similarity_matrix, axis=1)

threshold = np.mean(avg_similarities) - np.std(avg_similarities)
print(threshold)

irrelevant_indices = np.where(avg_similarities > threshold)[0]
irrelevant_sentences = [data[i] for i in irrelevant_indices]

print("Similarity Matrix:")
print(similarity_matrix)
print("\nAverage Similarities:")
print(avg_similarities)
print("\nIrrelevant Sentences in the given text are:")
print(irrelevant_sentences)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0.3210798
Similarity Matrix:
[[0.99999994 0.6310089  0.29350367 0.10637853 0.34520525 0.13779137
  0.06132852]
 [0.6310089  1.0000002  0.3496607  0.12253731 0.37589586 0.08275425
  0.11256436]
 [0.29350367 0.3496607  0.99999994 0.01171482 0.24408276 0.01250419
  0.3344466 ]
 [0.10637853 0.12253731 0.01171482 1.         0.34167624 0.54474217
  0.31631863]
 [0.34520525 0.37589586 0.24408276 0.34167624 1.         0.26944038
  0.22418846]
 [0.13779137 0.08275425 0.01250419 0.54474217 0.26944038 1.
  0.19336566]
 [0.06132852 0.11256436 0.3344466  0.31631863 0.22418846 0.19336566
  0.9999999 ]]

Average Similarities:
[0.367888   0.38206023 0.32084468 0.3490525  0.40006986 0.32008544
 0.32031605]

Irrelevant Sentences in the given text are:
['Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments.', ' Astronauts spend months training for their missions to explore outer Space.', 'E

In [None]:
!pip install accelerate -U


Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
!pip install accelerate -U
!pip install transformers[torch]


Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
import pandas as pd
from transformers import BertTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from google.colab import files

# Upload dataset
uploaded = files.upload()

# Load dataset
file_path = list(uploaded.keys())[0]
df = pd.read_excel(file_path)

# Inspect the DataFrame structure
print(df.head())

# Ensure the column names match your Excel file
text_column = 'Paragraphs'  # Change this if the text column has a different name
label_column = 'Keyphrases'  # Change this if the label column has a different name

# Map string labels to integers
label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
df[label_column] = df[label_column].map(label_mapping)

# Define a Dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Preprocessing
def preprocess_data(data, max_len=100):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    texts = data[text_column].tolist()
    labels = data[label_column].tolist()
    return CustomDataset(texts, labels, tokenizer, max_len)

# Split data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = preprocess_data(train_df)
test_dataset = preprocess_data(test_df)

# Load model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=len(label_mapping))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',  # Use eval_strategy instead of evaluation_strategy
    learning_rate=1e-5,
    save_total_limit=2,
    save_steps=1000,
    load_best_model_at_end=True,
    save_strategy='epoch'  # Ensure save_strategy matches eval_strategy
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=None  # Define metrics if needed
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

# Predict on the test dataset
predictions = trainer.predict(test_dataset)

# Extract the predicted class labels
predicted_labels = predictions.predictions.argmax(axis=1)

# Invert the label mapping dictionary
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Convert integer predictions to string labels
predicted_labels_str = [inverse_label_mapping[label] for label in predicted_labels]

# Create a DataFrame with the original paragraphs and their predicted keyphrases
output_df = pd.DataFrame({
    'Paragraphs': test_df[text_column].values,
    'Predicted_Keyphrases': predicted_labels_str
})

# Save the DataFrame to a CSV file
output_file_path = 'predicted_keyphrases.csv'
output_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")


Saving space_paragraphs.xlsx to space_paragraphs (1).xlsx
                                          Paragraphs  \
0  The exoplanet is a fascinating subject in mode...   
1  The spacecraft is a fascinating subject in mod...   
2  The exoplanet is a fascinating subject in mode...   
3  The art exhibition was really enjoyable. The e...   
4  The astronaut is a fascinating subject in mode...   

                                          Keyphrases  
0  astronomy, exoplanet, meteor shower, alien lif...  
1                          spacecraft, rocket launch  
2  exoplanet, space exploration, black hole, luna...  
3                                  exoplanet, nebula  
4                              astronaut, alien life  


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,3.892043
2,3.884600,3.892265
3,3.884600,3.892617
4,3.893300,3.893108
5,3.893300,3.893768
6,3.899500,3.894547
7,3.899500,3.895439
8,3.888900,3.896554
9,3.888900,3.89783
10,3.871100,3.899096


{'eval_loss': 3.892042636871338, 'eval_runtime': 0.1063, 'eval_samples_per_second': 94.034, 'eval_steps_per_second': 18.807, 'epoch': 10.0}
Predictions saved to predicted_keyphrases.csv


In [None]:
!pip install accelerate -U
!pip install transformers[torch]


Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.32.1


In [None]:
import pandas as pd
from transformers import BertTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from google.colab import files

# Upload dataset
uploaded = files.upload()

# Load dataset
file_path = list(uploaded.keys())[0]
df = pd.read_excel(file_path)

# Inspect the DataFrame structure
print(df.head())

# Ensure the column names match your Excel file
text_column = 'Paragraph'  # Change this if the text column has a different name
label_column = 'Keyphrases'  # Change this if the label column has a different name

# Map string labels to integers
label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
df[label_column] = df[label_column].map(label_mapping)

# Define a Dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Preprocessing
def preprocess_data(data, max_len=100):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    texts = data[text_column].tolist()
    labels = data[label_column].tolist()
    return CustomDataset(texts, labels, tokenizer, max_len)

# Split data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = preprocess_data(train_df)
test_dataset = preprocess_data(test_df)

# Load model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=len(label_mapping))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',  # Use eval_strategy instead of evaluation_strategy
    learning_rate=1e-5,
    save_total_limit=2,
    save_steps=1000,
    load_best_model_at_end=True,
    save_strategy='epoch'  # Ensure save_strategy matches eval_strategy
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=None  # Define metrics if needed
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

# Predict on the test dataset
predictions = trainer.predict(test_dataset)

# Extract the predicted class labels
predicted_labels = predictions.predictions.argmax(axis=1)

# Invert the label mapping dictionary
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Convert integer predictions to string labels
predicted_labels_str = [inverse_label_mapping[label] for label in predicted_labels]

# Create a DataFrame with the original paragraphs and their predicted keyphrases
output_df = pd.DataFrame({
    'Paragraphs': test_df[text_column].values,
    'Predicted_Keyphrases': predicted_labels_str
})

# Save the DataFrame to a CSV file
output_file_path = 'predicted_keyphrases.csv'
output_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")

# Define the function to predict keyphrases for a given paragraph
def predict_paragraph(paragraph, model, tokenizer, max_len=100):
    model.eval()
    inputs = tokenizer.encode_plus(
        paragraph,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_label = logits.argmax(axis=1).item()

    return inverse_label_mapping[predicted_label]

# Example usage of the predict_paragraph function
paragraph = "Astronauts aboard the International Space Station conduct experiments that have applications on Earth, such as developing new materials and improving medical treatments. Astronauts spend months training for their missions to explore outer Space. The search for extraterrestrial intelligence involves scanning the cosmos for signals that might indicate the presence of alien civilizations. Everyone needs their own personal Space to feel comfortable. Space tourism is becoming a reality, with private companies planning to send civilians to space. Meanwhile, advancements in artificial intelligence are transforming industries across the globe. The importance of renewable energy cannot be overstated. The Space in the room is congested. Cybersecurity is crucial to protect personal information in cyber Space."
keyphrases = predict_paragraph(paragraph, model, BertTokenizer.from_pretrained('bert-base-uncased'))
print("Predicted Keyphrases:", keyphrases)


Saving filtered_space_exploration_keyphrases.xlsx to filtered_space_exploration_keyphrases (2).xlsx
                                           Paragraph  \
0  Space exploration has led to numerous technolo...   
1  The study of black holes has intrigued scienti...   
2  Cosmic microwave background radiation provides...   
3  Astronauts aboard the International Space Stat...   
4  Understanding the effects of microgravity on t...   

                                          Keyphrases  
0  Space exploration, satellites, International S...  
1    black holes, Space missions to Mars, astronauts  
2  Cosmic microwave background radiation, exoplan...  
3  International Space Station, extraterrestrial ...  
4  microgravity, James Webb Space Telescope, Spac...  


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`