In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


# **CNN**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load the abstract / summary pairs
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: pick the two text columns first and only then drop incomplete
# rows, so a gap in some unrelated column cannot discard a usable pair.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Tokenization and padding for the source text (vocabulary fitted on train only)
tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
max_len = 100  # abstracts are truncated / zero-padded at the end to this length
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
max_summary_len = 50  # summaries are truncated / zero-padded at the end to this length
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Size of the fitted target vocabulary (+1 for the padding id 0)
vocabulary_size = len(target_tokenizer.word_index) + 1
# The tokenizer never emits ids >= num_words, so that bounds the class count.
num_classes = min(5000, vocabulary_size)

# CNN encoder with a dense head. The targets are integer token ids, so every
# one of the max_summary_len output positions needs its own softmax over the
# target vocabulary, trained with the sparse categorical loss.
model = tf.keras.Sequential([
    # NOTE(review): output_dim=1 gives each token a single scalar feature; the
    # other models in this notebook use 100-128 - confirm this is intended.
    tf.keras.layers.Embedding(input_dim=5000, output_dim=1, input_length=max_len),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),  # Adding dropout for regularization
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(max_summary_len * num_classes),
    tf.keras.layers.Reshape((max_summary_len, num_classes)),
    tf.keras.layers.Softmax(),  # normalises over the last (vocabulary) axis
])


# Compile the model; y_train holds integer ids, hence the sparse loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Per-position token probabilities: (samples, max_summary_len, num_classes)
predicted_summaries = model.predict(X_test)

# Greedy decoding: most probable token id per position, then ids -> words.
# sequences_to_texts skips ids it has no word for, which removes padding (0).
predicted_ids = np.argmax(predicted_summaries, axis=-1)
predicted_summaries_text = target_tokenizer.sequences_to_texts(predicted_ids.tolist())
actual_summaries_text = target_tokenizer.sequences_to_texts(y_test.tolist())

from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# corpus_bleu expects, for every hypothesis, a LIST of reference token lists.
references = [[summary.split()] for summary in actual_summaries_text]
hypotheses = [summary.split() for summary in predicted_summaries_text]

# These are BLEU n-gram scores from NLTK, not ROUGE, and are labelled as such.
bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
bleu_2 = corpus_bleu(references, hypotheses, weights=(0, 1, 0, 0))
bleu_3 = corpus_bleu(references, hypotheses, weights=(0, 0, 1, 0))
bleu_4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

# Backward-compatible aliases for code that still reads the old rouge_* names;
# the values they hold are the BLEU scores computed above.
rouge_1, rouge_2, rouge_l, rouge_lsum = bleu_1, bleu_2, bleu_3, bleu_4

print("BLEU 1-gram:", bleu_1)
print("BLEU 2-gram:", bleu_2)
print("BLEU 3-gram:", bleu_3)
print("BLEU-4 (cumulative):", bleu_4)




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ROUGE-1: 0
ROUGE-2: 0
ROUGE-L: 0
ROUGE-Lsum: 0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the abstract / summary pairs
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: pick the two text columns first and only then drop incomplete
# rows, so a gap in some unrelated column cannot discard a usable pair.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Tokenization and padding for text data.
# With an out-of-vocabulary token, words that are unseen or outside the
# num_words cap become '<OOV>' instead of being silently removed.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
max_len = 100  # abstracts are truncated / zero-padded at the end to this length
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract), same OOV handling
target_tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
max_summary_len = 50  # summaries are truncated / zero-padded at the end to this length
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build a sequence-to-sequence model: a CNN encoder pools the abstract into one
# vector, which is repeated max_summary_len times and decoded by an LSTM into a
# per-position softmax over the target vocabulary.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_len),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.RepeatVector(max_summary_len),  # one copy of the encoding per output position
    tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),  # Added dropout
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(target_tokenizer.word_index) + 1, activation='softmax'))
])

# Compile the model with 'sparse_categorical_crossentropy' loss (targets are integer ids)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=16)

# Per-position token probabilities for the test data
predicted_summaries = model.predict(X_test)


# Implement beam search
def beam_search_decoder(data, k):
    """Return the ``k`` most probable token sequences for one prediction.

    Args:
        data: 2-D array-like (timesteps x vocabulary); each row holds the
            per-token probabilities for one output position.
        k: Beam width, i.e. how many hypotheses are kept after every step.

    Returns:
        A list of at most ``k`` ``[token_ids, score]`` pairs, best first.
        ``score`` is the summed negative log-probability (lower is better).
    """
    eps = 1e-12  # probability floor so log(0) never produces inf/NaN scores
    sequences = [[list(), 0.0]]
    for row in data:
        # One vectorised log per step instead of one call per candidate.
        costs = -np.log(np.clip(np.asarray(row, dtype=float), eps, 1.0))
        # Rows are scored independently of the prefix, so only the k cheapest
        # tokens of a row can ever make it into the overall top k. The stable
        # sort keeps lower token ids first among equal costs.
        best_tokens = np.argsort(costs, kind="stable")[:k]
        all_candidates = [
            [seq + [int(j)], score + float(costs[j])]
            for seq, score in sequences
            for j in best_tokens
        ]
        # Log-probabilities add up along a sequence; keep the k lowest totals.
        all_candidates.sort(key=lambda cand: cand[1])
        sequences = all_candidates[:k]
    return sequences

# Perform beam search for each example in the test data
beam_search_results = [beam_search_decoder(sample, k=3) for sample in predicted_summaries]

# Convert sequences back to text and print examples.
# The count is capped at the test-set size so a small split cannot raise IndexError.
for i in range(min(10, len(y_test))):  # raise the 10 to print more examples
    # Token id 0 is the padding value, so it is left out of the text.
    actual_summary = " ".join([target_tokenizer.index_word.get(word, "") for word in y_test[i] if word != 0])
    predicted_summaries_text = []
    # Each beam result is a [token_ids, score] pair; only the ids are rendered.
    for beam_result in beam_search_results[i]:
        predicted_summary = " ".join([target_tokenizer.index_word.get(word, "") for word in beam_result[0] if word != 0])
        predicted_summaries_text.append(predicted_summary)
    print(f"Example - Actual Summary: {actual_summary}")
    print(f"Example - Predicted Summaries: {predicted_summaries_text}\n")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Example - Actual Summary: are the primary for the of and and models have in understanding these however the of a such as a should factors this scoping review aims to identify and the most factors as well as for of and
Example - Predicted Summaries: ['the of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of', 'of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of', 'the the of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of']

Example - Actual Summary: the of clinical is a before more natural language processing models that high accuracy for 1 experience a large of accuracy when to the of this study 

# **RNN**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load the abstract / summary pairs
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: pick the two text columns first and only then drop incomplete
# rows, so a gap in some unrelated column cannot discard a usable pair.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Tokenization and padding for the source text (vocabulary fitted on train only)
tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
max_len = 100  # abstracts are truncated / zero-padded at the end to this length
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
# Must equal max_len: the stacked RNN emits one prediction per input position.
max_summary_len = 100
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build the RNN model: five stacked SimpleRNN layers that all return the full
# sequence, followed by a per-position softmax over 5000 token ids.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=100, input_length=max_len),
    tf.keras.layers.SimpleRNN(256,  activation='elu',  return_sequences=True),
    tf.keras.layers.SimpleRNN(256,  activation='elu',  return_sequences=True),
    tf.keras.layers.SimpleRNN(256,  activation='elu',  return_sequences=True),
    tf.keras.layers.SimpleRNN(256,  activation='elu',  return_sequences=True),
    tf.keras.layers.SimpleRNN(256,  activation='elu',  return_sequences=True),

    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(5000, activation='softmax'))
])


# Compile the model; y_train holds integer ids, hence the sparse loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Per-position token probabilities: (samples, max_summary_len, 5000)
predicted_summaries = model.predict(X_test)

# Greedy decoding: most probable token id per position, then ids -> words.
# sequences_to_texts skips ids it has no word for, which removes padding (0).
predicted_ids = np.argmax(predicted_summaries, axis=-1)
predicted_summaries_text = target_tokenizer.sequences_to_texts(predicted_ids.tolist())
actual_summaries_text = target_tokenizer.sequences_to_texts(y_test.tolist())


from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# corpus_bleu expects one token list per hypothesis and, for every hypothesis,
# a LIST of reference token lists.
references = [[summary.split()] for summary in actual_summaries_text]
hypotheses = [summary.split() for summary in predicted_summaries_text]

# These are BLEU n-gram scores from NLTK, not ROUGE, and are labelled as such.
bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
bleu_2 = corpus_bleu(references, hypotheses, weights=(0, 1, 0, 0))
bleu_3 = corpus_bleu(references, hypotheses, weights=(0, 0, 1, 0))
bleu_4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

# Backward-compatible aliases for code that still reads the old rouge_* names;
# the values they hold are the BLEU scores computed above.
rouge_1, rouge_2, rouge_l, rouge_lsum = bleu_1, bleu_2, bleu_3, bleu_4

print("BLEU 1-gram:", bleu_1)
print("BLEU 2-gram:", bleu_2)
print("BLEU 3-gram:", bleu_3)
print("BLEU-4 (cumulative):", bleu_4)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ROUGE-1: 0
ROUGE-2: 0
ROUGE-L: 0
ROUGE-Lsum: 0


In [None]:
# RNN print actual summary and predicted summary

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the abstract / summary pairs
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: pick the two text columns first and only then drop incomplete
# rows, so a gap in some unrelated column cannot discard a usable pair.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Tokenization and padding for the source text (vocabulary fitted on train only)
tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
max_len = 100  # abstracts are truncated / zero-padded at the end to this length
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
# Must equal max_len: the stacked RNN emits one prediction per input position.
max_summary_len = 100
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build the RNN model: five stacked SimpleRNN layers that all return the full
# sequence, followed by a per-position softmax over 5000 token ids.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=100, input_length=max_len),
    tf.keras.layers.SimpleRNN(256, activation='elu', return_sequences=True),
    tf.keras.layers.SimpleRNN(256, activation='elu', return_sequences=True),
    tf.keras.layers.SimpleRNN(256, activation='elu', return_sequences=True),
    tf.keras.layers.SimpleRNN(256, activation='elu', return_sequences=True),
    tf.keras.layers.SimpleRNN(256, activation='elu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(5000, activation='softmax'))
])

# Compile the model; y_train holds integer ids, hence the sparse loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Per-position token probabilities: (samples, max_summary_len, 5000)
predicted_summaries = model.predict(X_test)

# Implement beam search
def beam_search_decoder(data, k):
    """Return the ``k`` most probable token sequences for one prediction.

    Args:
        data: 2-D array-like (timesteps x vocabulary); each row holds the
            per-token probabilities for one output position.
        k: Beam width, i.e. how many hypotheses are kept after every step.

    Returns:
        A list of at most ``k`` ``[token_ids, score]`` pairs, best first.
        ``score`` is the summed negative log-probability (lower is better).
    """
    eps = 1e-12  # probability floor so log(0) never produces inf/NaN scores
    sequences = [[list(), 0.0]]
    for row in data:
        # One vectorised log per step instead of one call per candidate.
        costs = -np.log(np.clip(np.asarray(row, dtype=float), eps, 1.0))
        # Rows are scored independently of the prefix, so only the k cheapest
        # tokens of a row can ever make it into the overall top k. The stable
        # sort keeps lower token ids first among equal costs.
        best_tokens = np.argsort(costs, kind="stable")[:k]
        all_candidates = [
            [seq + [int(j)], score + float(costs[j])]
            for seq, score in sequences
            for j in best_tokens
        ]
        # Log-probabilities add up along a sequence; keep the k lowest totals.
        all_candidates.sort(key=lambda cand: cand[1])
        sequences = all_candidates[:k]
    return sequences

# Perform beam search for each example in the test data
beam_search_results = [beam_search_decoder(sample, k=3) for sample in predicted_summaries]

# Convert sequences back to text and print examples.
# The count is capped at the test-set size so a small split cannot raise IndexError.
for i in range(min(10, len(y_test))):  # raise the 10 to print more examples
    # Token id 0 is the padding value, so it is left out of the text.
    actual_summary = " ".join([target_tokenizer.index_word.get(word, "") for word in y_test[i] if word != 0])
    predicted_summaries_text = []
    # Each beam result is a [token_ids, score] pair; only the ids are rendered.
    for beam_result in beam_search_results[i]:
        predicted_summary = " ".join([target_tokenizer.index_word.get(word, "") for word in beam_result[0] if word != 0])
        predicted_summaries_text.append(predicted_summary)
    print(f"Example - Actual Summary: {actual_summary}")
    print(f"Example - Predicted Summaries: {predicted_summaries_text}\n")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Example - Actual Summary: are the primary for the of and and models have in understanding these however the of a such as a should factors this scoping review aims to identify and the most factors as well as for of and
Example - Predicted Summaries: ['the the', 'to the', 'structured the']

Example - Actual Summary: the of clinical is a before more natural language processing models that high accuracy for 1 experience a large of accuracy when to the of this study is to develop methods that clinical the with improved we found improved models only when with in samples improving the score from 0 to 0
Example - Predicted Summaries: ['the', 'of', 'more']

Example - Actual Summary: the prevention of for patients with is still a great in clinical practice there are studies that to search for strategies to the and life for these patients we aim to the efficacy between different reported treatments by meta analysis most of the studies were high qu

# **LSTM**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load the abstract / summary pairs
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: pick the two text columns first and only then drop incomplete
# rows, so a gap in some unrelated column cannot discard a usable pair.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Tokenization and padding for the source text (vocabulary fitted on train only)
tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
max_len = 100  # abstracts are truncated / zero-padded at the end to this length
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
# Must equal max_len: the LSTM emits one prediction per input position.
max_summary_len = 100
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build the LSTM model: one sequence-returning LSTM followed by a per-position
# softmax over 5000 token ids.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=100, input_length=max_len),
    tf.keras.layers.LSTM(256, return_sequences=True, activation='relu'),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(5000, activation='softmax'))
])


# Compile the model; y_train holds integer ids, hence the sparse loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Per-position token probabilities: (samples, max_summary_len, 5000)
predicted_summaries = model.predict(X_test)

# Greedy decoding: most probable token id per position, then ids -> words.
# sequences_to_texts skips ids it has no word for, which removes padding (0).
predicted_ids = np.argmax(predicted_summaries, axis=-1)
predicted_summaries_text = target_tokenizer.sequences_to_texts(predicted_ids.tolist())
actual_summaries_text = target_tokenizer.sequences_to_texts(y_test.tolist())


from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# corpus_bleu expects one token list per hypothesis and, for every hypothesis,
# a LIST of reference token lists.
references = [[summary.split()] for summary in actual_summaries_text]
hypotheses = [summary.split() for summary in predicted_summaries_text]

# These are BLEU n-gram scores from NLTK, not ROUGE, and are labelled as such.
bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
bleu_2 = corpus_bleu(references, hypotheses, weights=(0, 1, 0, 0))
bleu_3 = corpus_bleu(references, hypotheses, weights=(0, 0, 1, 0))
bleu_4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

# Backward-compatible aliases for code that still reads the old rouge_* names;
# the values they hold are the BLEU scores computed above.
rouge_1, rouge_2, rouge_l, rouge_lsum = bleu_1, bleu_2, bleu_3, bleu_4

print("BLEU 1-gram:", bleu_1)
print("BLEU 2-gram:", bleu_2)
print("BLEU 3-gram:", bleu_3)
print("BLEU-4 (cumulative):", bleu_4)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ROUGE-1: 0
ROUGE-2: 0
ROUGE-L: 0
ROUGE-Lsum: 0


In [None]:
#LSTM - PRINTING ACTUAL AND PREDICTED SUMMARIES TEXT
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load the abstract / summary pairs
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: pick the two text columns first and only then drop incomplete
# rows, so a gap in some unrelated column cannot discard a usable pair.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Tokenization and padding for the source text (vocabulary fitted on train only)
tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
max_len = 100  # abstracts are truncated / zero-padded at the end to this length
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
# Must equal max_len: the LSTM emits one prediction per input position.
max_summary_len = 100
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build the LSTM model: one sequence-returning LSTM followed by a per-position
# softmax over 5000 token ids.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=100, input_length=max_len),
    tf.keras.layers.LSTM(256, return_sequences=True, activation='relu'),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(5000, activation='softmax'))
])

# Compile the model; y_train holds integer ids, hence the sparse loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Per-position token probabilities: (samples, max_summary_len, 5000)
predicted_summaries = model.predict(X_test)

# Implement beam search
def beam_search_decoder(data, k):
    """Return the ``k`` most probable token sequences for one prediction.

    Args:
        data: 2-D array-like (timesteps x vocabulary); each row holds the
            per-token probabilities for one output position.
        k: Beam width, i.e. how many hypotheses are kept after every step.

    Returns:
        A list of at most ``k`` ``[token_ids, score]`` pairs, best first.
        ``score`` is the summed negative log-probability (lower is better).
    """
    eps = 1e-12  # probability floor so log(0) never produces inf/NaN scores
    sequences = [[list(), 0.0]]
    for row in data:
        # One vectorised log per step instead of one call per candidate.
        costs = -np.log(np.clip(np.asarray(row, dtype=float), eps, 1.0))
        # Rows are scored independently of the prefix, so only the k cheapest
        # tokens of a row can ever make it into the overall top k. The stable
        # sort keeps lower token ids first among equal costs.
        best_tokens = np.argsort(costs, kind="stable")[:k]
        all_candidates = [
            [seq + [int(j)], score + float(costs[j])]
            for seq, score in sequences
            for j in best_tokens
        ]
        # Log-probabilities add up along a sequence; keep the k lowest totals.
        all_candidates.sort(key=lambda cand: cand[1])
        sequences = all_candidates[:k]
    return sequences

# Perform beam search for each example in the test data
beam_search_results = [beam_search_decoder(sample, k=3) for sample in predicted_summaries]

# Convert sequences back to text and print examples.
# The count is capped at the test-set size so a small split cannot raise IndexError.
for i in range(min(10, len(y_test))):  # raise the 10 to print more examples
    # Token id 0 is the padding value, so it is left out of the text.
    actual_summary = " ".join([target_tokenizer.index_word.get(word, "") for word in y_test[i] if word != 0])
    predicted_summaries_text = []
    # Each beam result is a [token_ids, score] pair; only the ids are rendered.
    for beam_result in beam_search_results[i]:
        predicted_summary = " ".join([target_tokenizer.index_word.get(word, "") for word in beam_result[0] if word != 0])
        predicted_summaries_text.append(predicted_summary)
    print(f"Example - Actual Summary: {actual_summary}")
    print(f"Example - Predicted Summaries: {predicted_summaries_text}\n")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  candidate = [seq + [j], score * -np.log(row[j])]


Example - Actual Summary: are the primary for the of and and models have in understanding these however the of a such as a should factors this scoping review aims to identify and the most factors as well as for of and
Example - Predicted Summaries: ['in the the the is the the', 'in the the is the the', 'was the the the is the the']

Example - Actual Summary: the of clinical is a before more natural language processing models that high accuracy for 1 experience a large of accuracy when to the of this study is to develop methods that clinical the with improved we found improved models only when with in samples improving the score from 0 to 0
Example - Predicted Summaries: ['the the the the the the the the the the is is is', 'the the the the the the the the the is is is', 'the the the the the the the the the is is is']

Example - Actual Summary: the prevention of for patients with is still a great in clinical practice there are studies that to search for strategies to the and life for the

# **CNN + RNN**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load the abstract / summary pairs
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: pick the two text columns first and only then drop incomplete
# rows, so a gap in some unrelated column cannot discard a usable pair.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Tokenization and padding for the source text (vocabulary fitted on train only).
# The sequences are padded exactly once here.
tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
max_len = 100  # abstracts are truncated / zero-padded at the end to this length
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=5000)  # only token ids below 5000 are emitted
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
# Must match the time steps the model outputs: Conv1D (kernel 5, default
# 'valid' padding) leaves 100 - 4 = 96 steps and MaxPooling1D(2) halves that.
max_summary_len = 48
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')


# Build the CNN-RNN model: a convolution + pooling front end feeding two
# sequence-returning LSTMs and a per-position softmax over 5000 token ids.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=100, input_length=max_len),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),  # You can adjust the filter size and number of filters
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.LSTM(256, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(256, activation='relu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(5000, activation='softmax'))
])

# Compile the model; y_train holds integer ids, hence the sparse loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Per-position token probabilities: (samples, max_summary_len, 5000)
predicted_summaries = model.predict(X_test)

# Greedy decoding: most probable token id per position, then ids -> words.
# sequences_to_texts skips ids it has no word for, which removes padding (0).
predicted_ids = np.argmax(predicted_summaries, axis=-1)
predicted_summaries_text = target_tokenizer.sequences_to_texts(predicted_ids.tolist())
actual_summaries_text = target_tokenizer.sequences_to_texts(y_test.tolist())

from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu expects one token list per hypothesis and, for every hypothesis,
# a LIST of reference token lists.
references = [[summary.split()] for summary in actual_summaries_text]
hypotheses = [summary.split() for summary in predicted_summaries_text]

# These are BLEU n-gram scores from NLTK, not ROUGE, and are labelled as such.
bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
bleu_2 = corpus_bleu(references, hypotheses, weights=(0, 1, 0, 0))
bleu_3 = corpus_bleu(references, hypotheses, weights=(0, 0, 1, 0))
bleu_4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

# Backward-compatible aliases for code that still reads the old rouge_* names;
# the values they hold are the BLEU scores computed above.
rouge_1, rouge_2, rouge_l, rouge_lsum = bleu_1, bleu_2, bleu_3, bleu_4

print("BLEU 1-gram:", bleu_1)
print("BLEU 2-gram:", bleu_2)
print("BLEU 3-gram:", bleu_3)
print("BLEU-4 (cumulative):", bleu_4)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ROUGE-1: 0
ROUGE-2: 0
ROUGE-L: 0
ROUGE-Lsum: 0


In [None]:
#cnn + rnn - PRINTING ACTUAL AND PREDICTED SUMMARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load your dataset
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: select the two text columns first and only then drop missing
# values, so gaps in unrelated spreadsheet columns do not discard usable rows.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Hyper-parameters that have to agree between the tokenizers and the layers
vocab_size = 5000   # Tokenizer(num_words=...) emits ids below this; id 0 is padding
max_len = 100       # length of the padded input sequences
kernel_size = 5     # Conv1D window
pool_size = 2       # MaxPooling1D window
# The model emits one token distribution per pooled timestep, so the padded
# targets need exactly that many positions: (100 - 5 + 1) // 2 = 48.
max_summary_len = (max_len - kernel_size + 1) // pool_size

# Tokenization and padding for text data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=vocab_size)
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build and compile the CNN-RNN model
# NOTE(review): activation='relu' is an unusual choice for LSTM layers (the Keras
# default is tanh) -- confirm it is intentional.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),
    tf.keras.layers.Conv1D(128, kernel_size, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size),
    tf.keras.layers.LSTM(256, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(256, activation='relu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Generate predicted summaries for the test data
predicted_summaries = model.predict(X_test)

# Implement beam search
def beam_search_decoder(data, k):
    """Return the ``k`` most probable token sequences for one sample.

    Parameters
    ----------
    data : sequence of 1-D array-likes
        One row per timestep; ``row[j]`` is the probability of token ``j`` at
        that timestep.
    k : int
        Beam width, i.e. how many partial sequences are kept after each step.

    Returns
    -------
    list
        Up to ``k`` ``[token_ids, score]`` pairs ordered best first, where
        ``token_ids`` is a list of ints and ``score`` is the accumulated
        negative log-probability of the sequence (lower is better).
    """
    # Smallest positive float: clamping to it keeps log() finite when the
    # softmax has saturated to exactly 0.
    eps = np.finfo(float).tiny
    # Log-probabilities add up, so the empty sequence starts at -log(1) = 0.
    # (Multiplying negative logs does not rank by likelihood, and a single
    # probability of 1.0 would zero the score and tie every later candidate.)
    sequences = [[list(), 0.0]]
    for row in data:
        costs = -np.log(np.maximum(np.asarray(row, dtype=float), eps))
        # Every beam is extended with the same per-token costs, so only the k
        # cheapest tokens of this step can reach the overall top k. The stable
        # sort breaks ties by token id, as a full left-to-right expansion would.
        best_tokens = np.argsort(costs, kind='stable')[:k]
        all_candidates = [
            [seq + [int(j)], score + float(costs[j])]
            for seq, score in sequences
            for j in best_tokens
        ]
        all_candidates.sort(key=lambda tup: tup[1])
        sequences = all_candidates[:k]
    return sequences

# Perform beam search for each example in the test data
beam_search_results = [beam_search_decoder(sample, k=3) for sample in predicted_summaries]

# Ids that map to a word; the padding id 0 is never a key of index_word, and ids
# without an entry are skipped instead of being printed as empty strings.
index_to_word = target_tokenizer.index_word

# Convert sequences back to text and print examples
num_examples = min(10, len(beam_search_results))  # a small test split must not raise IndexError
for i in range(num_examples):  # You can change num_examples to print more examples
    actual_summary = " ".join(index_to_word[word] for word in y_test[i] if word in index_to_word)
    predicted_summaries_text = []
    for beam_result in beam_search_results[i]:
        predicted_summary = " ".join(index_to_word[word] for word in beam_result[0] if word in index_to_word)
        predicted_summaries_text.append(predicted_summary)
    print(f"Example - Actual Summary: {actual_summary}")
    print(f"Example - Predicted Summaries: {predicted_summaries_text}\n")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Example - Actual Summary: are the primary for the of and and models have in understanding these however the of a such as a should factors this scoping review aims to identify and the most factors as well as for of and
Example - Predicted Summaries: ['and the the the the and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and', 'the the the the the and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and', 'and and the the the and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and']

Example - Actual Summary: the of clinical is a before more natural language processing models that high accuracy f

# **biLSTM**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load your dataset
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: select the two text columns first and only then drop missing
# values, so gaps in unrelated spreadsheet columns do not discard usable rows.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Hyper-parameters that have to agree between the tokenizers and the layers
vocab_size = 5000   # Tokenizer(num_words=...) emits ids below this; id 0 is padding
max_len = 100       # length of the padded input sequences
kernel_size = 5     # Conv1D window
pool_size = 2       # MaxPooling1D window
# The model emits one token distribution per pooled timestep, so the padded
# targets need exactly that many positions: (100 - 5 + 1) // 2 = 48.
max_summary_len = (max_len - kernel_size + 1) // pool_size

# Tokenization and padding for text data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=vocab_size)
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build and compile the BiLSTM model
# NOTE(review): activation='relu' is an unusual choice for an LSTM layer (the Keras
# default is tanh) -- confirm it is intentional.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),
    tf.keras.layers.Conv1D(128, kernel_size, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, activation='relu', return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Generate predicted summaries for the test data
predicted_summaries = model.predict(X_test)

# The network ends in a per-timestep softmax, so `predicted_summaries` holds one
# probability distribution per output position. Pick the most likely token id at
# every position before comparing with the (integer) targets.
predicted_token_ids = predicted_summaries.argmax(axis=-1)

# Tokenise both sides as lists of id-strings and drop the padding id 0 so that
# padding can neither match nor dilute the n-gram statistics.
hypotheses = [[str(tok) for tok in seq if tok != 0] for seq in predicted_token_ids]
# corpus_bleu expects, for every hypothesis, a *list* of reference token lists.
references = [[[str(tok) for tok in seq if tok != 0]] for seq in y_test]

# Space-joined forms, kept under the original variable names
predicted_summaries_text = [' '.join(tokens) for tokens in hypotheses]
actual_summaries_text = [' '.join(refs[0]) for refs in references]

from nltk.translate.bleu_score import corpus_bleu

# NOTE: these are BLEU n-gram scores (corpus_bleu), not ROUGE. The variable names
# are kept so that any later cell reading them keeps working; the printed labels
# state what is actually computed.
rouge_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
rouge_2 = corpus_bleu(references, hypotheses, weights=(0, 1, 0, 0))
rouge_l = corpus_bleu(references, hypotheses, weights=(0, 0, 1, 0))
rouge_lsum = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

print("BLEU (1-gram):", rouge_1)
print("BLEU (2-gram):", rouge_2)
print("BLEU (3-gram):", rouge_l)
print("BLEU-4 (cumulative):", rouge_lsum)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ROUGE-1: 0
ROUGE-2: 0
ROUGE-L: 0
ROUGE-Lsum: 0


In [None]:
#bilstm - PRINTING ACTUAL AND PREDICTED SUMMARIES TEXT
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load your dataset
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: select the two text columns first and only then drop missing
# values, so gaps in unrelated spreadsheet columns do not discard usable rows.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Hyper-parameters that have to agree between the tokenizers and the layers
vocab_size = 5000   # Tokenizer(num_words=...) emits ids below this; id 0 is padding
max_len = 100       # length of the padded input sequences
kernel_size = 5     # Conv1D window
pool_size = 2       # MaxPooling1D window
# The model emits one token distribution per pooled timestep, so the padded
# targets need exactly that many positions: (100 - 5 + 1) // 2 = 48.
max_summary_len = (max_len - kernel_size + 1) // pool_size

# Tokenization and padding for text data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=vocab_size)
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build and compile the BiLSTM model
# NOTE(review): activation='relu' is an unusual choice for an LSTM layer (the Keras
# default is tanh) -- confirm it is intentional.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),
    tf.keras.layers.Conv1D(128, kernel_size, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, activation='relu', return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Generate predicted summaries for the test data
predicted_summaries = model.predict(X_test)

# Implement beam search
def beam_search_decoder(data, k):
    """Return the ``k`` most probable token sequences for one sample.

    Parameters
    ----------
    data : sequence of 1-D array-likes
        One row per timestep; ``row[j]`` is the probability of token ``j`` at
        that timestep.
    k : int
        Beam width, i.e. how many partial sequences are kept after each step.

    Returns
    -------
    list
        Up to ``k`` ``[token_ids, score]`` pairs ordered best first, where
        ``token_ids`` is a list of ints and ``score`` is the accumulated
        negative log-probability of the sequence (lower is better).
    """
    # Smallest positive float: clamping to it keeps log() finite when the
    # softmax has saturated to exactly 0.
    eps = np.finfo(float).tiny
    # Log-probabilities add up, so the empty sequence starts at -log(1) = 0.
    # (Multiplying negative logs does not rank by likelihood, and a single
    # probability of 1.0 would zero the score and tie every later candidate.)
    sequences = [[list(), 0.0]]
    for row in data:
        costs = -np.log(np.maximum(np.asarray(row, dtype=float), eps))
        # Every beam is extended with the same per-token costs, so only the k
        # cheapest tokens of this step can reach the overall top k. The stable
        # sort breaks ties by token id, as a full left-to-right expansion would.
        best_tokens = np.argsort(costs, kind='stable')[:k]
        all_candidates = [
            [seq + [int(j)], score + float(costs[j])]
            for seq, score in sequences
            for j in best_tokens
        ]
        all_candidates.sort(key=lambda tup: tup[1])
        sequences = all_candidates[:k]
    return sequences

# Perform beam search for each example in the test data
beam_search_results = [beam_search_decoder(sample, k=3) for sample in predicted_summaries]

# Ids that map to a word; the padding id 0 is never a key of index_word, and ids
# without an entry are skipped instead of being printed as empty strings.
index_to_word = target_tokenizer.index_word

# Convert sequences back to text and print examples
num_examples = min(10, len(beam_search_results))  # a small test split must not raise IndexError
for i in range(num_examples):  # You can change num_examples to print more examples
    actual_summary = " ".join(index_to_word[word] for word in y_test[i] if word in index_to_word)
    predicted_summaries_text = []
    for beam_result in beam_search_results[i]:
        predicted_summary = " ".join(index_to_word[word] for word in beam_result[0] if word in index_to_word)
        predicted_summaries_text.append(predicted_summary)
    print(f"Example - Actual Summary: {actual_summary}")
    print(f"Example - Predicted Summaries: {predicted_summaries_text}\n")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Example - Actual Summary: are the primary for the of and and models have in understanding these however the of a such as a should factors this scoping review aims to identify and the most factors as well as for of and
Example - Predicted Summaries: ['the the the the the in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in of of of of of of of of of of of of', 'the the the the the the in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in of of of of of of of of of of of of', 'the the the the in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in of of of of of of of of of of of of']

Example - Actual Summary: the of clinical is a before more natural language processing models that high accuracy for 1 experience a large of accuracy when to the of this study is to develop methods that clinical the with improved we found impr

# **CNN** + LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load your dataset
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: select the two text columns first and only then drop missing
# values, so gaps in unrelated spreadsheet columns do not discard usable rows.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Hyper-parameters that have to agree between the tokenizers and the layers
vocab_size = 5000   # Tokenizer(num_words=...) emits ids below this; id 0 is padding
max_len = 100       # length of the padded input sequences
kernel_size = 5     # Conv1D window
pool_size = 2       # MaxPooling1D window
# The model emits one token distribution per pooled timestep, so the padded
# targets need exactly that many positions: (100 - 5 + 1) // 2 = 48.
max_summary_len = (max_len - kernel_size + 1) // pool_size

# Tokenization and padding for text data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=vocab_size)
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build and compile the CNN + LSTM model
# NOTE(review): activation='relu' is an unusual choice for an LSTM layer (the Keras
# default is tanh) -- confirm it is intentional.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),
    tf.keras.layers.Conv1D(128, kernel_size, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size),
    tf.keras.layers.LSTM(256, activation='relu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Generate predicted summaries for the test data
predicted_summaries = model.predict(X_test)

# The network ends in a per-timestep softmax, so `predicted_summaries` holds one
# probability distribution per output position. Pick the most likely token id at
# every position before comparing with the (integer) targets.
predicted_token_ids = predicted_summaries.argmax(axis=-1)

# Tokenise both sides as lists of id-strings and drop the padding id 0 so that
# padding can neither match nor dilute the n-gram statistics.
hypotheses = [[str(tok) for tok in seq if tok != 0] for seq in predicted_token_ids]
# corpus_bleu expects, for every hypothesis, a *list* of reference token lists.
references = [[[str(tok) for tok in seq if tok != 0]] for seq in y_test]

# Space-joined forms, kept under the original variable names
predicted_summaries_text = [' '.join(tokens) for tokens in hypotheses]
actual_summaries_text = [' '.join(refs[0]) for refs in references]

from nltk.translate.bleu_score import corpus_bleu

# NOTE: these are BLEU n-gram scores (corpus_bleu), not ROUGE. The variable names
# are kept so that any later cell reading them keeps working; the printed labels
# state what is actually computed.
rouge_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
rouge_2 = corpus_bleu(references, hypotheses, weights=(0, 1, 0, 0))
rouge_l = corpus_bleu(references, hypotheses, weights=(0, 0, 1, 0))
rouge_lsum = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

print("BLEU (1-gram):", rouge_1)
print("BLEU (2-gram):", rouge_2)
print("BLEU (3-gram):", rouge_l)
print("BLEU-4 (cumulative):", rouge_lsum)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ROUGE-1: 0
ROUGE-2: 0
ROUGE-L: 0
ROUGE-Lsum: 0


In [None]:
#cnn+lstm PRINTING ACTUAL AND PREDICTED SUMMARIES

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import corpus_bleu

# Load your dataset
data = pd.read_excel('/content/drive/MyDrive/idp_dataset/pubmed_abstracts_clinicaltextrouge.xlsx')

# Preprocessing: select the two text columns first and only then drop missing
# values, so gaps in unrelated spreadsheet columns do not discard usable rows.
data = data[['Abstract', 'Summarized Abstract']].dropna()

# Split data into training (75%) and test (25%)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=42)

# Hyper-parameters that have to agree between the tokenizers and the layers
vocab_size = 5000   # Tokenizer(num_words=...) emits ids below this; id 0 is padding
max_len = 100       # length of the padded input sequences
kernel_size = 5     # Conv1D window
pool_size = 2       # MaxPooling1D window
# The model emits one token distribution per pooled timestep, so the padded
# targets need exactly that many positions: (100 - 5 + 1) // 2 = 48.
max_summary_len = (max_len - kernel_size + 1) // pool_size

# Tokenization and padding for text data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['Abstract'])
X_train = tokenizer.texts_to_sequences(train_data['Abstract'])
X_test = tokenizer.texts_to_sequences(test_data['Abstract'])
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

# Tokenize and pad the target data (Summarized Abstract)
target_tokenizer = Tokenizer(num_words=vocab_size)
target_tokenizer.fit_on_texts(train_data['Summarized Abstract'])
y_train = target_tokenizer.texts_to_sequences(train_data['Summarized Abstract'])
y_test = target_tokenizer.texts_to_sequences(test_data['Summarized Abstract'])
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post', truncating='post')
y_test = pad_sequences(y_test, maxlen=max_summary_len, padding='post', truncating='post')

# Build and compile the CNN + LSTM model
# NOTE(review): activation='relu' is an unusual choice for an LSTM layer (the Keras
# default is tanh) -- confirm it is intentional.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len),
    tf.keras.layers.Conv1D(128, kernel_size, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size),
    tf.keras.layers.LSTM(256, activation='relu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=16)

# Generate predicted summaries for the test data
predicted_summaries = model.predict(X_test)

# Implement beam search
def beam_search_decoder(data, k):
    """Return the ``k`` most probable token sequences for one sample.

    Parameters
    ----------
    data : sequence of 1-D array-likes
        One row per timestep; ``row[j]`` is the probability of token ``j`` at
        that timestep.
    k : int
        Beam width, i.e. how many partial sequences are kept after each step.

    Returns
    -------
    list
        Up to ``k`` ``[token_ids, score]`` pairs ordered best first, where
        ``token_ids`` is a list of ints and ``score`` is the accumulated
        negative log-probability of the sequence (lower is better).
    """
    # Smallest positive float: clamping to it keeps log() finite when the
    # softmax has saturated to exactly 0.
    eps = np.finfo(float).tiny
    # Log-probabilities add up, so the empty sequence starts at -log(1) = 0.
    # (Multiplying negative logs does not rank by likelihood, and a single
    # probability of 1.0 would zero the score and tie every later candidate.)
    sequences = [[list(), 0.0]]
    for row in data:
        costs = -np.log(np.maximum(np.asarray(row, dtype=float), eps))
        # Every beam is extended with the same per-token costs, so only the k
        # cheapest tokens of this step can reach the overall top k. The stable
        # sort breaks ties by token id, as a full left-to-right expansion would.
        best_tokens = np.argsort(costs, kind='stable')[:k]
        all_candidates = [
            [seq + [int(j)], score + float(costs[j])]
            for seq, score in sequences
            for j in best_tokens
        ]
        all_candidates.sort(key=lambda tup: tup[1])
        sequences = all_candidates[:k]
    return sequences

# Perform beam search for each example in the test data
beam_search_results = [beam_search_decoder(sample, k=3) for sample in predicted_summaries]

# Ids that map to a word; the padding id 0 is never a key of index_word, and ids
# without an entry are skipped instead of being printed as empty strings.
index_to_word = target_tokenizer.index_word

# Convert sequences back to text and print examples
num_examples = min(10, len(beam_search_results))  # a small test split must not raise IndexError
for i in range(num_examples):  # You can change num_examples to print more examples
    actual_summary = " ".join(index_to_word[word] for word in y_test[i] if word in index_to_word)
    predicted_summaries_text = []
    for beam_result in beam_search_results[i]:
        predicted_summary = " ".join(index_to_word[word] for word in beam_result[0] if word in index_to_word)
        predicted_summaries_text.append(predicted_summary)
    print(f"Example - Actual Summary: {actual_summary}")
    print(f"Example - Predicted Summaries: {predicted_summaries_text}\n")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Example - Actual Summary: are the primary for the of and and models have in understanding these however the of a such as a should factors this scoping review aims to identify and the most factors as well as for of and
Example - Predicted Summaries: ['the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the', 'in the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the', 'of the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the']

Example - Actual Summary: the of clinical is a before more natural language processing models that high accuracy for