In [None]:
!pip install transformers
!pip install evaluate
!pip install torch
!pip install sentencepiece
!pip install protobuf

# Read extracted text from file

In [None]:
# Load the full source text; the context manager closes the file even if
# reading raises.
with open("extracted_text.txt", "r", encoding="utf-8") as file:
    long_text = file.read()
print(long_text)

# Split text into chunks

In [None]:
import math
import torch
from transformers import BartTokenizer

def chunk_text_into_tokens(text, tokenizer, max_tokens=500):
    """Tokenize ``text`` and split the tokens into consecutive fixed-size chunks.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``tokenize(text)`` that returns a sequence
            of tokens.
        max_tokens: Maximum number of tokens per chunk; must be at least 1.

    Returns:
        A ``(token_chunks, count)`` tuple. ``token_chunks`` is a list of token
        slices (the last one may be shorter than ``max_tokens``) and ``count``
        is ``len(token_chunks)``. Text that yields no tokens gives ``([], 0)``.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    if max_tokens < 1:
        # A negative step would make range() empty and silently return zero
        # chunks, which callers then divide by.
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    tokens = tokenizer.tokenize(text)
    token_chunks = [
        tokens[start:start + max_tokens]
        for start in range(0, len(tokens), max_tokens)
    ]
    return token_chunks, len(token_chunks)

# Overall token budget for a combined summary: the model cells divide this by
# the number of chunks to derive each chunk's maximum summary length.
max_text_length = 4096



# Model: BART

In [None]:
def summarize_chunks(token_chunks, max_sum_length, min_sum_length, tokenizer, model):
    """Summarize every token chunk and return the summaries in input order.

    Each chunk is converted back to a string, re-encoded by ``tokenizer``
    (truncated to 1024 tokens, returned as PyTorch tensors) and passed to
    ``model.generate`` with ``num_beams=4`` and ``early_stopping=True``.
    The first generated sequence is decoded with special tokens stripped.

    Args:
        token_chunks: Iterable of token lists, one per chunk.
        max_sum_length: ``max_length`` forwarded to ``model.generate``.
        min_sum_length: ``min_length`` forwarded to ``model.generate``.
        tokenizer: Tokenizer used to rebuild, encode and decode text.
        model: Model exposing ``generate``.

    Returns:
        A list with one summary string per chunk.
    """
    generation_options = dict(
        max_length=max_sum_length,
        min_length=min_sum_length,
        num_beams=4,
        early_stopping=True,
    )

    def _summarize_one(piece):
        # Tokens -> text -> model inputs -> generated ids -> summary text.
        piece_text = tokenizer.convert_tokens_to_string(piece)
        encoded = tokenizer(
            piece_text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=1024,
        )
        generated = model.generate(**encoded, **generation_options)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarize_one(piece) for piece in token_chunks]

def concatenate_summaries(summaries):
    """Merge the per-chunk summaries into one string separated by single spaces."""
    separator = " "
    combined = separator.join(summaries)
    return combined

In [None]:
%%time

from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer_bart = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model_bart = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

text_chunks, num_chunk = chunk_text_into_tokens(long_text, tokenizer_bart)
max_sum_length = min(500, math.floor(max_text_length / num_chunk))
min_sum_length = math.floor(0.5 * max_sum_length)

chunk_summaries_bart = summarize_chunks(text_chunks, max_sum_length, min_sum_length, tokenizer_bart, model_bart)

final_summary_bart = concatenate_summaries(chunk_summaries_bart)

tokens_bart = tokenizer_bart.tokenize(final_summary_bart)
number_of_tokens_bart = len(tokens_bart)
print("number_of_tokens: ", number_of_tokens_bart)

print(final_summary_bart)

In [None]:
# Persist the combined BART summary to disk as UTF-8 text.
with open("summary_bart.txt", mode="w", encoding="utf-8") as summary_file:
    summary_file.write(final_summary_bart)

# Model: T5

In [None]:
def summarize_chunks_T5(chunks, max_sum_length, min_sum_length, tokenizer, model):
    """Summarize every token chunk, prefixing each input with ``"summarize: "``.

    Each chunk is converted back to a string, prefixed, re-encoded by
    ``tokenizer`` (truncated to 1024 tokens, returned as PyTorch tensors) and
    passed to ``model.generate`` with ``num_beams=4`` and
    ``early_stopping=True``. The first generated sequence is decoded with
    special tokens stripped.

    Args:
        chunks: Iterable of token lists, one per chunk.
        max_sum_length: ``max_length`` forwarded to ``model.generate``.
        min_sum_length: ``min_length`` forwarded to ``model.generate``.
        tokenizer: Tokenizer used to rebuild, encode and decode text.
        model: Model exposing ``generate``.

    Returns:
        A list with one summary string per chunk, in input order.
    """
    task_prefix = "summarize: "
    results = []

    for piece in chunks:
        prompt = task_prefix + tokenizer.convert_tokens_to_string(piece)
        encoded = tokenizer(
            prompt,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=1024,
        )
        generated = model.generate(
            **encoded,
            max_length=max_sum_length,
            min_length=min_sum_length,
            num_beams=4,
            early_stopping=True,
        )
        results.append(tokenizer.decode(generated[0], skip_special_tokens=True))

    return results

In [None]:
%%time

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer_T5= T5Tokenizer.from_pretrained("t5-base", legacy=False) 
model_T5 = T5ForConditionalGeneration.from_pretrained("t5-base")

text_chunks, num_chunk = chunk_text_into_tokens(long_text, tokenizer_T5)
max_sum_length = min(500, math.floor(max_text_length / num_chunk))
min_sum_length = math.floor(0.5 * max_sum_length)

chunk_summaries_T5 = summarize_chunks_T5(text_chunks, max_sum_length, min_sum_length, tokenizer_T5, model_T5)

final_summary_T5 = concatenate_summaries(chunk_summaries_T5)

tokens_T5 = tokenizer_T5.tokenize(final_summary_T5)
number_of_tokens_T5 = len(tokens_T5)
print("number_of_tokens: ", number_of_tokens_T5)

print(final_summary_T5)

In [None]:
# Persist the combined T5 summary to disk as UTF-8 text.
with open("summary_T5.txt", mode="w", encoding="utf-8") as summary_file:
    summary_file.write(final_summary_T5)

# Model: LED

In [None]:
%%time
from transformers import LEDTokenizer, LEDForConditionalGeneration

tokenizer_LED = LEDTokenizer.from_pretrained("allenai/led-base-16384")
model_LED = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

text_chunks, num_chunk = chunk_text_into_tokens(long_text, tokenizer_LED)
max_sum_length = min(500, math.floor(max_text_length / num_chunk))
min_sum_length = math.floor(0.5 * max_sum_length)

chunk_summaries_LED = summarize_chunks(text_chunks, max_sum_length, min_sum_length, tokenizer_LED, model_LED)

final_summary_LED = concatenate_summaries(chunk_summaries_LED)

tokens_LED = tokenizer_LED.tokenize(final_summary_LED)
number_of_tokens_LED = len(tokens_LED)
print("number_of_tokens: ", number_of_tokens_LED)

print(final_summary_LED)

In [None]:
# Persist the combined LED summary to disk as UTF-8 text.
with open("summary_LED.txt", mode="w", encoding="utf-8") as summary_file:
    summary_file.write(final_summary_LED)

# Model: Pegasus

In [None]:
%%time

from transformers import PegasusForConditionalGeneration, AutoTokenizer

tokenizer_Pegasus = AutoTokenizer.from_pretrained("google/pegasus-xsum")
model_Pegasus = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

text_chunks, num_chunk = chunk_text_into_tokens(long_text, tokenizer_Pegasus)
max_sum_length = min(500, math.floor(max_text_length / num_chunk))
min_sum_length = math.floor(0.5 * max_sum_length)

chunk_summaries_Pegasus = summarize_chunks(text_chunks, max_sum_length, min_sum_length, tokenizer_Pegasus, model_Pegasus)

final_summary_Pegasus = concatenate_summaries(chunk_summaries_Pegasus)

tokens_Pegasus = tokenizer_Pegasus.tokenize(final_summary_Pegasus)
number_of_tokens_Pegasus = len(tokens_Pegasus)
print("number_of_tokens: ", number_of_tokens_Pegasus)

print(final_summary_Pegasus)

In [None]:
# Persist the combined Pegasus summary to disk as UTF-8 text.
with open("summary_Pegasus.txt", mode="w", encoding="utf-8") as summary_file:
    summary_file.write(final_summary_Pegasus)

# Model: BigBirdPegasus

In [None]:
%%time

from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration

tokenizer_BigBird = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
model_BigBird = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

text_chunks, num_chunk = chunk_text_into_tokens(long_text, tokenizer_BigBird)
max_sum_length = min(500, math.floor(max_text_length / num_chunk))
min_sum_length = math.floor(0.5 * max_sum_length)

chunk_summaries_BigBird = summarize_chunks(text_chunks, max_sum_length, min_sum_length, tokenizer_BigBird, model_BigBird)

final_summary_BigBird = concatenate_summaries(chunk_summaries_BigBird)

tokens_BigBird = tokenizer_BigBird.tokenize(final_summary_BigBird)
number_of_tokens_BigBird = len(tokens_BigBird)
print("number_of_tokens: ", number_of_tokens_BigBird)

print(final_summary_BigBird)

In [None]:
# Persist the combined BigBird summary to disk as UTF-8 text.
with open("summary_BigBird.txt", mode="w", encoding="utf-8") as summary_file:
    summary_file.write(final_summary_BigBird)

# Model: ProphetNet

In [None]:
%%time

from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

tokenizer_prophetNet = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
model_prophetNet = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

text_chunks, num_chunk = chunk_text_into_tokens(long_text, tokenizer_prophetNet)
max_sum_length = min(500, math.floor(max_text_length / num_chunk))
min_sum_length = math.floor(0.5 * max_sum_length)

chunk_summaries_prophetNet = summarize_chunks(text_chunks, max_sum_length, min_sum_length, tokenizer_prophetNet, model_prophetNet)

final_summary_prophetNet = concatenate_summaries(chunk_summaries_prophetNet)

tokens_prophetNet = tokenizer_blenderBot.tokenize(final_summary_prophetNet)
number_of_tokens_prophetNet = len(tokens_prophetNet)
print("number_of_tokens: ", number_of_tokens_prophetNet)

print(final_summary_prophetNet)

In [None]:
# Persist the combined ProphetNet summary to disk as UTF-8 text.
with open("summary_prophetNet.txt", mode="w", encoding="utf-8") as summary_file:
    summary_file.write(final_summary_prophetNet)

# Model: Llama 2

# Model: Mistral

# Model: Gemma

# Read the GPT-4-generated summary used as the reference for comparison

In [None]:
# Load the reference summary; the context manager closes the file even if
# reading raises.
with open("gpt_summary.txt", "r", encoding="utf-8") as file:
    gpt_summary = file.read()
print(gpt_summary)

# Criteria: ROUGE score

In [None]:
import evaluate

# Initialize the rouge evaluator
rouge = evaluate.load("rouge")

# Your data: a list of reference summaries and a list of generated summaries.
# The single GPT summary is wrapped in a list so it pairs one-to-one with the
# single prediction; a bare string does not match the one-item predictions list.
references = [gpt_summary]
# NOTE(review): placeholder text - replace with a generated summary
# (e.g. final_summary_bart) before interpreting the scores.
predictions = ["Your model generated summary text here."]

# Calculate ROUGE scores
rouge_scores = rouge.compute(predictions=predictions, references=references)

print("ROUGE scores:", rouge_scores)

# Criteria: BLEU score

In [None]:
# Initialize the bleu evaluator
bleu = evaluate.load("bleu")

# `references` may be a single summary string or a list of summaries. Iterating
# a bare string would produce one "reference" per character, so wrap it first.
reference_texts = [references] if isinstance(references, str) else list(references)

# Calculate BLEU scores (one list of reference texts per prediction)
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in reference_texts])

print("BLEU score:", bleu_score)

# Criteria: METEOR score

In [None]:
# Initialize the meteor evaluator
meteor = evaluate.load("meteor")

# `references` may be a single summary string or a list of summaries; a bare
# string does not pair one-to-one with the predictions list, so wrap it first.
reference_texts = [references] if isinstance(references, str) else list(references)

# Calculate METEOR scores
meteor_score = meteor.compute(predictions=predictions, references=reference_texts)

print("METEOR score:", meteor_score)