In [1]:
import certifi
# Report the path of the CA bundle that certifi provides in this environment.
ca_bundle_path = certifi.where()
print(ca_bundle_path)


C:\ProgramData\Anaconda3\lib\site-packages\certifi\cacert.pem


In [2]:
from transformers import BartForConditionalGeneration, BartTokenizer, BigBirdPegasusForConditionalGeneration, T5ForConditionalGeneration, T5Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer, AutoTokenizer
from rouge_score import rouge_scorer

def summarize_text_bart(document):
    """Summarize ``document`` with the ``facebook/bart-large-cnn`` checkpoint.

    The model and tokenizer are loaded on every call.

    Args:
        document: Plain text to summarize. Input is truncated to 1024 tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # No "summarize: " task prefix: that is a T5 convention. Given to BART it
    # is simply treated as part of the article being summarized.
    inputs = tokenizer(document, return_tensors="pt", max_length=1024, truncation=True)

    # Beam search, bounded to 50-150 generated tokens.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

def summarize_text_bigbird_pegasus(document):
    """Summarize ``document`` with ``google/bigbird-pegasus-large-arxiv``.

    The model and tokenizer are loaded on every call.

    Args:
        document: Plain text to summarize.

    Returns:
        The decoded summary string with special tokens removed, matching the
        other ``summarize_text_*`` helpers in this file.
    """
    checkpoint = "google/bigbird-pegasus-large-arxiv"
    model = BigBirdPegasusForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Truncate instead of handing the model an over-long sequence.
    # NOTE(review): 4096 is assumed to be this checkpoint's maximum input
    # length -- confirm against the model card.
    inputs = tokenizer(document, return_tensors='pt', max_length=4096, truncation=True)

    # Short beam search: at most 120 generated tokens, 2 beams.
    prediction = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=120,
        num_beams=2,
        early_stopping=True
    )

    # Drop special-token markers so they neither show up in the printed
    # summary nor get counted as words by the ROUGE scorer.
    summary = tokenizer.batch_decode(prediction, skip_special_tokens=True)[0]

    return summary

def summarize_text_t5(document):
    """Summarize ``document`` with the ``t5-small`` checkpoint.

    The model and tokenizer are loaded on every call.

    Args:
        document: Plain text to summarize. The ``"summarize: "`` task prefix
            is added here, so callers pass the raw text.

    Returns:
        The decoded summary string with special tokens removed.
    """
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')

    # Truncate long inputs, as the BART helper does.
    # NOTE(review): 512 is assumed to be the tokenizer's configured maximum
    # for t5-small -- confirm against the model card.
    inputs = tokenizer(
        "summarize: " + document,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )

    # Beam search, bounded to 50-150 generated tokens.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

def summarize_text_pegasus(document):
    """Summarize ``document`` with ``google/pegasus-cnn_dailymail``.

    The model and tokenizer are loaded on every call.

    Args:
        document: Plain text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    checkpoint = 'google/pegasus-cnn_dailymail'
    model = PegasusForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)

    # Truncate instead of handing the model an over-long sequence.
    # NOTE(review): 1024 is assumed to be this checkpoint's maximum input
    # length -- confirm against the model card.
    inputs = tokenizer(document, return_tensors='pt', max_length=1024, truncation=True)

    # Beam search, bounded to 50-150 generated tokens.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

def calculate_rouge_scores(reference_text, generated_summary):
    """Score ``generated_summary`` against ``reference_text`` with ROUGE.

    ROUGE-1, ROUGE-2 and ROUGE-L are computed with stemming enabled. The
    result of ``RougeScorer.score`` is returned unchanged; callers in this
    file index it by metric name and read ``.precision``.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(
        reference_text, generated_summary
    )

if __name__ == "__main__":
    # Sample passage that every model is asked to condense.
    input_document = """
    Adani is also among the most controversial of India’s billionaires for his association with the Bharatiya Janata Party (BJP). His close relationship with the party is not coincidental: Adani frequently refers to his business strategy as motivated by “nation building,” which the Adani Group describes on its website as “helping build world-class infrastructure capabilities to help accelerate the growth of India.” Mundra Port and its associated Adani Special Economic Zone, the central components of Adani’s business empire, were attained and developed in cooperation with the Gujarat state government. The BJP led the Gujarat state government during key moments of the Adani Group’s growth, and the relationship resulted in the symbiotic rise of both the BJP and the Adani Group.
    """

    # Run every summarizer once, in the order the results are reported.
    bart_summary = summarize_text_bart(input_document)
    bigbird_pegasus_summary = summarize_text_bigbird_pegasus(input_document)
    t5_summary = summarize_text_t5(input_document)
    pegasus_summary = summarize_text_pegasus(input_document)

    # Show the source text followed by each model's summary.
    print("Original Document:")
    print(input_document)
    summary_report = (
        ("\nBART Summary:", bart_summary),
        ("\nBigBird Pegasus Summary:", bigbird_pegasus_summary),
        ("\nT5 Summary:", t5_summary),
        ("\nPegasus Summary:", pegasus_summary),
    )
    for heading, summary_text in summary_report:
        print(heading)
        print(summary_text)

    # The source passage itself is used as the ROUGE reference.
    reference_text = input_document
    rouge_scores_bart = calculate_rouge_scores(reference_text, bart_summary)
    rouge_scores_bigbird_pegasus = calculate_rouge_scores(reference_text, bigbird_pegasus_summary)
    rouge_scores_t5 = calculate_rouge_scores(reference_text, t5_summary)
    rouge_scores_pegasus = calculate_rouge_scores(reference_text, pegasus_summary)

    # Report the precision component of each ROUGE variant, model by model.
    score_report = (
        ("\nROUGE Scores for BART:", rouge_scores_bart),
        ("\nROUGE Scores for BigBird Pegasus:", rouge_scores_bigbird_pegasus),
        ("\nROUGE Scores for T5:", rouge_scores_t5),
        ("\nROUGE Scores for Pegasus:", rouge_scores_pegasus),
    )
    precision_rows = (
        ("ROUGE-1 Precision:", 'rouge1'),
        ("ROUGE-2 Precision:", 'rouge2'),
        ("ROUGE-L Precision:", 'rougeL'),
    )
    for heading, model_scores in score_report:
        print(heading)
        for label, metric_key in precision_rows:
            print(label, model_scores[metric_key].precision)


Attention type 'block_sparse' is not possible if sequence_length: 150 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of PegasusForCondit

Original Document:

    Adani is also among the most controversial of India’s billionaires for his association with the Bharatiya Janata Party (BJP). His close relationship with the party is not coincidental: Adani frequently refers to his business strategy as motivated by “nation building,” which the Adani Group describes on its website as “helping build world-class infrastructure capabilities to help accelerate the growth of India.” Mundra Port and its associated Adani Special Economic Zone, the central components of Adani’s business empire, were attained and developed in cooperation with the Gujarat state government. The BJP led the Gujarat state government during key moments of the Adani Group’s growth, and the relationship resulted in the symbiotic rise of both the BJP and the Adani Group.
    

BART Summary:
Adani is also among the most controversial of India’s billionaires for his association with the Bharatiya Janata Party (BJP) His close relationship with the party is not coin