In [16]:
import numpy as np
import pandas as pd
import os

import re
import spacy

import torch
import json

import nltk
from nltk.tokenize import sent_tokenize

from datasets import load_dataset

from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config


In [17]:
for dirname, _, filenames in os.walk('cnn_dailymail/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

cnn_dailymail/test.csv
cnn_dailymail/train.csv
cnn_dailymail/validation.csv


In [18]:
TEST_FILE_PATH  = 'cnn_dailymail/test.csv'
TRAIN_FILE_PATH = 'cnn_dailymail/train.csv'
VALID_FILE_PATH = 'cnn_dailymail/validation.csv'

In [19]:
train_df = pd.read_csv(TRAIN_FILE_PATH)
test_df  = pd.read_csv(TEST_FILE_PATH)
valid_df = pd.read_csv(VALID_FILE_PATH)

In [23]:
print(f"train_df.shape: {train_df.shape}")
print(f"test_df.shape: {test_df.shape}")
print(f"valid_df.shape: {valid_df.shape}")

train_df.shape: (287113, 3)
test_df.shape: (11490, 3)
valid_df.shape: (13368, 3)


In [24]:
train_df.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [25]:
df = train_df
df = df.article.dropna()
df_articles = df.values.tolist()

# Limit the length of df_articles to 500
df_articles = df_articles[:500]

len(df_articles)

500

In [26]:
df.head()

0    By . Associated Press . PUBLISHED: . 14:11 EST...
1    (CNN) -- Ralph Mata was an internal affairs li...
2    A drunk driver who killed a young woman in a h...
3    (CNN) -- With a breezy sweep of his pen Presid...
4    Fleetwood are the only team still to have a 10...
Name: article, dtype: object

In [28]:
T5_PATH = 'Einmalumdiewelt/T5-Base_GNAD'
t5_model = T5ForConditionalGeneration.from_pretrained(T5_PATH, output_past=True)
t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)
x = t5_tokenizer(df_articles, truncation=True, max_length=500)

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def t5_summarize(input_text, num_beams=4, num_words=80):
    input_text = ' '.join(input_text.split())
    input_tokenized = t5_tokenizer.encode(input_text, return_tensors="pt").to(device)
    summary_task = torch.tensor([[21603, 10]]).to(device)
    input_tokenized = torch.cat([summary_task, input_tokenized], dim=-1).to(device)

    # Move the model to the same device as input tensors
    t5_model.to(device)

    summary_ids = t5_model.generate(input_tokenized,
                                    num_beams=int(num_beams),
                                    no_repeat_ngram_size=3,
                                    length_penalty=2.0,
                                    min_length=30,
                                    max_length=int(num_words),
                                    early_stopping=True)

    output = [t5_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) 
              for g in summary_ids]
    return output[0]

In [11]:
%%time
for i in range(6):
    
        print('news articles ',i + 1, " : \n" )
        print(t5_summarize(df_articles[i]))
        print('............................................................................\n\n\n\n')

news articles  1  : 



Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors


Bishop John Folda of the Fargo Catholic Diocese in North Dakota has been diagnosed with hepatitis A. The bishop is taking time off after being diagnosed through contaminated food.
............................................................................




news articles  2  : 

Ralph Mata, an internal affairs lieutenant for the Miami-Dade Police Department, is facing charges of aiding and abetting a conspiracy to distribute cocaine and engaging in monetary transactions in property derived from specified unlawful activity. The 45-year-old is scheduled to appear in federal court on Wednesday.
............................................................................




news articles  3  : 

Craig Eccleston-Todd, 27, was reading or replying to a text message when he veered across the road while driving round a bend and smashed into the car of Rachel Titley, 28. The solicitor's clerk died later from her injuries.
......................................................................

In [40]:
def summarize_text(text):
    # Generate summary using the T5 model
    summary = t5_summarize(text)
    
    # Show the generated summary
    print("\nSummary:")
    print(summary)

# Prompt the user for input and generate summary
user_input = input("Enter the text to summarize: ")
summarize_text(user_input)

Enter the text to summarize: The first Batman story, "The Case of the Chemical Syndicate", was published in Detective Comics #27 (cover dated May 1939). It was inspired, some say plagiarized, by the 60 page story “Partners of Peril” in The Shadow #113, which was written by Theodore Tinsley and illustrated by Tom Lovell.[21] Finger said, "Batman was originally written in the style of the pulps",[22] and this influence was evident with Batman showing little remorse over killing or maiming criminals. Batman proved a hit character, and he received his own solo title in 1940 while continuing to star in Detective Comics. By that time, Detective Comics was the top-selling and most influential publisher in the industry; Batman and the company's other major hero, Superman, were the cornerstones of the company's success.[23] The two characters were featured side by side as the stars of World's Finest Comics, which was originally titled World's Best Comics when it debuted in fall 1940. Creators i

In [33]:
from transformers import BartForConditionalGeneration, BartConfig, BartTokenizer

In [34]:
bart_PATH = 'facebook/bart-large-cnn'
bart_model = BartForConditionalGeneration.from_pretrained(bart_PATH, output_past=True)
bart_tokenizer = BartTokenizer.from_pretrained(bart_PATH)
x = bart_tokenizer(df_articles, truncation=True, max_length=500)

In [35]:
def bart_summarize(input_text, num_beams=4, num_words=80):
    input_text = str(input_text)
    input_text = ' '.join(input_text.split())
    input_tokenized = bart_tokenizer.encode(input_text, return_tensors='pt')
    summary_ids = bart_model.generate(input_tokenized,
                                      num_beams=int(num_beams),
                                      no_repeat_ngram_size=3,
                                      length_penalty=2.0,
                                      min_length=80,
                                      max_length=int(num_words),
                                      early_stopping = False)
    output = [bart_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    return output[0]

In [16]:
%%time
for i in range(2):
    
        print('news article ',i + 1, " : \n" )
        print(bart_summarize(df_articles[i]))
        print('............................................................................\n\n\n\n')

news article  1  : 

Bishop John Folda of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion in late September and early October. The diocese announced on Monday that Bishop Folda is taking time off after
............................................................................




news article  2  : 

Ralph Mata, 45, was an internal affairs lieutenant for the Miami-Dade Police Department. Authorities allege he worked with a drug trafficking organization to help plan a murder plot. The complaint also alleges that Mata used his police badge to purchase weapons for drug traffickers. Mata faces charges of aiding and abetting a conspiracy to distribute cocaine, among other things. He is scheduled to appear in
...............................................................

In [41]:
def summarize_text(text):
    # Generate summary using the bart model
    summary = bart_summarize(text)
    
    # Show the generated summary
    print("\nSummary:")
    print(summary)

# Prompt the user for input and generate summary
user_input = input("Enter the text to summarize: ")
summarize_text(user_input)

Enter the text to summarize: The first Batman story, "The Case of the Chemical Syndicate", was published in Detective Comics #27 (cover dated May 1939). It was inspired, some say plagiarized, by the 60 page story “Partners of Peril” in The Shadow #113, which was written by Theodore Tinsley and illustrated by Tom Lovell.[21] Finger said, "Batman was originally written in the style of the pulps",[22] and this influence was evident with Batman showing little remorse over killing or maiming criminals. Batman proved a hit character, and he received his own solo title in 1940 while continuing to star in Detective Comics. By that time, Detective Comics was the top-selling and most influential publisher in the industry; Batman and the company's other major hero, Superman, were the cornerstones of the company's success.[23] The two characters were featured side by side as the stars of World's Finest Comics, which was originally titled World's Best Comics when it debuted in fall 1940. Creators i

In [1]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [2]:
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

Downloading (…)neration_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

In [9]:
text = """
The first Batman story, "The Case of the Chemical Syndicate", was published in Detective Comics #27 (cover dated May 1939). It was inspired, some say plagiarized, by the 60 page story “Partners of Peril” in The Shadow #113, which was written by Theodore Tinsley and illustrated by Tom Lovell.[21] Finger said, "Batman was originally written in the style of the pulps",[22] and this influence was evident with Batman showing little remorse over killing or maiming criminals. Batman proved a hit character, and he received his own solo title in 1940 while continuing to star in Detective Comics. By that time, Detective Comics was the top-selling and most influential publisher in the industry; Batman and the company's other major hero, Superman, were the cornerstones of the company's success.[23] The two characters were featured side by side as the stars of World's Finest Comics, which was originally titled World's Best Comics when it debuted in fall 1940. Creators including Jerry Robinson and Dick Sprang also worked on the strips during this period.  Over the course of the first few Batman strips elements were added to the character and the artistic depiction of Batman evolved. Kane noted that within six issues he drew the character's jawline more pronounced, and lengthened the ears on the costume. "About a year later he was almost the full figure, my mature Batman", Kane said.[24] Batman's characteristic utility belt was introduced in Detective Comics #29 (July 1939), followed by the boomerang-like batarang and the first bat-themed vehicle, the Batplane, in #31 (September 1939). The character's origin was revealed in #33 (November 1939), unfolding in a two-page story that establishes the brooding persona of Batman, a character driven by the death of his parents. Written by Finger, it depicts a young Bruce Wayne witnessing his parents' murder at the hands of a mugger. Days later, at their grave, the child vows that "by the spirits of my parents [I will] avenge their deaths by spending the rest of my life warring on all criminals".[25][26][27]  The early, pulp-inflected portrayal of Batman started to soften in Detective Comics #38 (April 1940) with the introduction of Robin, Batman's junior counterpart.[28] Robin was introduced, based on Finger's suggestion, because Batman needed a "Watson" with whom Batman could talk.[29] Sales nearly doubled, despite Kane's preference for a solo Batman, and it sparked a proliferation of "kid sidekicks".[30] The first issue of the solo spin-off series Batman was notable not only for introducing two of his most persistent enemies, the Joker and Catwoman, but for a pre-Robin inventory story, originally meant for Detective Comics #38, in which Batman shoots some monstrous giants to death.[31][32] That story prompted editor Whitney Ellsworth to decree that the character could no longer kill or use a gun.[33]  By 1942, the writers and artists behind the Batman comics had established most of the basic elements of the Batman mythos.[34] In the years following World War II, DC Comics "adopted a postwar editorial direction that increasingly de-emphasized social commentary in favor of lighthearted juvenile fantasy". The impact of this editorial approach was evident in Batman comics of the postwar period; removed from the "bleak and menacing world" of the strips of the early 1940s, Batman was instead portrayed as a respectable citizen and paternal figure that inhabited a "bright and colorful" environment."""

In [10]:
tokens = tokenizer(text, truncation = True, padding = "longest", return_tensors = "pt")

In [11]:
tokens

{'input_ids': tensor([[  139,   211, 12137,   584,   108,   198,   159,  6174,   113,   109,
         12385, 52186,   194,   108,   140,  1299,   115, 19886, 16908,  1768,
          9613,   143, 20473,  8922,   913, 23259,   250,   168,   140,  2261,
           108,   181,   416, 68033,   108,   141,   109,  1790,   438,   584,
           185, 59237,   113,  4957,  4935,   227,   115,   139, 12862,  5635,
          5517,   108,   162,   140,  1158,   141, 32532, 13431, 14364,   111,
          9789,   141,  3227, 62197,   107,  4101,  7090,  1100, 25071,   243,
           108,   198, 67271,   140,  3273,  1158,   115,   109,   669,   113,
           109, 15052,   116,   194,   108,  4101,  8101,  1100,   111,   136,
          2581,   140,  8583,   122, 12137,  1986,   332, 40215,   204,  5730,
           132, 38294, 12582, 14863,   107, 12137,  5343,   114,  1194,  1510,
           108,   111,   178,   915,   169,   282,  4760,  1560,   115, 11324,
           277,  3712,   112,  2187,  

In [12]:
summary = model.generate(**tokens)

In [13]:
summary[0]

tensor([    0, 12137,   211,  2893,   115, 19886, 16908,   115, 23259,   107,
            1])

In [14]:
tokenizer.decode(summary[0])

'<pad>Batman first appeared in Detective Comics in 1939.</s>'

In [52]:
from transformers import pipeline, set_seed
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\om\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [53]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", version="3.0.0")

print(f"Features in cnn_dailymail : {dataset['train'].column_names}")



  0%|          | 0/3 [00:00<?, ?it/s]

Features in cnn_dailymail : ['id', 'article', 'highlights']


In [54]:
sample_text = dataset["train"][1]["article"][:1000]

# We'll collect the generated summaries of each model in a dictionary
summaries = {}

In [69]:
def baseline_summary_three_sent(text):
    return "\n".join(sent_tokenize(text)[:3])

In [55]:
pipe = pipeline('summarization', model = 't5-small' )

pipe_out = pipe(sample_text)

pipe_out

Downloading (…)lve/main/config.json: 0.00B [00:00, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json: 0.00B [00:00, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

[{'summary_text': 'a criminal complaint accuses the 45-year-old of using his role as a police officer . he worked with a drug trafficking organization to plan a murder plot and get guns . the complaint alleges that he arranged to pay two assassins to kill rival drug dealers .'}]

In [56]:
summaries['t5'] = 'n'.join(sent_tokenize(pipe_out[0]['summary_text']))

In [57]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")

pipe_out = pipe(sample_text)

pipe_out

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

[{'summary_text': 'Ralph Mata was an internal affairs lieutenant for the Miami-Dade Police Department. Authorities allege that the 45-year-old longtime officer worked with a drug trafficking organization. A criminal complaint unsealed in U.S. District Court in New Jersey accuses Mata of using his role as a police officer.'}]

In [58]:
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

'Ralph Mata was an internal affairs lieutenant for the Miami-Dade Police Department.\nAuthorities allege that the 45-year-old longtime officer worked with a drug trafficking organization.\nA criminal complaint unsealed in U.S. District Court in New Jersey accuses Mata of using his role as a police officer.'

In [62]:
print("GROUND TRUTH")

print(dataset['train'][1]['highlights'])


for model_name in summaries:
    print(model_name.upper())
    print(summaries[model_name])

GROUND TRUTH
Criminal complaint: Cop used his role to help cocaine traffickers .
Ralph Mata, an internal affairs lieutenant, allegedly helped group get guns .
He also arranged to pay two assassins in a murder plot, a complaint alleges .
T5
a criminal complaint accuses the 45-year-old of using his role as a police officer .nhe worked with a drug trafficking organization to plan a murder plot and get guns .nthe complaint alleges that he arranged to pay two assassins to kill rival drug dealers .
BART
Ralph Mata was an internal affairs lieutenant for the Miami-Dade Police Department.
Authorities allege that the 45-year-old longtime officer worked with a drug trafficking organization.
A criminal complaint unsealed in U.S. District Court in New Jersey accuses Mata of using his role as a police officer.


In [15]:
from datasets import load_metric

bleu_metric = load_metric("sacrebleu")

  This is separate from the ipykernel package so we can avoid doing imports until


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [64]:
bleu_metric.add(prediction = [summaries["t5"]], reference = [dataset['train'][1]['highlights'] ])

results = bleu_metric.compute(smooth_method = 'floor', smooth_value = 0 )

results['precision'] = [np.round(p , 2) for p in results['precisions'] ]

pd.DataFrame.from_dict(results, orient = 'index', columns = ['Value'] )

Unnamed: 0,Value
score,13.245629
counts,"[20, 10, 5, 2]"
totals,"[52, 51, 50, 49]"
precisions,"[38.46153846153846, 19.607843137254903, 10.0, ..."
bp,1.0
sys_len,52
ref_len,42
precision,"[38.46, 19.61, 10.0, 4.08]"


In [65]:
rouge_metric = load_metric('rouge')

In [66]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

reference = dataset['train'][1]['highlights']

records = []

for model_name in summaries:
    rouge_metric.add(prediction = summaries[model_name], reference = reference )
    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
    print('rouge_dict ', rouge_dict )
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index = summaries.keys() )

rouge_dict  {'rouge1': 0.46913580246913583, 'rouge2': 0.25316455696202533, 'rougeL': 0.32098765432098764, 'rougeLsum': 0.32098765432098764}
rouge_dict  {'rouge1': 0.30588235294117644, 'rouge2': 0.14457831325301204, 'rougeL': 0.2117647058823529, 'rougeLsum': 0.2823529411764706}


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
t5,0.469136,0.253165,0.320988,0.320988
bart,0.305882,0.144578,0.211765,0.282353


In [67]:
def calculate_metric_on_baseline_test_ds(dataset, metric, column_text = 'article', column_summary = 'highlights' ):
    """
    This function calculates a specified metric on a baseline test dataset for a Natural Language Processing (NLP) task.
    It assumes the task is a text summarization task, where the goal is to generate a summary (e.g., highlights) from a text (e.g., article).

    Parameters:
    dataset (pandas.DataFrame): The test dataset. It should contain a column for the text and a column for the true summary.
    metric (datasets.Metric): The metric to calculate. This should be a metric object from the Hugging Face datasets library.
    column_text (str, optional): The name of the column in the dataset that contains the text. Defaults to 'article'.
    column_summary (str, optional): The name of the column in the dataset that contains the true summary. Defaults to 'highlights'.

    Returns:
    score (float): The calculated score of the metric on the test dataset.
    """
    summaries = [baseline_summary_three_sent(text) for text in dataset[column_text] ]

    metric.add_batch(predictions = summaries, references = dataset[column_summary] )

    score = metric.compute()
    return score

In [71]:
test_sampled = dataset['test'].shuffle(seed = 42).select(range(1000))

score = calculate_metric_on_baseline_test_ds(test_sampled, rouge_metric )

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame.from_dict(rouge_dict, orient = 'index' , columns = ['baseline'] ).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.390057,0.171891,0.245931,0.356057
