# Text Summarization using BART Transformer

In [None]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
# Load the "knkarthick/dialogsum" dataset from the Hugging Face Hub.
# The recorded output shows train/validation/test splits, each with the
# columns 'id', 'dialogue', 'summary' and 'topic'.
from datasets import load_dataset

ds = load_dataset("knkarthick/dialogsum")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/442k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# Display the loaded DatasetDict to see its splits, columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [None]:
# Peek at one raw training dialogue (index 1 of the train split).
ds['train'][1]['dialogue']

"#Person1#: Hello Mrs. Parker, how have you been?\n#Person2#: Hello Dr. Peters. Just fine thank you. Ricky and I are here for his vaccines.\n#Person1#: Very well. Let's see, according to his vaccination record, Ricky has received his Polio, Tetanus and Hepatitis B shots. He is 14 months old, so he is due for Hepatitis A, Chickenpox and Measles shots.\n#Person2#: What about Rubella and Mumps?\n#Person1#: Well, I can only give him these for now, and after a couple of weeks I can administer the rest.\n#Person2#: OK, great. Doctor, I think I also may need a Tetanus booster. Last time I got it was maybe fifteen years ago!\n#Person1#: We will check our records and I'll have the nurse administer and the booster as well. Now, please hold Ricky's arm tight, this may sting a little."

In [None]:
# Reference summary stored alongside that same training example.
ds['train'][1]['summary']

'Mrs Parker takes Ricky for his vaccines. Dr. Peters checks the record and then gives Ricky a vaccine.'

### WITHOUT FINE-TUNING

In [None]:
# Baseline: use the high-level summarization pipeline with the pretrained
# facebook/bart-large-cnn checkpoint, before any fine-tuning.
from transformers import pipeline

pipe = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Use training example 1's dialogue as the input for the baseline summary.
article_1 = ds['train'][1]['dialogue']

In [None]:
# Deterministic decoding (do_sample=False) with the output length bounded by
# min_length=10 and max_length=20.
pipe(article_1, max_length=20, min_length=10, do_sample=False)

[{'summary_text': 'Ricky has received his Polio, Tetanus and Hepatitis B shots.'}]

In [None]:
# Reference summary for the same example, for comparison with the pipeline output.
ds['train'][1]['summary']

'Mrs Parker takes Ricky for his vaccines. Dr. Peters checks the record and then gives Ricky a vaccine.'

### WITH FINE-TUNING

In [None]:
# Load the tokenizer and the seq2seq model directly (instead of via `pipeline`)
# so they can be passed to the preprocessing function and the Trainer below.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [None]:
# tokenization
def preprocess_function(batch):
  """Tokenize a batch of dialogues and their reference summaries for seq2seq training.

  Uses the module-level `tokenizer`.

  Args:
    batch: Mapping with a 'dialogue' list (model inputs) and a 'summary' list
      (training targets), as produced by `Dataset.map(..., batched=True)`.

  Returns:
    Dict with 'input_ids' and 'attention_mask' for the dialogues and 'labels'
    for the summaries. Every sequence is padded/truncated to 128 tokens, and
    padding positions in 'labels' are replaced by -100.
  """
  source = batch['dialogue']
  target = batch['summary']

  source_ids = tokenizer(source, padding='max_length', truncation=True, max_length=128)
  # Labels must come from the summaries. Tokenizing `source` here (as the code
  # previously did) trains the model to reproduce the dialogue instead of
  # summarizing it.
  target_ids = tokenizer(text_target=target, padding='max_length', truncation=True, max_length=128)

  # Replace pad tokens with -100 so padded label positions are ignored by the loss.
  pad_id = tokenizer.pad_token_id
  labels = [
      [(token if token != pad_id else -100) for token in example]
      for example in target_ids['input_ids']
  ]

  return {
      'input_ids': source_ids['input_ids'],
      'attention_mask': source_ids['attention_mask'],
      'labels': labels
  }

In [None]:
# Apply preprocess_function to every split, in batches of 1000 examples across
# 4 worker processes.
# NOTE(review): despite the `df_` prefix, this is the result of `ds.map`, not a DataFrame.
df_source = ds.map(preprocess_function, batched=True, batch_size=1000, num_proc=4)

Map (num_proc=4):   0%|          | 0/12460 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# training arguments
# NOTE(review): no `report_to` is set; in the recorded run, training prompted
# interactively for a wandb API key, which blocks an unattended "Run All".
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='/content',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=8,   # training batch size per device
    remove_unused_columns=True       # keep the default of dropping unused dataset columns explicit
)

In [None]:
# Wire the model, the training arguments and the tokenized splits into a Trainer.
# NOTE(review): evaluation uses the 'test' split; the 'validation' split is not used here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=df_source['train'],
    eval_dataset=df_source['test']
)


In [None]:
# Run fine-tuning; the recorded run took 3116 steps over 2 epochs.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mosenisamuel698[0m ([33mosenisamuel698-lincoln-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.011
1000,0.0036
1500,0.0027
2000,0.0015
2500,0.0008
3000,0.0005




TrainOutput(global_step=3116, training_loss=0.003221724620531283, metrics={'train_runtime': 3413.243, 'train_samples_per_second': 7.301, 'train_steps_per_second': 0.913, 'total_flos': 6750530835578880.0, 'train_loss': 0.003221724620531283, 'epoch': 2.0})

In [None]:
# Evaluate on the eval_dataset that was passed to the Trainer.
eval_results = trainer.evaluate()

In [None]:
# Display the evaluation metrics dictionary.
eval_results

{'eval_loss': 0.015150470659136772,
 'eval_runtime': 47.6185,
 'eval_samples_per_second': 31.5,
 'eval_steps_per_second': 3.948,
 'epoch': 2.0}

## SAVING THE MODEL

In [None]:
# Persist the fine-tuned model and its tokenizer side by side so they can be
# reloaded together with from_pretrained.
model.save_pretrained('/content/model_directory')
tokenizer.save_pretrained('/content/model_directory')

('/content/model_directory/tokenizer_config.json',
 '/content/model_directory/special_tokens_map.json',
 '/content/model_directory/vocab.json',
 '/content/model_directory/merges.txt',
 '/content/model_directory/added_tokens.json',
 '/content/model_directory/tokenizer.json')

In [None]:
# Reload the saved tokenizer and model from disk; this rebinds the module-level
# `tokenizer` and `model` names used by `summarize` below.
tokenizer = AutoTokenizer.from_pretrained('/content/model_directory')
model = AutoModelForSeq2SeqLM.from_pretrained('/content/model_directory')

def summarize(blog_post):
  """Generate a summary of `blog_post` with the module-level `model` and `tokenizer`.

  Args:
    blog_post: Text to summarize. The tokenized input is truncated to 1024 tokens.

  Returns:
    The decoded summary string with special tokens removed.
  """
  # Tokenize the input and move the tensors to the model's device, so the call
  # also works when the model lives on a GPU.
  inputs = tokenizer(blog_post, max_length=1024, truncation=True, return_tensors='pt').to(model.device)

  # Generate the summary with beam search. The attention mask is passed
  # explicitly instead of leaving generate() to infer it from the input ids.
  summary_ids = model.generate(
      inputs['input_ids'],
      attention_mask=inputs['attention_mask'],
      max_length=150,
      num_beams=4,
      no_repeat_ngram_size=2,
      min_length=40,
      early_stopping=True,
  )

  # Decode the first (and only) generated sequence.
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)



In [None]:
# Sample news text used to try the reloaded model on input outside the dataset.
blog_post = """
Presidents aren’t supposed to direct IRS investigations
US law specifically prohibits presidents from directing the IRS to investigate anyone in a section entitled: “Prohibition on executive branch influence over taxpayer audits and other investigations.”

While the IRS falls under the Treasury Department, it’s important that it be as protected from politics as possible. That’s why the IRS has only two politically appointed officials, according to Mark Mazur, who was assistant secretary of treasury for tax policy at the outset of the Biden administration

The US has higher voluntary tax payment rates than other countries, Mazur told me, “because people feel that their interactions with the tax system are fair and based on law.”

If the IRS is suddenly used for political purposes, that trust could be destroyed. During the Obama administration, for instance, the IRS became embroiled in a bona fide scandal when a Treasury Department investigation found the IRS delayed conferring tax-exempt status on conservative groups.

If the IRS did find that its tax-exempt status should be revoked, Harvard would need to be warned and given an opportunity to contest the finding. It would also have the opportunity to challenge the IRS in court.

There is already a lot of chaos at the IRS under the new Trump administration. Multiple acting commissioners have resigned, apparently the result a standoff over whether tax data could be used by immigration officials.

It would not be unprecedented for a university to lose its tax-exempt status
Back in 1983, the Supreme Court agreed that Bob Jones University should not be tax-exempt because, at the time, it banned interracial relationships among its students.

The university didn’t drop its interracial marriage policy until 2000 — in an announcement on CNN’s Larry King Live, coincidentally — although it did not regain its tax-exempt status until 2017.

The US has now come full circle to the point that one of the main gripes Trump has with Harvard is its diversity programs.
"""
# Summarize the sample text and print the result.
summary = summarize(blog_post)
print(f'Summary : {summary}')



Summary : #Presidents aren’n't supposed to direct IRS investigations
US law specifically prohibits presidents from directing the IRS to investigate anyone in a section entitled: “Prohibition on executive branch influence over taxpayer audits and other investigations.”


