In [1]:
# !pip install transformers
# !pip install datasets
# !pip install sentencepiece
# !pip install rouge_score
# !pip install wandb

In [2]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

2023-07-13 12:40:51.498808: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Switch for Weights & Biases experiment tracking. Later cells check this flag
# before touching `wandb`, so the package is only imported when it is enabled.
WANDB_INTEGRATION = True
if WANDB_INTEGRATION:
    import wandb
    # Log this session in to W&B before any run is created.
    wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mrmk[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Language selector: passed as the wiki_lingua config name when loading data,
# and the model-selection cell switches checkpoint when it equals 'french'.
language = 'english'
# language = 'french'

In [5]:
# Choose the pretrained summarization checkpoint: the BARThez one when the
# notebook is set to French, the DistilBART/XSum one for anything else.
model_name = (
    "moussaKam/barthez-orangesum-abstract"
    if language == 'french'
    else "sshleifer/distilbart-xsum-12-3"
)

# Model and tokenizer are loaded from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# print(model.config)

# Token budgets used when padding/truncating source documents (encoder side)
# and target summaries (decoder side).
encoder_max_length, decoder_max_length = 256, 64

In [6]:
# Load the first 2000 training records of the wiki_lingua config for `language`.
data = datasets.load_dataset("wiki_lingua", name=language, split="train[:2000]")

# Peek at the nested `article` fields of the first record.
# Index the row first: `data['article'][0]` would build the entire column
# (all 2000 records) just to keep its first element.
for k, v in data[0]['article'].items():
    print(k)
    print(v)

Found cached dataset wiki_lingua (/Users/rajithamuthukrishnan/.cache/huggingface/datasets/wiki_lingua/english/1.1.1/6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e)


section_name
['Finding Other Transportation', 'Designating a Driver', 'Staying Safe']
document
['make sure that the area is a safe place, especially if you plan on walking home at night.  It’s always a good idea to practice the buddy system.  Have a friend meet up and walk with you. Research the bus, train, or streetcar routes available in your area to find safe and affordable travel to your destination.  Make sure you check the schedule for your outgoing and return travel.  Some public transportation will cease to run late at night.  Be sure if you take public transportation to the venue that you will also be able to get home late at night. Check the routes.  Even if some public transit is still running late at night, the routing may change.  Some may run express past many of the stops, or not travel all the way to the ends.  Be sure that your stop will still be available when you need it for your return trip. If you are taking public transit in a vulnerable state after drinking, it

In [7]:
def flatten(example):
    """Lift the nested ``article`` fields of one record to the top level.

    Args:
        example: mapping with an ``'article'`` entry that itself holds
            ``'document'`` and ``'summary'``.

    Returns:
        dict with just the ``'document'`` and ``'summary'`` values of the article.
    """
    article = example['article']
    return dict(document=article['document'], summary=article['summary'])

def list2samples(example):
    """Explode a batch of per-article lists into one flat list of samples.

    Args:
        example: batched mapping whose ``'document'`` and ``'summary'`` entries
            are parallel lists; each element is itself a list (one per article).

    Returns:
        dict with flat ``'document'`` and ``'summary'`` lists. Articles whose
        document list is empty contribute nothing to either list.
    """
    documents, summaries = [], []
    for article_docs, article_summaries in zip(example['document'], example['summary']):
        if len(article_docs) == 0:
            # Nothing to pair the summaries with: drop the whole article.
            continue
        documents.extend(article_docs)
        summaries.extend(article_summaries)
    return {'document': documents, 'summary': summaries}

# Keep only the document/summary fields, then explode each article's lists so
# that every row is a single (document, summary) pair.
dataset = data.map(flatten, remove_columns=['article', 'url'])
dataset = dataset.map(list2samples, batched=True)

# Hold out 10% for validation. The seed is fixed so the split (and therefore
# every metric reported further down) is the same on each Restart & Run All.
train_data_txt, validation_data_txt = dataset.train_test_split(
    test_size=0.1, seed=42
).values()

Loading cached processed dataset at /Users/rajithamuthukrishnan/.cache/huggingface/datasets/wiki_lingua/english/1.1.1/6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e/cache-f8911c97f741fef9.arrow
Loading cached processed dataset at /Users/rajithamuthukrishnan/.cache/huggingface/datasets/wiki_lingua/english/1.1.1/6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e/cache-6b52a63406610f48.arrow


In [8]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of document/summary pairs into model inputs and labels.

    Args:
        batch: mapping with parallel ``'document'`` and ``'summary'`` lists.
        tokenizer: callable tokenizer that also exposes ``pad_token_id``.
        max_source_length: length documents are padded/truncated to.
        max_target_length: length summaries are padded/truncated to.

    Returns:
        dict holding everything the tokenizer returned for the documents, plus
        ``'labels'``: the summary token ids with every pad id replaced by -100.
    """
    encoded_docs = tokenizer(
        batch['document'],
        padding='max_length',
        truncation=True,
        max_length=max_source_length,
    )
    encoded_summaries = tokenizer(
        batch['summary'],
        padding='max_length',
        truncation=True,
        max_length=max_target_length,
    )

    features = dict(encoded_docs.items())

    # Padding positions become -100 so they are ignored in the loss.
    pad_id = tokenizer.pad_token_id
    labels = []
    for token_ids in encoded_summaries['input_ids']:
        labels.append(
            [-100 if token_id == pad_id else token_id for token_id in token_ids]
        )
    features['labels'] = labels
    return features

# Tokenize both text splits the same way: every raw text column is dropped and
# replaced by the features produced by batch_tokenize_preprocess.
train_data, validation_data = (
    text_split.map(
        lambda batch: batch_tokenize_preprocess(
            batch, tokenizer, encoder_max_length, decoder_max_length
        ),
        batched=True,
        remove_columns=text_split.column_names,
    )
    for text_split in (train_data_txt, validation_data_txt)
)

Map:   0%|          | 0/4351 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

In [9]:
# Download the 'punkt' resource quietly; presumably this is the data that
# nltk.sent_tokenize (called in postprocess_text) needs — TODO confirm.
nltk.download('punkt', quiet=True)

# ROUGE scorer used by compute_metrics.
# NOTE(review): the stray `metric = datasets.load_metric("rouge")` line echoed in
# this cell's output looks like the tail of a deprecation warning for
# `load_metric` — confirm, and plan a migration before upgrading `datasets`.
metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        ``(preds, labels)`` as two new lists of processed strings.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text))

    return (
        [_one_sentence_per_line(text) for text in stripped_preds],
        [_one_sentence_per_line(text) for text in stripped_labels],
    )


def compute_metrics(eval_preds):
    """Decode generated ids and reference labels, then score them with ROUGE.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` may be a
            tuple, in which case its first element holds the generated ids.

    Returns:
        dict mapping every key returned by the ROUGE metric to its mid
        F-measure scaled by 100, plus ``'gen_len'`` (mean number of non-pad
        tokens per prediction). All values are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is an ignore-index filler, not a vocabulary id, so it cannot be
    # decoded. Labels always carry it; predictions can carry it too when
    # batches of generated ids are padded together. Map it to the pad id in
    # BOTH arrays so decoding works and gen_len does not count filler.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Average generated length, ignoring padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

  metric = datasets.load_metric("rouge")


In [10]:
training_args = Seq2SeqTrainingArguments(
    # Output locations and logging cadence.
    output_dir="results",
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
    # What the trainer is asked to do.
    do_train=True,
    do_eval=True,
    predict_with_generate=True,
    # Demo-sized schedule and batch sizes.
    num_train_epochs=1,  # demo
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    # Optimisation settings.
    # learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
)

# Collator that batches the tokenized features for this model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [11]:
if WANDB_INTEGRATION:
    # Start a W&B run and record the hyper-parameters worth comparing across runs.
    wandb_run = wandb.init(
        project="text_summarizer_bart",
        config=dict(
            per_device_train_batch_size=training_args.per_device_train_batch_size,
            learning_rate=training_args.learning_rate,
            dataset=f"wiki_lingua {language}",
        ),
    )

    # Name the run after the language and the wall-clock time (HHMMSS).
    now = datetime.now()
    current_time = now.strftime("%H%M%S")
    wandb_run.name = f"run_{language}_{current_time}"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# Baseline: score the pretrained checkpoint on the validation set before any
# fine-tuning (trainer.train() only runs in a later cell).
trainer.evaluate()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 6.803419589996338,
 'eval_rouge1': 20.4421,
 'eval_rouge2': 4.8615,
 'eval_rougeL': 14.9811,
 'eval_rougeLsum': 18.2294,
 'eval_gen_len': 23.8388,
 'eval_runtime': 1456.9487,
 'eval_samples_per_second': 0.332,
 'eval_steps_per_second': 0.083}

In [13]:
def generate_summary(test_samples, model):
    """Generate summaries for a batch of raw samples with ``model``.

    Relies on the notebook-level ``tokenizer`` and ``encoder_max_length``.

    Args:
        test_samples: mapping/dataset whose ``'document'`` entry holds the texts.
        model: model providing ``device`` and ``generate``.

    Returns:
        ``(outputs, output_str)``: what ``model.generate`` returned, and the
        same sequences decoded to strings with special tokens skipped.
    """
    encoded = tokenizer(
        test_samples['document'],
        padding='max_length',
        truncation=True,
        max_length=encoder_max_length,
        return_tensors='pt',
    )
    # Inputs must live on the same device as the model.
    device = model.device
    outputs = model.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )
    return outputs, tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [14]:
# %%wandb
# Fine-tune `model` on train_data using the settings in training_args.
trainer.train()



Step,Training Loss
50,6.554
100,5.5839
150,5.1701
200,4.8444
250,4.7955
300,4.6869
350,4.6339
400,4.6917
450,4.6285
500,4.6361


TrainOutput(global_step=1088, training_loss=4.719895671395695, metrics={'train_runtime': 33157.2285, 'train_samples_per_second': 0.131, 'train_steps_per_second': 0.033, 'total_flos': 1346978041036800.0, 'train_loss': 4.719895671395695, 'epoch': 1.0})

In [15]:
# Re-score the same validation set after fine-tuning, for comparison with the
# baseline evaluation that ran before trainer.train().
trainer.evaluate()

{'eval_loss': 4.26106071472168,
 'eval_rouge1': 31.7211,
 'eval_rouge2': 11.9106,
 'eval_rougeL': 24.9172,
 'eval_rougeLsum': 30.4376,
 'eval_gen_len': 23.7686,
 'eval_runtime': 1470.2226,
 'eval_samples_per_second': 0.329,
 'eval_steps_per_second': 0.082,
 'epoch': 1.0}

In [16]:
# Close the W&B run opened earlier (only exists when tracking is enabled).
if WANDB_INTEGRATION:
    wandb_run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/gen_len,█▁
eval/loss,█▁
eval/rouge1,▁█
eval/rouge2,▁█
eval/rougeL,▁█
eval/rougeLsum,▁█
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███

0,1
eval/gen_len,23.7686
eval/loss,4.26106
eval/rouge1,31.7211
eval/rouge2,11.9106
eval/rougeL,24.9172
eval/rougeLsum,30.4376
eval/runtime,1470.2226
eval/samples_per_second,0.329
eval/steps_per_second,0.082
train/epoch,1.0


In [17]:
# NOTE(review): this re-declares generate_summary, which an earlier cell
# already defines with equivalent behaviour; this later definition is the one
# in effect from here on. Consider keeping a single definition.
def generate_summary(test_samples, model):
    """Summarise the ``'document'`` texts of ``test_samples`` using ``model``.

    Uses the notebook-level ``tokenizer`` and ``encoder_max_length``.

    Args:
        test_samples: mapping/dataset exposing the source texts under ``'document'``.
        model: model providing ``device`` and ``generate``.

    Returns:
        Tuple ``(outputs, output_str)``: the result of ``model.generate`` and
        its decoded strings (special tokens skipped).
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=encoder_max_length,
        return_tensors='pt',
    )
    batch = tokenizer(test_samples['document'], **tokenizer_options)

    # Move both tensors onto the model's device before generating.
    ids_on_device = batch.input_ids.to(model.device)
    mask_on_device = batch.attention_mask.to(model.device)

    outputs = model.generate(ids_on_device, attention_mask=mask_on_device)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

# Reload the original pretrained checkpoint so its summaries can be compared
# with those of `model`, which the trainer has fine-tuned by this point.
model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Comparison set: the first 11 rows of the raw-text validation split.
test_samples = validation_data_txt.select(range(11))
# generate_summary returns (generated ids, decoded strings); keep the strings.
summaries_before_tuning = generate_summary(test_samples, model_before_tuning)[1]
summaries_after_tuning = generate_summary(test_samples, model)[1]



In [18]:
# Side-by-side view of the generated summaries, one row per sample.
print(
    tabulate(
        [
            (idx, after, before)
            for idx, (after, before) in enumerate(
                zip(summaries_after_tuning, summaries_before_tuning)
            )
        ],
        headers=["Id", "Summary after", "Summary before"],
    )
)

# Reference summaries for the same samples.
print("\nTarget summaries:\n")
print(
    tabulate(
        [(idx, text) for idx, text in enumerate(test_samples["summary"])],
        headers=["Id", "Target summary"],
    )
)

# The source documents the summaries were generated from.
print("\nSource documents:\n")
print(
    tabulate(
        [(idx, text) for idx, text in enumerate(test_samples["document"])],
        headers=["Id", "Document"],
    )
)

  Id  Summary after                                                                                                   Summary before
----  --------------------------------------------------------------------------------------------------------------  -----------------------------------------------------------------------------------------------------------------------------------------------
   0  Set goals. Set goals. Work with your child. Keep track of how your child is doing.                              As part of our series of letters from children with cerebral palsy, you can help you to achieve what goals they want to achieve for themselves.
   1  Locate your hands behind your neck to support your neck. Stretch your legs.                                     Try doing yoga to stretch your hamstrings to improve the flexibility of your legs.
   2  Find a way to make a cake. Mix the eggs and milk.                                                               If you want to make a ca

In [31]:
import os
from getpass import getpass

from huggingface_hub import login, logout

# SECURITY: a write-scoped Hugging Face token used to be pasted here in plain
# text. Anything committed with that literal must be treated as leaked: revoke
# the token in the Hub settings and issue a new one.
# The token is now read from the HF_TOKEN environment variable, with an
# interactive prompt (not echoed, not stored in the notebook) as the fallback.
access_token_write = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
login(token=access_token_write)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/rajithamuthukrishnan/.cache/huggingface/token
Login successful


In [33]:
# Publish the fine-tuned model and its tokenizer to the same Hub repository.
model.push_to_hub("RajithaMuthukrishnan/text-summariser-english")
tokenizer.push_to_hub("RajithaMuthukrishnan/text-summariser-english")
# # Load model
# # Reload with the same seq2seq class used at the top of the notebook; plain
# # AutoModel would load the network without its language-modelling head.
# from transformers import AutoModelForSeq2SeqLM
# model = AutoModelForSeq2SeqLM.from_pretrained("RajithaMuthukrishnan/text-summariser-english")

CommitInfo(commit_url='https://huggingface.co/RajithaMuthukrishnan/text-summariser-english/commit/d82da2a021b9991c2f76fea59ea98d4330f4a693', commit_message='Upload tokenizer', commit_description='', oid='d82da2a021b9991c2f76fea59ea98d4330f4a693', pr_url=None, pr_revision=None, pr_num=None)