# Setup

In [None]:
!pip install -q transformers datasets evaluate
!pip install -q tokenizers
!pip install -q --upgrade accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m109.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m130.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.7 MB/s

In [None]:
import pandas as pd
import numpy as np
import torch

# Dataset

In [None]:
# Load both CSV shards and stack them row-wise into a single DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each shard
# keeps its own row labels, so labels repeat across shards and
# Dataset.from_pandas stores that index as an extra `__index_level_0__` column.
data_mix1 = pd.read_csv('complete_data_mix1.csv')
data_mix2 = pd.read_csv('complete_data_mix2.csv')
data = pd.concat([data_mix1, data_mix2], axis=0, ignore_index=True)
data.head()

Unnamed: 0,sub,summary
0,"- In a perfect world, the relationship between...",- Public opinion should reflect individual pri...
1,This is my first narrative feature. It's calle...,- First narrative feature called Every Secret ...
2,"The emotional affair, as opposed to the physic...",- Emotional affair is being emotionally invest...
3,- You might think that 'collective illusions' ...,- Collective illusions harm social trust\n- On...
4,There’s 21 strategies in habit change and they...,- There are 21 strategies for habit change\n- ...


In [None]:
from datasets import Dataset
# Convert the combined pandas DataFrame into a Hugging Face Dataset.
custom_dataset = Dataset.from_pandas(data)
# Bare expression so the notebook displays the dataset's features and row count.
custom_dataset

Dataset({
    features: ['sub', 'summary', '__index_level_0__'],
    num_rows: 4138
})

# Preprocessing

In [None]:
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer here and the model loaded further down.
checkpoint = "t5-base"
# Fetch the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples, max_input_length=4096, max_target_length=2048):
    """Tokenize source texts and reference summaries for seq2seq training.

    Accepts either a batch (as passed by ``Dataset.map(..., batched=True)``),
    where ``examples["sub"]`` and ``examples["summary"]`` are lists of strings,
    or a single record such as ``custom_dataset[0]``, where they are plain
    strings.

    Args:
        examples: Mapping with "sub" (source text) and "summary" (target text).
        max_input_length: Token cap applied to the prefixed source text.
        max_target_length: Token cap applied to the summary.

    Returns:
        The tokenizer output for the prefixed inputs, with the summary token
        ids stored under the "labels" key.
    """
    prefix = "summarize: "
    sources = examples["sub"]
    if isinstance(sources, str):
        # Single record: prefix the document as a whole. Iterating over the
        # string would otherwise produce one "input" per character.
        inputs = prefix + sources
    else:
        inputs = [prefix + doc for doc in sources]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Peek at the first raw record before tokenization.
custom_dataset[0]

{'sub': "- In a perfect world, the relationship between private opinion and public opinion would be basically like a mirror. At its best, public opinion holds a mirror to us, and it reflects exactly who we are. What 'collective illusions' do to that relationship is turn it into a funhouse of mirrors. You're still seeing yourself, but it is distorted in ways that make it almost unrecognizable. A collective illusion is a situation where most people in a group go along with an idea that they don't agree with, simply because they incorrectly believe that most people in the group agree with it. The majority of people in a group believe the majority thinks something that they don't. They lead individuals to make decisions that are contrary to their private values. And as a result, the entire group can end up doing something that almost nobody wants, which is fatal to free society. My think tank, Populace, studies collective illusions, and uses what we call 'private opinion methods,' which ar

In [None]:
# Sanity check: list the fields preprocess_function produces for the first record.
preprocess_function(custom_dataset[0]).keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
# Tokenize every record in batches, then hold out 20% of the rows for evaluation.
# A fixed seed keeps the train/test partition identical across kernel restarts,
# so metrics from separate runs are comparable.
SPLIT_SEED = 42
tokenized_custom_dataset = custom_dataset.map(preprocess_function, batched=True)
tokenized_custom_dataset = tokenized_custom_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

Map:   0%|          | 0/4138 [00:00<?, ? examples/s]

# Evaluate

In [None]:
import evaluate
# Load the ROUGE metric; compute_metrics below uses it to score decoded predictions.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the token ids.

    Returns:
        Dict mapping each metric name (the ROUGE results plus "gen_len") to
        its value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Keep only the generated token ids when extra outputs are attached.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    pad_id = tokenizer.pad_token_id
    # -100 marks positions to ignore and is not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in both arrays before decoding;
    # predictions can carry it too when batches are padded to a common length.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction counted in non-pad tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Training

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq model for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorForSeq2Seq
# Collator that pads inputs and labels batch by batch.
# `model` must be the loaded model instance, not the checkpoint name: the
# collator only builds decoder_input_ids from the labels when the object it was
# given exposes prepare_decoder_input_ids_from_labels, which a plain string
# never does.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
!git config --global credential.helper store
from huggingface_hub import login
login('', add_to_git_credential=True)

In [None]:
# Register the packagecloud apt repository for git-lfs, install the package,
# and run `git lfs install` to enable it for this user.
# NOTE(review): the first line pipes a remote script straight into bash — confirm
# the source is trusted before running on anything other than a throwaway VM.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
!apt install git-lfs -y
!git lfs install

In [None]:
# Cap on generated tokens during evaluation. Without an explicit cap the logged
# "Gen Len" stays at 19.0 in every epoch, which suggests generation is stopping
# at the model's default length and multi-bullet summaries are cut short before
# ROUGE is computed.
# NOTE(review): 256 is a starting point — check it against the length of the
# reference summaries and raise it if they are longer.
GENERATION_MAX_LENGTH = 256

# Hyperparameters for fine-tuning: evaluate once per epoch, score with generated
# text (predict_with_generate), train in fp16, and push results to the Hub.
training_args = Seq2SeqTrainingArguments(
    output_dir="./BulletBriefT5",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    generation_max_length=GENERATION_MAX_LENGTH,
    fp16=True,
    push_to_hub=True,
)

# Wire the model, tokenized splits, collator, and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_custom_dataset["train"],
    eval_dataset=tokenized_custom_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

/content/./BulletBriefT5 is already a clone of https://huggingface.co/Darakarn/BulletBriefT5. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Run fine-tuning with the arguments configured above (evaluation every epoch,
# push_to_hub enabled).
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.9052,1.734827,0.1443,0.0637,0.1187,0.1187,19.0
2,1.8613,1.717063,0.1444,0.0641,0.1188,0.119,19.0
3,1.8572,1.71764,0.1438,0.0636,0.118,0.1181,19.0
4,1.8407,1.712846,0.1442,0.064,0.1185,0.1186,19.0
5,1.8364,1.710687,0.1441,0.0639,0.1184,0.1185,19.0


Several commits (21) will be pushed upstream.
Several commits (22) will be pushed upstream.
Several commits (23) will be pushed upstream.
Several commits (24) will be pushed upstream.
Several commits (25) will be pushed upstream.
Several commits (26) will be pushed upstream.
Several commits (27) will be pushed upstream.
Several commits (28) will be pushed upstream.
Several commits (29) will be pushed upstream.
Several commits (30) will be pushed upstream.


TrainOutput(global_step=5175, training_loss=1.8574653308518267, metrics={'train_runtime': 1824.8702, 'train_samples_per_second': 5.669, 'train_steps_per_second': 2.836, 'total_flos': 3431755795464192.0, 'train_loss': 1.8574653308518267, 'epoch': 5.0})