# Data preprocessing

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import datasets
import pandas as pd

from datasets import Dataset
from tqdm import tqdm
from underthesea import sent_tokenize as sent_tokenize_uts, word_tokenize as word_tokenize_uts

def sent_tokenize(doc):
    """Split ``doc`` into sentences by delegating to underthesea's ``sent_tokenize``."""
    sentences = sent_tokenize_uts(doc)
    return sentences

def word_tokenize(text, format='list'):
    """Tokenize ``text`` into words with underthesea's ``word_tokenize``.

    ``format`` is forwarded unchanged to the underlying tokenizer
    (default ``'list'``).
    """
    tokens = word_tokenize_uts(text, format=format)
    return tokens


def load_data_as_df(dir, limit=100):
    """Load tokenized article files from a directory into a DataFrame.

    Each file is expected to hold blank-line-separated sections in the order
    title, summary, body and, optionally, an image caption. Newlines inside a
    section are replaced by single spaces.

    Args:
        dir: Path of the directory whose entries are read.
        limit: Maximum number of directory entries to read. Defaults to 100,
            the cap this loader previously hard-coded; pass ``None`` to read
            every entry.

    Returns:
        pandas.DataFrame with one row per successfully parsed file and the
        columns ``name``, ``title``, ``summary``, ``body`` and ``img_caption``
        (``img_caption`` is ``''`` when the file has only three sections).

    Entries that cannot be opened or decoded as UTF-8, and files that do not
    contain exactly three or four sections, are skipped.
    """
    res = {
      'name': [],
      'title': [],
      'summary': [],
      'body': [],
      'img_caption': [],
    }
    for file_name in tqdm(os.listdir(dir)[:limit]):
        file = os.path.join(dir, file_name)
        try:
            with open(file, encoding='utf-8') as f:
                raw_text = f.read()
        except (OSError, UnicodeDecodeError):
            # Unreadable entry (e.g. a sub-directory or a badly encoded file): skip it.
            continue

        sections = [section.replace('\n', ' ') for section in raw_text.rstrip().split("\n\n")]

        if len(sections) == 3:
            title, summary, body = sections
            img_caption = ''
        elif len(sections) == 4:
            title, summary, body, img_caption = sections
        else:
            # Malformed file: skip it instead of re-appending the previous
            # file's fields under this file's name.
            continue

        res['name'].append(file_name)
        res['title'].append(title)
        res['summary'].append(summary)
        res['body'].append(body)
        res['img_caption'].append(img_caption)

    return pd.DataFrame.from_dict(res)

# Validation split: read the files into a DataFrame, then wrap it as a Dataset.
valid_data_df = load_data_as_df('../data/val_tokenized')
valid_data = Dataset.from_pandas(valid_data_df)

# Training split: same two steps.
train_data_df = load_data_as_df('../data/train_tokenized')
train_data = Dataset.from_pandas(train_data_df)

2021-10-11 17:12:52.631422: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64
2021-10-11 17:12:52.631468: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
100%|██████████| 100/100 [00:00<00:00, 19491.17it/s]
100%|██████████| 100/100 [00:00<00:00, 17786.79it/s]


In [2]:
from transformers import RobertaTokenizerFast
# Load the fast RoBERTa tokenizer stored under the 'Zayt/viRoberta-l6-h384-word-cased' checkpoint name.
# The same object is used below for length statistics, model inputs and metric decoding.
tokenizer = RobertaTokenizerFast.from_pretrained('Zayt/viRoberta-l6-h384-word-cased')

In [3]:
# Number of training examples used for the length statistics below.
# Capped at the dataset size: `train_data` holds fewer than 1000 rows when the
# loader reads only a limited number of files, and `select(range(sample_size))`
# cannot index past the end of the dataset.
sample_size = min(1000, len(train_data))

def map_to_length(x):
    """Annotate one example with its tokenized lengths and over-length flags.

    Adds ``article_len`` / ``summary_len`` (number of token ids produced by the
    tokenizer for ``body`` / ``summary``) and the 0/1 flags
    ``article_longer_512``, ``summary_longer_64`` and ``summary_longer_128``,
    then returns the same example.
    """
    article_tokens = len(tokenizer(x["body"]).input_ids)
    summary_tokens = len(tokenizer(x["summary"]).input_ids)

    x["article_len"] = article_tokens
    x["article_longer_512"] = 1 if article_tokens > 512 else 0
    x["summary_len"] = summary_tokens
    x["summary_longer_64"] = 1 if summary_tokens > 64 else 0
    x["summary_longer_128"] = 1 if summary_tokens > 128 else 0
    return x

# Take the first `sample_size` training examples and add the length columns
# computed by `map_to_length`, using 4 worker processes.
data_stats = train_data.select(range(sample_size)).map(map_to_length, num_proc=4)

def compute_and_print_stats(x):
    """Print mean lengths and over-length fractions for a full-sample batch.

    ``x`` is a batch holding the columns written by ``map_to_length``. Nothing
    is printed unless the batch contains exactly ``sample_size`` rows. The
    values are plain averages (fractions, not multiplied by 100). The function
    always returns ``None``.
    """
    if len(x["article_len"]) != sample_size:
        return

    columns = (
        "article_len",
        "article_longer_512",
        "summary_len",
        "summary_longer_64",
        "summary_longer_128",
    )
    averages = [sum(x[column]) / sample_size for column in columns]
    template = "Article Mean: {}, %-Articles > 512:{}, Summary Mean:{}, %-Summary > 64:{}, %-Summary > 128:{}"
    print(template.format(*averages))

# Run the stats printer over the sampled data in batched mode.
# NOTE(review): batch_size=-1 is presumably meant to hand the whole dataset to the
# function as one batch (the printer only fires for a batch of `sample_size` rows) —
# confirm against the installed `datasets` version.
output = data_stats.map(
  compute_and_print_stats, 
  batched=True,
  batch_size=-1,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

Article Mean: 729.34, %-Articles > 512:0.6, Summary Mean:52.42, %-Summary > 64:0.16, %-Summary > 128:0.0


In [4]:
# Token budgets used when padding/truncating article bodies (encoder side)
# and summaries (decoder side) in `process_data_to_model_inputs`.
encoder_max_length=512
decoder_max_length=128

def process_data_to_model_inputs(batch, tokenizer=tokenizer):
    """Tokenize a batch of articles and summaries into model input columns.

    ``batch["body"]`` is padded/truncated to ``encoder_max_length`` tokens and
    ``batch["summary"]`` to ``decoder_max_length`` tokens. The batch gains
    ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
    ``decoder_attention_mask`` and ``labels`` and is returned. ``labels``
    mirrors ``decoder_input_ids`` with every PAD token id replaced by -100.
    """
    pad_id = tokenizer.pad_token_id

    encoded_body = tokenizer(
        batch["body"], padding="max_length", truncation=True, max_length=encoder_max_length
    )
    encoded_summary = tokenizer(
        batch["summary"], padding="max_length", truncation=True, max_length=decoder_max_length
    )

    batch["input_ids"] = encoded_body.input_ids
    batch["attention_mask"] = encoded_body.attention_mask
    batch["decoder_input_ids"] = encoded_summary.input_ids
    batch["decoder_attention_mask"] = encoded_summary.attention_mask

    # Labels are the summary token ids themselves (per the original author's
    # note, the model takes care of shifting them); PAD positions become -100
    # so that they are ignored.
    masked_labels = []
    for summary_ids in encoded_summary.input_ids:
        masked_labels.append([-100 if token == pad_id else token for token in summary_ids])
    batch["labels"] = masked_labels

    return batch

In [5]:
# batch_size = 16
# Batch size for the tokenization `map` calls in this cell; the training
# arguments further down read the same variable.
batch_size=4

# Tokenize the training split in batches and drop the raw text columns.
# NOTE: `train_data` / `valid_data` are rebound here, so this cell is expected to fail
# if re-run without re-running the loading cell (the removed columns no longer exist).
train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["name", 'title', 'summary', 'body', 'img_caption']
)
# Expose only the model-input columns, formatted as torch tensors.
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# Same processing for the validation split.
valid_data = valid_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["name", 'title', 'summary', 'body', 'img_caption']
)
valid_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

# Model

In [6]:
from transformers import EncoderDecoderModel

In [7]:
# Build an encoder-decoder model whose encoder and decoder are both initialised
# from the same pretrained checkpoint name.
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("Zayt/viRoberta-l6-h384-word-cased", "Zayt/viRoberta-l6-h384-word-cased")

Some weights of the model checkpoint at Zayt/viRoberta-l6-h384-word-cased were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Zayt/viRoberta-l6-h384-word-cased and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [8]:
# Special-token ids are taken from the tokenizer; the vocabulary size is copied
# from the encoder's config.
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.eos_token_id = tokenizer.sep_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id
bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size

# Generation settings stored on the model config.
# NOTE(review): presumably these are the defaults picked up when summaries are
# generated during evaluation (predict_with_generate) — confirm.
bert2bert.config.max_length = 130
bert2bert.config.min_length = 56
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

# Training

In [10]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration. The active step counts (log every 2, save/evaluate every 15)
# are small debugging values; the commented-out lines below them hold larger alternatives.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True, 
    output_dir="./save/checkpoint",
    logging_steps=2,
    save_steps=15,
    eval_steps=15,
    # logging_steps=1000,
    # save_steps=500,
    # eval_steps=7500,
    # warmup_steps=2000,
    # save_total_limit=3,
)

# ROUGE metric object used by `compute_metrics`.
# NOTE(review): `datasets.load_metric` is deprecated in later `datasets` releases — confirm
# against the pinned version before upgrading.
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for generated summaries.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids in which ignored positions
            are marked with -100).

    Returns:
        dict with the keys ``rouge1_fmeasure``, ``rouge2_fmeasure`` and
        ``rougeL_fmeasure``, each the mid F-measure rounded to 4 decimals.
    """
    pred_ids = pred.predictions
    # Work on a copy: rewriting -100 in place would also alter the label array
    # held by the caller's prediction object.
    labels_ids = pred.label_ids.copy()

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # -100 is not a valid token id; map it back to PAD so the labels can be decoded.
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2", "rouge1", "rougeL"])

    return {
        "rouge1_fmeasure": round(rouge_output['rouge1'].mid.fmeasure, 4),
        "rouge2_fmeasure": round(rouge_output['rouge2'].mid.fmeasure, 4),
        "rougeL_fmeasure": round(rouge_output['rougeL'].mid.fmeasure, 4),
    }

In [11]:
# Instantiate the trainer with the model, tokenizer, training arguments, ROUGE metric
# function and the tokenized train/validation splits, then start training.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=valid_data,
)
trainer.train()

Using amp fp16 backend
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 75


Step,Training Loss,Validation Loss,Rouge1 Fmeasure,Rouge2 Fmeasure,Rougel Fmeasure
15,5.9098,5.620352,0.1865,0.0171,0.1338


***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
Saving model checkpoint to ./save/checkpoint/checkpoint-15
Configuration saved in ./save/checkpoint/checkpoint-15/config.json
Model weights saved in ./save/checkpoint/checkpoint-15/pytorch_model.bin
tokenizer config file saved in ./save/checkpoint/checkpoint-15/tokenizer_config.json
Special tokens file saved in ./save/checkpoint/checkpoint-15/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4


KeyboardInterrupt: 

In [None]:
# NOTE(review): this cell was never executed. The magic normally requires
# `%load_ext tensorboard` first and a `--logdir <path>` argument — confirm and complete.
%tensorboard

# Test

In [7]:

# # Data preprocessing

# %%
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import datasets
import pandas as pd
import random
random.seed(22)

from datasets import Dataset
from tqdm import tqdm
from underthesea import sent_tokenize as sent_tokenize_uts, word_tokenize as word_tokenize_uts

def sent_tokenize(doc):
    """Return the sentences of ``doc`` as produced by underthesea's ``sent_tokenize``."""
    result = sent_tokenize_uts(doc)
    return result

def word_tokenize(text, format='list'):
    """Return the words of ``text`` as produced by underthesea's ``word_tokenize``.

    ``format`` (default ``'list'``) is passed straight through to the
    underlying tokenizer.
    """
    result = word_tokenize_uts(text, format=format)
    return result


def load_data_as_df(dir, limit=None):
    """Load tokenized article files from a directory into a DataFrame.

    Each file is expected to hold blank-line-separated sections in the order
    title, summary, body and, optionally, an image caption. Newlines inside a
    section are replaced by single spaces.

    Args:
        dir: Path of the directory whose entries are read.
        limit: Optional maximum number of directory entries to read. Defaults
            to ``None``, which reads every entry.

    Returns:
        pandas.DataFrame with one row per successfully parsed file and the
        columns ``name``, ``title``, ``summary``, ``body`` and ``img_caption``
        (``img_caption`` is ``''`` when the file has only three sections).

    Entries that cannot be opened or decoded as UTF-8, and files that do not
    contain exactly three or four sections, are skipped.
    """
    res = {
      'name': [],
      'title': [],
      'summary': [],
      'body': [],
      'img_caption': [],
    }
    for file_name in tqdm(os.listdir(dir)[:limit]):
        file = os.path.join(dir, file_name)
        try:
            with open(file, encoding='utf-8') as f:
                raw_text = f.read()
        except (OSError, UnicodeDecodeError):
            # Unreadable entry (e.g. a sub-directory or a badly encoded file): skip it.
            continue

        sections = [section.replace('\n', ' ') for section in raw_text.rstrip().split("\n\n")]

        if len(sections) == 3:
            title, summary, body = sections
            img_caption = ''
        elif len(sections) == 4:
            title, summary, body, img_caption = sections
        else:
            # Malformed file: skip it instead of re-appending the previous
            # file's fields under this file's name.
            continue

        res['name'].append(file_name)
        res['title'].append(title)
        res['summary'].append(summary)
        res['body'].append(body)
        res['img_caption'].append(img_caption)

    return pd.DataFrame.from_dict(res)

# Read the test split into a DataFrame.
test_data_df = load_data_as_df('../data/test_tokenized')

# Optional sub-sampling of the test set, left disabled:
# test_data = Dataset.from_pandas(test_data_df.sample(n=5000, random_state=222))
# Wrap the full DataFrame as a Dataset.
test_data = Dataset.from_pandas(test_data_df)


# %%
from transformers import RobertaTokenizerFast
# Load the fast RoBERTa tokenizer stored under the 'Zayt/viRoberta-l6-h384-word-cased' checkpoint name.
# The same object is used below for building model inputs and for decoding in the metric function.
tokenizer = RobertaTokenizerFast.from_pretrained('Zayt/viRoberta-l6-h384-word-cased')


# %%
# Token budgets used when padding/truncating article bodies (encoder side)
# and summaries (decoder side) in `process_data_to_model_inputs`.
encoder_max_length=512
decoder_max_length=128

def process_data_to_model_inputs(batch, tokenizer=tokenizer):
    """Turn a batch of raw text into padded encoder/decoder input columns.

    Article bodies are tokenized to exactly ``encoder_max_length`` ids and
    summaries to exactly ``decoder_max_length`` ids (padding or truncating as
    needed). The returned batch carries ``input_ids``, ``attention_mask``,
    ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels``, where
    ``labels`` equals ``decoder_input_ids`` except that PAD ids become -100.
    """
    article_enc = tokenizer(
        batch["body"], padding="max_length", truncation=True, max_length=encoder_max_length
    )
    summary_enc = tokenizer(
        batch["summary"], padding="max_length", truncation=True, max_length=decoder_max_length
    )
    pad_id = tokenizer.pad_token_id

    # PAD positions are replaced by -100 so that they are ignored; the other
    # label ids match the decoder inputs one-to-one (per the original author's
    # note, the model shifts the labels itself).
    labels = [
        [-100 if token == pad_id else token for token in summary_ids]
        for summary_ids in summary_enc.input_ids
    ]

    batch["input_ids"] = article_enc.input_ids
    batch["attention_mask"] = article_enc.attention_mask
    batch["decoder_input_ids"] = summary_enc.input_ids
    batch["decoder_attention_mask"] = summary_enc.attention_mask
    batch["labels"] = labels

    return batch


# %%
# batch_size = 16
# Batch size for the tokenization `map` call below.
batch_size=8

# Tokenize the test split in batches and drop the raw text columns.
test_data = test_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["name", 'title', 'summary', 'body', 'img_caption']
)
# Expose only the model-input columns, formatted as torch tensors.
test_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# %% [markdown]
# # Model

# %%
from transformers import EncoderDecoderModel


# %%
# bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("Zayt/viRoberta-l6-h384-word-cased", "Zayt/viRoberta-l6-h384-word-cased")
# Load the model from a saved checkpoint directory instead of the base pretrained weights.
# NOTE(review): the checkpoint path is hard-coded and relative to the working directory.
bert2bert = EncoderDecoderModel.from_pretrained('save/checkpoint/checkpoint-9882')


# %%
# Special-token ids are taken from the tokenizer; the vocabulary size is copied
# from the encoder's config.
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.eos_token_id = tokenizer.sep_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id
bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size

# Generation settings stored on the model config.
# NOTE(review): presumably these are the defaults picked up when summaries are
# generated during evaluation (predict_with_generate) — confirm.
bert2bert.config.max_length = 130
bert2bert.config.min_length = 56
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

# %% [markdown]
# # Evaluation

# %%
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Arguments for the evaluation run; results are written under ./save/test_result.
# NOTE(review): only `trainer.evaluate()` is called in this cell, so the
# training-oriented options (warmup_ratio, save_strategy, save_total_limit, ...)
# are presumably unused here — confirm.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    output_dir="./save/test_result",
    per_device_eval_batch_size = 8,
    dataloader_num_workers = 4,
# device = 'cuda:1',
    # do_train = True,
    do_eval = True,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    warmup_ratio = 0.1,
    seed = 22,
    log_level='info',
    logging_strategy = "steps",
    logging_steps = 50,
    save_total_limit = 4,
    load_best_model_at_end=False,
)

# ROUGE metric object used by `compute_metrics`.
# NOTE(review): `datasets.load_metric` is deprecated in later `datasets` releases — confirm
# against the pinned version before upgrading.
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for generated summaries.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids in which ignored positions
            are marked with -100).

    Returns:
        dict with the keys ``rouge1_fmeasure``, ``rouge2_fmeasure`` and
        ``rougeL_fmeasure``, each the mid F-measure rounded to 4 decimals.
    """
    pred_ids = pred.predictions
    # Work on a copy: rewriting -100 in place would also alter the label array
    # held by the caller's prediction object.
    labels_ids = pred.label_ids.copy()

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # -100 is not a valid token id; map it back to PAD so the labels can be decoded.
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2", "rouge1", "rougeL"])

    return {
        "rouge1_fmeasure": round(rouge_output['rouge1'].mid.fmeasure, 4),
        "rouge2_fmeasure": round(rouge_output['rouge2'].mid.fmeasure, 4),
        "rougeL_fmeasure": round(rouge_output['rougeL'].mid.fmeasure, 4),
    }


# %%
# Instantiate the trainer for evaluation only: no training dataset is passed, and
# the tokenized test split is used as the evaluation dataset.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    # train_dataset=train_data,
    eval_dataset=test_data,
)
# Run evaluation and keep the returned metrics for display in later cells.
eval_res = trainer.evaluate()


100%|██████████| 22644/22644 [00:01<00:00, 17315.08it/s]
loading file https://huggingface.co/Zayt/viRoberta-l6-h384-word-cased/resolve/main/vocab.json from cache at /mnt/disk1/.cache/torch/transformers/b0e30d4e65f5ace2978bfc0ac11349b3d7a6e99612a56d3c6a2b1b101a8c1bea.91e4c69b824b5019ca75d7df6f45ac3bc2a7c105ba0783921c380145ab741b43
loading file https://huggingface.co/Zayt/viRoberta-l6-h384-word-cased/resolve/main/merges.txt from cache at /mnt/disk1/.cache/torch/transformers/217d130a1f02d57e1ef004e1fa642b58048d74fae05139dffb70aaa250cd23f5.310710ce1c51d834d45706c163dd9bbbcebc1a68cfe7aefb469cc4eb810051fe
loading file https://huggingface.co/Zayt/viRoberta-l6-h384-word-cased/resolve/main/tokenizer.json from cache at /mnt/disk1/.cache/torch/transformers/4ffeb3a7a3adab541dec5435b618435910b86a0e5e575fa22cf057c3807080f0.c7aa177925a71126a79c08a0ae442213314561ffca32d063a268786ab2cca2b6
loading file https://huggingface.co/Zayt/viRoberta-l6-h384-word-cased/resolve/main/added_tokens.json from cache at

  0%|          | 0/625 [00:00<?, ?ba/s]

loading configuration file save/checkpoint/checkpoint-9882/config.json
Model config EncoderDecoderConfig {
  "architectures": [
    "EncoderDecoderModel"
  ],
  "decoder": {
    "_name_or_path": "Zayt/viRoberta-l6-h384-word-cased",
    "add_cross_attention": true,
    "architectures": [
      "RobertaForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 384,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "intermediate_size": 1536,
    "is_decoder": true,
    "is_e

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Display the metrics returned by `trainer.evaluate()`.
eval_res

{'eval_loss': 3.181546449661255,
 'eval_rouge1_fmeasure': 0.5256,
 'eval_rouge2_fmeasure': 0.1606,
 'eval_rougeL_fmeasure': 0.2999,
 'eval_runtime': 5183.5734,
 'eval_samples_per_second': 4.368,
 'eval_steps_per_second': 0.546}

In [8]:
# Display the metrics returned by `trainer.evaluate()`.
# NOTE(review): the execution counts show this output comes from a different run than the cell above.
eval_res

{'eval_loss': 3.020521402359009,
 'eval_rouge1_fmeasure': 0.5375,
 'eval_rouge2_fmeasure': 0.1751,
 'eval_rougeL_fmeasure': 0.3076,
 'eval_runtime': 1105.5507,
 'eval_samples_per_second': 4.523,
 'eval_steps_per_second': 0.565}