In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# project folder used by the following cell is reachable.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os, sys

# Project root on the mounted Drive. Kept in one place so the working
# directory and the import path can never drift apart.
PROJECT_DIR = '/content/drive/My Drive/ĐATN'

# All relative paths used later in the notebook resolve against this folder.
os.chdir(PROJECT_DIR)

# Make local modules in the project folder importable. The membership check
# keeps the cell idempotent: re-running it no longer stacks duplicate
# entries on sys.path.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

In [None]:

# Install sacremoses, fastBPE and subword_nmt (versions are not pinned).
!pip3 install sacremoses fastBPE  subword_nmt
# The clone step is commented out, so a `fairseq/` checkout must already
# exist in the current working directory for the next line to succeed.
# !git clone https://github.com/pytorch/fairseq
# Install that local fairseq checkout in editable (development) mode.
!cd fairseq && sudo pip3 install --editable ./


In [None]:
import glob, random
import pandas as pd
import concurrent.futures
import numpy as np


In [None]:
%%capture
# Install the Hugging Face libraries; the %%capture magic on the first line
# of this cell hides the installer output.
# NOTE(review): `datasets` is pinned to 1.0.2 but `transformers` is not —
# pin a matching transformers version so fresh runtimes stay reproducible.
!pip install datasets==1.0.2
!pip install transformers
import datasets
import transformers

In [None]:
# Read the pre-tokenized train and validation splits from CSV
# (paths are relative to the current working directory).
train_df, val_df = map(
    pd.read_csv,
    (
        './vietnews-master/data/train_tokenized.csv',
        './vietnews-master/data/val_tokenized.csv',
    ),
)

# Quick sanity check: number of rows in each split.
print(*map(len, (train_df, val_df)))

In [None]:
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast, AutoTokenizer
import datasets
# Import only the name this cell needs; a wildcard import hides where names
# come from and can silently shadow other notebook variables.
from datasets import Dataset

# Wrap the pandas frames in `datasets.Dataset` objects so they can be
# processed with `.map(...)` in the cells below.
train_data = Dataset.from_pandas(train_df)
val_data = Dataset.from_pandas(val_df)

# No cell in this notebook creates `test_df`, so referencing it
# unconditionally raised NameError on a fresh kernel. Build the test split
# only when a frame has actually been loaded.
# TODO: load the test CSV next to the train/val CSVs if a test split is needed.
if "test_df" in globals():
    test_data = Dataset.from_pandas(test_df)
else:
    test_data = None
    print("test_df is not loaded; test_data set to None")

In [None]:
# Number of examples per batch.
batch_size = 16

# Fixed sequence lengths (in tokens) that inputs are padded/truncated to.
encoder_max_length = 256
decoder_max_length = 64

# PhoBERT tokenizer; `use_fast=False` requests the Python implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
def process_data_to_model_inputs(batch):
    """Tokenize a batch of article/summary pairs into model-ready fields.

    Reads ``batch["original"]`` and ``batch["summary"]`` and stores
    ``input_ids``, ``attention_mask``, ``decoder_input_ids``, ``labels`` and
    ``decoder_attention_mask`` on the same mapping, which is then returned.
    Articles are padded/truncated to ``encoder_max_length`` tokens and
    summaries to ``decoder_max_length`` tokens.
    """
    # Tokenizer format: [BOS] <text> [EOS]
    pad_id = tokenizer.pad_token_id
    source = tokenizer(
        batch["original"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    target = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = source.input_ids
    batch["attention_mask"] = source.attention_mask
    batch["decoder_input_ids"] = target.input_ids

    # Labels are the decoder ids with every pad position replaced by -100,
    # which masks padding out of the loss.
    masked_labels = []
    for sequence in target.input_ids:
        masked_labels.append([-100 if tok == pad_id else tok for tok in sequence])
    batch["labels"] = masked_labels

    batch["decoder_attention_mask"] = target.attention_mask
    return batch

In [None]:
def _tokenize_split(dataset):
    """Tokenize one split and expose its model fields as torch tensors.

    Applies ``process_data_to_model_inputs`` in batches of ``batch_size``,
    drops the raw ``file`` / ``original`` / ``summary`` columns, and sets the
    output format to torch for the five model input columns.
    """
    tokenized = dataset.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["file", "original", "summary"],
    )
    tokenized.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )
    return tokenized


# Same preprocessing for both splits, applied in the original order.
train_batch = _tokenize_split(train_data)
validation_batch = _tokenize_split(val_data)


In [None]:
import os

# `!export VAR=...` runs in a throw-away subshell and never reaches the
# kernel process, so the flag was silently ignored. Set it on the kernel's
# own environment instead; it must be in place before CUDA is initialised.
# This forces synchronous kernel launches (useful for debugging CUDA errors,
# but slower) — remove the line for full-speed training.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch.nn as nn
import torch
import numpy as np
from torch.utils import checkpoint
# NOTE(review): this rebinds the name `random` — imported as a module in an
# earlier cell — to the function `random.random`.
from random import random
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    EncoderDecoderConfig,
    EncoderDecoderModel,
    RobertaConfig,
    RobertaModel,
)

# Encoder: pretrained PhoBERT weights.
encoder_config = AutoConfig.from_pretrained("vinai/phobert-base")
encoder = AutoModel.from_pretrained("vinai/phobert-base")

# Decoder: PhoBERT architecture instantiated from its config (no pretrained
# weights are loaded here). The decoder flags have to be set BEFORE the
# module is built: layers are constructed from the config at instantiation
# time, so a decoder created without `add_cross_attention=True` has no
# cross-attention layers and cannot condition on the encoder output.
decoder_config = RobertaConfig.from_pretrained("vinai/phobert-base")
decoder_config.is_decoder = True
decoder_config.add_cross_attention = True
decoder = AutoModelForCausalLM.from_config(decoder_config)

# Combine both halves; `tie_encoder_decoder` asks the model to share weights
# between encoder and decoder.
encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
encoder_decoder_config.tie_encoder_decoder = True
bert2rnd = EncoderDecoderModel(encoder=encoder, decoder=decoder, config=encoder_decoder_config)
 


In [None]:
# Special tokens the encoder-decoder wrapper needs for training and generation.
bert2rnd.config.decoder_start_token_id = tokenizer.bos_token_id
bert2rnd.config.eos_token_id = tokenizer.eos_token_id
# Previously missing: without an explicit pad token the model has to fall
# back to another token when padding generated sequences.
bert2rnd.config.pad_token_id = tokenizer.pad_token_id
bert2rnd.config.vocab_size = bert2rnd.config.encoder.vocab_size

# Default generation settings stored on the model config.
# Reuse the label length from preprocessing instead of repeating the literal 64.
bert2rnd.config.max_length = decoder_max_length
bert2rnd.config.early_stopping = True
bert2rnd.config.no_repeat_ngram_size = 3
bert2rnd.config.length_penalty = 2.0
bert2rnd.config.num_beams = 4

In [None]:
# Install pinned git-python and sacrebleu, plus rouge_score (unpinned), into
# the runtime; the ROUGE metric is loaded in the next cell.
!pip3 install git-python==1.0.3
!pip3 install sacrebleu==1.4.12
!pip3 install rouge_score

In [None]:
from transformers import TrainingArguments
from dataclasses import dataclass, field
from typing import Optional
import datasets
# Load the ROUGE metric object through the `datasets` library; it is used by
# `compute_metrics` to score generated summaries.
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Score generated summaries against references with ROUGE-2.

    Args:
        pred: prediction object exposing ``predictions`` (generated token
            ids) and ``label_ids`` (reference token ids, with -100 marking
            positions that were masked out of the loss).

    Returns:
        dict with ``rouge_p``, ``rouge_r`` and ``rouge_f``: the precision,
        recall and F-measure of the ``mid`` ROUGE-2 aggregate, each rounded
        to 4 decimals.
    """
    pred_ids = pred.predictions
    # Work on a copy: replacing -100 in place would overwrite the caller's
    # label array, which may still be inspected after this function returns.
    labels_ids = np.array(pred.label_ids)

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # -100 is not a valid token id; map it to the pad token so it can be
    # decoded (and then dropped as a special token).
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_res = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge_p": round(rouge_res.precision, 4),
        "rouge_r": round(rouge_res.recall, 4),
        "rouge_f": round(rouge_res.fmeasure, 4),
    }

In [None]:
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """``TrainingArguments`` extended with sequence-to-sequence options.

    Each field carries its own description in ``metadata["help"]``.
    """

    # --- loss / sampling / evaluation ---
    label_smoothing: Optional[float] = field(
        default=0.0,
        metadata={"help": "The label smoothing epsilon to apply (if not zero)."},
    )
    sortish_sampler: bool = field(
        default=False,
        metadata={"help": "Whether to SortishSamler or not."},
    )
    predict_with_generate: bool = field(
        default=False,
        metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."},
    )
    adafactor: bool = field(
        default=False,
        metadata={"help": "whether to use adafactor"},
    )

    # --- regularisation overrides (help texts say they go into model.config) ---
    encoder_layerdrop: Optional[float] = field(
        default=None,
        metadata={"help": "Encoder layer dropout probability. Goes into model.config."},
    )
    decoder_layerdrop: Optional[float] = field(
        default=None,
        metadata={"help": "Decoder layer dropout probability. Goes into model.config."},
    )
    dropout: Optional[float] = field(
        default=None,
        metadata={"help": "Dropout probability. Goes into model.config."},
    )
    attention_dropout: Optional[float] = field(
        default=None,
        metadata={"help": "Attention dropout probability. Goes into model.config."},
    )

    # --- optimisation schedule ---
    lr_scheduler: Optional[str] = field(
        default="linear",
        metadata={"help": "Which lr scheduler to use."},
    )

In [None]:
from seq2seq_trainer import Seq2SeqTrainer

OUTPUT_DIR = './bert-model/'
# Create the checkpoint directory from Python: unlike `!mkdir`, this does not
# report an error when the cell is re-run and the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate by generating summaries so `compute_metrics` can score them.
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    logging_steps=200,
    save_steps=1500,
    # NOTE(review): `eval_steps` presumably only takes effect when step-wise
    # evaluation is enabled (`evaluate_during_training=True` or
    # `evaluation_strategy="steps"`, depending on the installed transformers
    # version) — confirm, otherwise no evaluation runs during training.
    eval_steps=7500,
    warmup_steps=3000,
    num_train_epochs=10,
    overwrite_output_dir=True,
    save_total_limit=50,
)

# train
trainer = Seq2SeqTrainer(
    model=bert2rnd,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_batch,
    eval_dataset=validation_batch,
)
trainer.train()