In [1]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import os
import re

  from pandas.core import (


In [2]:
def normalize_text(paragraph):
    res = []
    for text in paragraph:
        # Loại bỏ ký tự tab, xuống dòng
        text = re.sub(r'[\t\n]', ' ', text)
        # Loại bỏ các cặp {}, [], ()
        text = re.sub(r'[\{\}\[\]\(\)]', ' ', text)
        # Loại bỏ teen code/emoticon kiểu :D, :)), :v, :3, :P, :|, :((, :)), :>, :<, :o, :O, :x, :X, :d, :p, :V, ...
        text = re.sub(r'(:\)|:\(|:D|:P|:p|:v|:V|:3|:o|:O|:x|:X|:\||:>|:<|:\)+|:\(+)', ' ', text)
        # Loại bỏ 2 dấu câu đứng cạnh nhau (giữ lại 1 dấu)
        text = re.sub(r'([.,!?;:\"\']){2,}', r'\1', text)
        # Loại bỏ các dấu câu mà giữa chúng là khoảng trắng (vd: "; .", ", !")
        text = re.sub(r'([.,!?;:\"\'])\s+([.,!?;:\"\'])', r'\2', text)
        # Loại bỏ ký tự không phải chữ cái tiếng Việt, số, dấu câu cơ bản, khoảng trắng
        text = re.sub(r"[^A-Za-zÀ-ỹà-ỹ0-9.,!?;:'\" \-]", '', text)
        # Chuẩn hóa khoảng trắng
        text = re.sub(r'\s+', ' ', text).strip()
        res.append(text)
    return res


In [3]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")

In [4]:
def preprocess_function(examples):
    inputs = normalize_text(examples["inputs"])
    labels = normalize_text(examples["labels"])
    model_inputs = tokenizer(
        inputs, max_length=1024, truncation=True, padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            labels, max_length=1024, truncation=True, padding="max_length"
        )
        # Đổi tất cả pad_token_id thành -100
    labels_ids = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels["input_ids"]
    ]
    model_inputs['labels'] = labels_ids
    model_inputs['input_ids'] = model_inputs['input_ids']
    return model_inputs

In [5]:
task1 = 'D:/Downloads/DS_AI/VDT/MoE/moe/data/raw/Wiki'
task2 = 'D:/Downloads/DS_AI/VDT/MoE/moe/data/raw/VNews'

In [13]:
task3 = 'D:/Downloads/DS_AI/VDT/MoE/moe/data/raw/iwslt15'

In [6]:
def create_tokenized_dataset(file_name, Wiki = True, Vnews = True):
    input_lines = []
    label_lines = []
    if Wiki:
        with open(f'{task1}/{file_name}', encoding='utf-8') as file: 
            for i, line in enumerate(file):
                if i == 0:
                    continue 
                line = line.strip().split('\t')
                input_lines.append(line[1])
                label_lines.append(line[2])
    if Vnews:
        with open(f'{task2}/{file_name}', encoding='utf-8') as file: 
            for i, line in enumerate(file):
                if i == 0:
                    continue 
                line = line.strip().split('\t')
                input_lines.append(line[3])
                label_lines.append(line[2]) 
    dict_obj = {'inputs': input_lines, 'labels': label_lines}
    dataset = Dataset.from_dict(dict_obj)
    tokenized_datasets = dataset.map(preprocess_function, batched = True, remove_columns=['inputs'])
    return dataset, tokenized_datasets

In [7]:
def create_tokenized_MT_dataset(en_file, vi_file):
    en_lines = []
    vi_lines = []
    with open(f'{task3}/{en_file}', encoding='utf-8') as file: 
        for i, line in enumerate(file):
            line = line.strip()
            en_lines.append(line)
    with open(f'{task3}/{vi_file}', encoding='utf-8') as file: 
        for i, line in enumerate(file):
            line = line.strip()
            vi_lines.append(line)
    dict_obj = {'inputs': en_lines, 'labels': vi_lines}
    dataset = Dataset.from_dict(dict_obj)
    tokenized_datasets = dataset.map(preprocess_function, batched = True, remove_columns=['inputs'])
    return dataset, tokenized_datasets

# Abstractive Summarization

## Full data, padding left

In [6]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
tokenizer.padding_side = "left"

In [9]:
_, train_dataset = create_tokenized_dataset('train.tsv')
_, eval_dataset = create_tokenized_dataset('valid.tsv')
_, test_dataset = create_tokenized_dataset('test.tsv')

Map:   0%|          | 0/112841 [00:00<?, ? examples/s]



Map:   0%|          | 0/24141 [00:00<?, ? examples/s]

Map:   0%|          | 0/26414 [00:00<?, ? examples/s]

In [10]:
train_dataset.save_to_disk(r"D:\Downloads\DS_AI\VDT\MoE\moe\data\tokenized_left_full\train")
eval_dataset.save_to_disk(r"D:\Downloads\DS_AI\VDT\MoE\moe\data\tokenized_left_full\eval")
test_dataset.save_to_disk(r"D:\Downloads\DS_AI\VDT\MoE\moe\data\tokenized_left_full\test")

Saving the dataset (0/2 shards):   0%|          | 0/112841 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26414 [00:00<?, ? examples/s]

In [11]:
special_token_ids = set([
    tokenizer.pad_token_id,
    tokenizer.eos_token_id,
    tokenizer.bos_token_id if hasattr(tokenizer, "bos_token_id") else None,
    tokenizer.sep_token_id if hasattr(tokenizer, "sep_token_id") else None,
    tokenizer.cls_token_id if hasattr(tokenizer, "cls_token_id") else None,
])
special_token_ids = {tid for tid in special_token_ids if tid is not None}

count = 0
for sample in train_dataset:
    for tid in sample['input_ids']:
        if tid not in special_token_ids:
            count += 1
print(f"Số token không phải token đặc biệt: {count/1e6} M")


Số token không phải token đặc biệt: 70.744682 M


## Wiki padding left

In [10]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
tokenizer.padding_side = "left"

In [11]:
_, train_dataset = create_tokenized_dataset('train.tsv', Vnews=False)
_, eval_dataset = create_tokenized_dataset('valid.tsv', Vnews=False)
_, test_dataset = create_tokenized_dataset('test.tsv', Vnews=False)

Map:   0%|          | 0/13707 [00:00<?, ? examples/s]



Map:   0%|          | 0/1957 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

In [16]:
train_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_left_wiki/train")
eval_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_left_wiki/eval")
test_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_left_wiki/test")

Saving the dataset (0/1 shards):   0%|          | 0/13707 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1957 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3916 [00:00<?, ? examples/s]

In [13]:
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 13707
})

## Full data, padding right

In [8]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
tokenizer.padding_side = "right"

In [9]:
_, train_dataset = create_tokenized_dataset('train.tsv')

Map:   0%|          | 0/112841 [00:00<?, ? examples/s]



KeyboardInterrupt: 

In [10]:
_, eval_dataset = create_tokenized_dataset('valid.tsv')

Map:   0%|          | 0/24141 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [17]:
_, train_dataset = create_tokenized_dataset('train.tsv')
_, eval_dataset = create_tokenized_dataset('valid.tsv')
_, test_dataset = create_tokenized_dataset('test.tsv')

Map:   0%|          | 0/112841 [00:00<?, ? examples/s]

Map:   0%|          | 0/24141 [00:00<?, ? examples/s]

Map:   0%|          | 0/26414 [00:00<?, ? examples/s]

In [18]:
train_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_right_full/train")
eval_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_right_full/eval")
test_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_right_full/test")

Saving the dataset (0/2 shards):   0%|          | 0/112841 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24141 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26414 [00:00<?, ? examples/s]

## Wiki padding right

In [30]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
tokenizer.padding_side = "right"

In [31]:
_, train_dataset = create_tokenized_dataset('train.tsv', Vnews=False)
_, eval_dataset = create_tokenized_dataset('valid.tsv', Vnews=False)
_, test_dataset = create_tokenized_dataset('test.tsv', Vnews=False)

Map:   0%|          | 0/13707 [00:00<?, ? examples/s]

Map:   0%|          | 0/1957 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

In [32]:
train_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_right_wiki/train")
eval_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_right_wiki/eval")
test_dataset.save_to_disk("D:/Downloads/DS_AI/VDT/MoE/moe/data/abstract_summarization/tokenized_right_wiki/test")

Saving the dataset (0/1 shards):   0%|          | 0/13707 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1957 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3916 [00:00<?, ? examples/s]

In [34]:
len(train_dataset[0]['labels'])

1024

# Machine Translation

In [11]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
tokenizer.padding_side = "right"

In [14]:
train, train_dataset = create_tokenized_MT_dataset('train.en', 'train.vi')
test, test_dataset = create_tokenized_MT_dataset('test.en', 'test.vi')

Map:   0%|          | 0/133137 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [14]:
train[1]['inputs']

'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .'

In [15]:
train[1]['labels']

'Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .'

In [16]:
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 133137
})

In [20]:
train_eval_split = train_dataset.train_test_split(test_size=0.1, seed=42)

In [21]:
train_eval_split['train'] 

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 119823
})

In [24]:
train_eval_split['train'].save_to_disk('data/machine_translation/tokenized_right/train')
train_eval_split['test'].save_to_disk('data/machine_translation/tokenized_right/eval')
test_dataset.save_to_disk('data/machine_translation/tokenized_right/test')

Saving the dataset (0/4 shards):   0%|          | 0/119823 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/13314 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]