# I. Load model and first test

In [1]:
import torch

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
cache_dir = 'google/flan-t5-xl'

# III. Fine tuning

In [4]:
%uv pip install transformers datasets peft accelerate sentencepiece bitsandbytes


[2mUsing Python 3.10.19 environment at: /usr/local[0m
[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m[2K[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mtransformers==4.57.3                                                          [0m[2K[37m⠙[0m [2mdatasets==4.4.2                                                               [0m[2K[37m⠙[0m [2mpeft==0.18.0                                                                  [0m[2K[37m⠙[0m [2maccelerate==1.12.0                                                            [0m[2K[37m⠙[0m [2msentencepiece==0.2.1                                                          [0m[2K[37m⠙[0m [2mbitsandbyt

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import torch

model_name = "google/flan-t5-xl"

# 1. Dùng T5TokenizerFast nếu có thể để tăng tốc xử lý
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Load model với định dạng float16 (giảm 1/2 VRAM so với mặc định)
# Hoặc dùng load_in_8bit=True nếu máy yếu
model = T5ForConditionalGeneration.from_pretrained(
    model_name, 
    device_map="auto", 
    torch_dtype=torch.float16 
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 2 files:   0%|                                                                  | 0/2 [00:00<?, ?it/s]Fetching 2 files:  50%|█████████████████████████████                             | 1/2 [00:44<00:44, 44.81s/it]Fetching 2 files: 100%|██████████████████████████████████████████████████████████| 2/2 [00:44<00:00, 22.41s/it]
Loading checkpoint shards:   0%|                                                         | 0/2 [00:00<?, ?it/s]Loading checkpoint shards:  50%|████████████████████████▌                        | 1/2 [00:01<00:01,  1.86s/it]Loading checkpoint shards: 100%|█████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.00s/it]Loading checkpoint shards: 100%|█████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.13s/it]


In [6]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,                 # rank của ma trận thấp (càng cao → càng chính xác, nhưng tốn VRAM hơn)
    lora_alpha=32,       # hệ số điều chỉnh
    target_modules=["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
    lora_dropout=0.05,   # tránh overfitting
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 35,389,440 || all params: 2,885,146,624 || trainable%: 1.2266


### Dataset

In [8]:
from datasets import load_dataset

data = load_dataset('json', data_files ='filtered_new_data.json', field = None)

Generating train split: 0 examples [00:00, ? examples/s]Generating train split: 8752 examples [00:00, 18778.89 examples/s]Generating train split: 8752 examples [00:00, 18718.84 examples/s]


In [9]:
data

DatasetDict({
    train: Dataset({
        features: ['Document', 'Summary'],
        num_rows: 8752
    })
})

In [10]:
train_data = data['train']
train_data

Dataset({
    features: ['Document', 'Summary'],
    num_rows: 8752
})

In [11]:
max_input_length = 1024
max_target_length = 512

def preprocess_function(examples):
    # Thêm prompt để model hiểu nhiệm vụ
    inputs = ["summarize: " + doc for doc in examples["Document"]] 
    targets = examples["Summary"]
    
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



tokenized_datasets = train_data.map(
    preprocess_function, 
    batched=True,
    remove_columns=data['train'].column_names 
)
# train_test_split tokenized_datasets
tokenized_train = tokenized_datasets.train_test_split(test_size=0.1, shuffle=True, seed = 42)
tokenized_eval = tokenized_train['test']
print(len(tokenized_train['train']))
print(len(tokenized_eval))

Map:   0%|                                                                     | 0/8752 [00:00<?, ? examples/s]Map:  11%|██████▍                                                 | 1000/8752 [00:00<00:05, 1367.18 examples/s]Map:  23%|████████████▊                                           | 2000/8752 [00:01<00:04, 1639.30 examples/s]Map:  34%|███████████████████▏                                    | 3000/8752 [00:01<00:03, 1776.62 examples/s]Map:  46%|█████████████████████████▌                              | 4000/8752 [00:02<00:02, 1811.23 examples/s]Map:  57%|███████████████████████████████▉                        | 5000/8752 [00:02<00:02, 1838.30 examples/s]Map:  69%|██████████████████████████████████████▍                 | 6000/8752 [00:03<00:01, 1803.41 examples/s]Map:  80%|████████████████████████████████████████████▊           | 7000/8752 [00:03<00:00, 1785.12 examples/s]Map:  91%|███████████████████████████████████████████████████▏    | 8000/8752 [00:04<00:00, 1771.84 exa

7876
876





In [12]:
print(len(tokenized_train['train']))
print(len(tokenized_eval))

7876
876


In [13]:
tokenized_eval

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 876
})

In [18]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="gg_flan_t5_xl",
    num_train_epochs=4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,  # nếu GPU nhỏ
    save_steps = 200,
    logging_steps=100,
    learning_rate=1e-4,             # LR cao hơn fine-tune full
    per_device_eval_batch_size=4,
    logging_dir = './logs',
    # logging_strategy='steps',
    # eval_strategy="epoch",
    # save_strategy="epoch",
    fp16=False,
    bf16=True,
    remove_unused_columns=False,
    # load_best_model_at_end=True,
    # save_total_limit=2,
    # report_to="none"
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train['train'],
    eval_dataset=tokenized_eval,
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
100,1.2751
200,1.1506
300,1.1096
400,1.0612
500,1.0256
600,0.9866
700,0.974
800,0.9771
900,0.9501
1000,0.9266


TrainOutput(global_step=7876, training_loss=0.8384501809332805, metrics={'train_runtime': 9894.6099, 'train_samples_per_second': 3.184, 'train_steps_per_second': 0.796, 'total_flos': 5.391919530665902e+17, 'train_loss': 0.8384501809332805, 'epoch': 4.0})

In [19]:
model.save_pretrained("lora_flan_t5_xl_adapter")


In [20]:
model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 2048)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 2048)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2048, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=2048, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
      

In [24]:
num_samples = 5
test_samples = tokenized_eval.select(range(num_samples))

print(f"--- PREDICTION RESULTS (First {num_samples} samples) ---")

for i, sample in enumerate(test_samples):
    # Prepare input: Convert list to tensor and move to GPU
    input_ids = torch.tensor([sample['input_ids']]).to(device)
    
    # 3. Generate summary
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=164,      # Maximum length of generated text
            num_beams=4,             # Beam search for better quality
            repetition_penalty=2.5,  # Penalize repetition
            length_penalty=1.0,
            early_stopping=True
        )

    # 4. Decode tokens back to text
    # Input text
    input_text = tokenizer.decode(sample['input_ids'])
    
    # Generated summary
    generated_text = tokenizer.decode(outputs[0])

    # Actual/Ground Truth summary
    reference_text = tokenizer.decode(sample['labels'])

    # 5. Print results
    print(f"\nExample {i+1}:")
    print(f"Document (truncated): {input_text}")
    print(f"Generated Summary:    {generated_text}")
    print(f"Actual Summary:       {reference_text}")
    print("-" * 50)

--- PREDICTION RESULTS (First 5 samples) ---

Example 1:
Document (truncated): summarize: Ngày 7-6 , ông H<unk> Qu<unk>c D<unk>ng – ch<unk> t<unk>ch UBND t<unk>nh B<unk>nh <unk>nh – cho bi<unk>t Ban cán s<unk> <unk>ng UBND t<unk>nh v<unk>a h<unk>p v<unk> tr<unk>ng h<unk>p c<unk>a ông Tr<unk>ng H<unk>i <unk>n - phó giám <unk>c S<unk> Lao <unk>ng - th<unk>ng binh và x<unk> h<unk>i t<unk>nh B<unk>nh <unk>nh .<unk>ng D<unk>ng nói <unk> k<unk> quy<unk>t <unk>nh thành l<unk>p h<unk>i <unk>ng xem xét k<unk> lu<unk>t <unk>i v<unk>i nh<unk>ng sai ph<unk>m c<unk>a ông <unk>n ." M<unk>t t<unk> công tác c<unk>a h<unk>i <unk>ng k<unk> lu<unk>t s<unk> xác minh <unk>y <unk> , khách quan vi<unk>c ông <unk>n xin ngh<unk> <unk>m <unk> <unk>i <unk>i<unk>u tr<unk> b<unk>nh h<unk>ng tháng t<unk>i các b<unk>nh vi<unk>n <unk> TP. HCM nh<unk> th<unk> nào <unk> có h<unk>ng x<unk> l<unk> <unk>ng theo quy <unk>nh " – ông D<unk>ng cho bi<unk>t .Theo ông Nguy<unk>n M<unk> Quang - giám <unk>c S<unk> Lao <unk>ng - t