In [1]:
# Install PEFT with the %pip magic so the package lands in the environment of the running kernel
# (a `!pip` shell call uses whichever pip is first on PATH, which may be a different environment).
%pip install -q peft

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.13.0->peft)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvji

In [1]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import AdamW
import os
from peft import get_peft_model, LoraConfig, TaskType

2025-05-24 03:57:41.050345: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748059061.258292      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748059061.320990      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [22]:
# ==== CONFIGURATION ====
MODEL_NAME = "Salesforce/codet5p-2b"  # checkpoint id passed to from_pretrained() for both tokenizer and model
MAX_LEN = 2048  # passed as max_input_len when the datasets are built (inputs are padded/truncated to it)
BATCH_SIZE = 2  # per-step batch size of the train and validation DataLoaders
EPOCHS = 7  # upper bound on training epochs; train() may stop earlier via early stopping
LEARNING_RATE = 3e-5  # initial AdamW learning rate used in train()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, otherwise CPU
unfreeze_epoch = 2  # NOTE(review): not referenced by any cell visible here - confirm it is still needed
TOLERANCE = 3  # number of epochs without validation-F1 improvement after which train() stops early
STEP_SIZE = 1  # NOTE(review): not referenced by any cell visible here - confirm it is still needed
HIDDEN_DIM = 1024  # NOTE(review): not referenced by any cell visible here - confirm it is still needed

In [23]:
# ==== LOAD SAMPLES FROM A JSON FILE ====
def load_data(path):
    """Read a JSON file of records and turn it into ``(text, label)`` samples.

    The file must contain a JSON array of objects with the keys ``"type"``,
    ``"comment"``, ``"code"`` and ``"label"``. For every record the three text
    fields are joined with ``" </s> "`` as separator and the label is cast to
    ``int``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        list[tuple[str, int]]: One ``(text, label)`` tuple per record, in file order.

    Raises:
        KeyError: If a record lacks one of the required keys.
        ValueError: If a label cannot be converted to ``int``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which would garble or reject non-ASCII text on some systems.
    with open(path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    return [
        (
            # " </s> " separates the three fields inside the single input string.
            item["type"] + " </s> " + item["comment"] + " </s> " + item["code"],
            int(item["label"]),  # integer class id
        )
        for item in raw_data
    ]

In [24]:
from torch.utils.data import Dataset
import torch

class CodeT5Seq2SeqDataset(Dataset):
    """Dataset that tokenizes ``(text, label)`` pairs for seq2seq classification.

    The integer label is rendered as a string (e.g. ``"0"`` or ``"1"``) and
    tokenized as the target sequence.
    """

    def __init__(self, data, tokenizer, max_input_len=2048, max_label_len=1):
        """
        Args:
            data: Indexable collection of ``(text, label_int)`` tuples,
                e.g. ``[(text, 0), (text, 1), ...]``.
            tokenizer: Callable tokenizer (an ``AutoTokenizer`` in this notebook).
            max_input_len: Length the input sequence is padded/truncated to.
            max_label_len: Length the label sequence is padded/truncated to.
        """
        self.data, self.tokenizer = data, tokenizer
        self.max_input_len, self.max_label_len = max_input_len, max_label_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to fixed length ``max_length`` and return PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors of sample ``idx``."""
        source_text, label_value = self.data[idx]

        source = self._encode(source_text, self.max_input_len)
        # The target is the label written out as text.
        target = self._encode(str(label_value), self.max_label_len)

        # Drop the leading batch dimension, then mark padding positions with
        # -100 so that the loss function ignores them.
        target_ids = target.input_ids.squeeze(0)
        is_padding = target_ids == self.tokenizer.pad_token_id
        target_ids[is_padding] = -100

        return {
            "input_ids": source.input_ids.squeeze(0),
            "attention_mask": source.attention_mask.squeeze(0),
            "labels": target_ids,
        }



In [None]:
# Load the tokenizer and the seq2seq model for MODEL_NAME. The weights are requested in float16;
# trust_remote_code=True lets transformers execute model code shipped with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True)
# Use the id of the tokenizer's pad token as the decoder start token.
model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

In [17]:
# LoRA adapter configuration for the seq2seq model.
peft_config = LoraConfig(
    r=8,                      # rank of the low-rank decomposition
    lora_alpha=32,
    target_modules=["qkv_proj",
    "fc_in"], # modules that LoRA is applied to
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # task type
)

# Apply LoRA to the model.
# NOTE(review): this rebinds `model`, so re-running the cell without reloading the base model
# would pass an already wrapped model to get_peft_model() again.
model = get_peft_model(model, peft_config)


In [None]:
# When more than one CUDA device is visible, wrap the model in DataParallel.
# After wrapping, the underlying model is reachable as `model.module`.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs with DataParallel.")
    model = torch.nn.DataParallel(model)

# Move the (possibly wrapped) model to the configured device.
model = model.to(DEVICE)

In [None]:
# Load the training and validation samples as lists of (text, label) tuples.
# NOTE(review): both paths are hardcoded absolute paths - consider a configurable data directory.
data = load_data("/posthoc/post_hoc.json")
val_data = load_data("/posthoc/posthoc_valid.json")

# Wrap both splits in tokenizing datasets; labels are limited to a single token.
dataset = CodeT5Seq2SeqDataset(data, tokenizer, max_input_len=MAX_LEN, max_label_len=1)
validationset = CodeT5Seq2SeqDataset(val_data, tokenizer, max_input_len=MAX_LEN, max_label_len=1)

# Only the training loader shuffles; the validation loader keeps the dataset order.
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(validationset, batch_size=BATCH_SIZE)

In [None]:
def _parse_binary_label(text):
    """Return 0 or 1 from decoded text: its first character if that is "0" or "1", otherwise 0."""
    text = text.strip()
    return int(text[0]) if text and text[0] in "01" else 0


def _binary_classification_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for 0/1 labels with class 1 as positive.

    Any ratio whose denominator is zero is reported as 0.0.
    """
    pairs = list(zip(y_true, y_pred))
    tp = sum(1 for t, p in pairs if t == 1 and p == 1)
    fp = sum(1 for t, p in pairs if t != 1 and p == 1)
    fn = sum(1 for t, p in pairs if t == 1 and p != 1)
    correct = sum(1 for t, p in pairs if t == p)

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    return {
        "accuracy": ratio(correct, len(pairs)),
        "precision": ratio(tp, tp + fp),
        "recall": ratio(tp, tp + fn),
        "f1": ratio(2 * tp, 2 * tp + fp + fn),
    }


@torch.no_grad()
def evaluate(model, dataloader, tokenizer):
    """Generate a label for every sample in ``dataloader`` and score it against the reference.

    Args:
        model: Seq2seq model exposing ``generate()``; it may be wrapped in
            ``DataParallel`` / ``DistributedDataParallel``.
        dataloader: Iterable of batches, each a dict with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        tokenizer: Tokenizer used to decode generated and reference token ids.

    Returns:
        dict: ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"`` as floats,
        with class 1 as the positive class. All values are 0.0 for an empty
        dataloader. Text that does not start with "0" or "1" counts as class 0.
    """
    model.eval()

    # The parallel wrappers do not expose generate(); call it on the wrapped model.
    # (Previously generate() was called on both `model` and `model.module`, so one of
    # the two calls always failed and a successful batch would have been generated twice.)
    parallel_wrappers = (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)
    generator = model.module if isinstance(model, parallel_wrappers) else model

    # Batches come from the DataLoader on CPU; move them to wherever the model lives.
    first_param = next(generator.parameters(), None)
    device = first_param.device if first_param is not None else None

    all_preds = []
    all_labels = []
    for batch in dataloader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        if device is not None:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

        generated_ids = generator.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=4,
        )
        for sequence in generated_ids:
            decoded = tokenizer.decode(sequence, skip_special_tokens=True)
            all_preds.append(_parse_binary_label(decoded))

        for row in batch["labels"]:
            # -100 marks positions ignored by the loss; it is not a vocabulary id,
            # so drop it before decoding.
            token_ids = [tok for tok in row.tolist() if tok != -100]
            decoded = tokenizer.decode(token_ids, skip_special_tokens=True)
            all_labels.append(_parse_binary_label(decoded))

    # Computed in plain Python: the scikit-learn metric functions used before
    # were never imported in this notebook.
    return _binary_classification_metrics(all_labels, all_preds)


In [None]:
import random
import numpy as np

def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (all CUDA devices included, if any) with ``seed``."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# NOTE(review): this call sits below the cells that build the model and the data loaders,
# so anything random in those cells is presumably not covered by this seed - confirm.
set_seed()

In [None]:
from torch.cuda.amp import autocast, GradScaler

def train(model, train_loader, val_loader):
    """Fine-tune ``model`` with AMP and gradient accumulation, keeping the best checkpoint by F1.

    Each epoch runs over ``train_loader`` with autocast and a ``GradScaler``,
    stepping the optimizer every ``ACCUM_ITERS`` batches (and on the last batch)
    after clipping the gradient norm to 1.0. After each epoch the model is scored
    with ``evaluate()`` on ``val_loader`` using the notebook-level ``tokenizer``.
    The validation F1 drives the ``ReduceLROnPlateau`` scheduler, checkpointing
    and early stopping after ``TOLERANCE`` epochs without improvement.

    Args:
        model: Model whose forward pass returns an object with a ``loss`` attribute
            when called with ``input_ids``, ``attention_mask`` and ``labels``.
        train_loader: Sized iterable of training batches (dicts of tensors).
        val_loader: Validation batches handed to ``evaluate()``.

    Returns:
        None. The best ``state_dict`` is written to ``best_model/model.pt``.
    """
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    # mode="max": the monitored quantity (validation F1) should increase.
    scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=2, verbose=True)
    scaler = GradScaler()

    ACCUM_ITERS = 8  # batches accumulated per optimizer step
    checkpoint_path = "best_model/model.pt"
    num_batches = len(train_loader)

    best_f1 = -1.0
    patience_counter = 0
    for epoch in range(EPOCHS):
        if patience_counter >= TOLERANCE:
            print("Early stopping triggered due to no F1 improvement.")
            break

        model.train()
        epoch_loss = 0.0
        optimizer.zero_grad()

        print(f"--- Epoch {epoch+1}/{EPOCHS} ---")
        for param_group in optimizer.param_groups:
            print(f"Learning rate: {param_group['lr']}")

        for step, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                # Under DataParallel the gathered loss has one entry per replica, which
                # backward() and .item() reject; .mean() reduces it to a scalar and
                # leaves an already scalar loss unchanged.
                loss = outputs.loss.mean() / ACCUM_ITERS

            scaler.scale(loss).backward()

            if (step + 1) % ACCUM_ITERS == 0 or (step + 1) == num_batches:
                # Gradients must be unscaled before clipping so the threshold
                # applies to their true magnitude.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Undo the accumulation scaling so the logged value is the per-batch loss.
            step_loss = loss.item() * ACCUM_ITERS
            epoch_loss += step_loss

            if (step + 1) % 200 == 0:
                print(f"  Step {step+1}/{num_batches} - Loss: {step_loss:.4f}")

        # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
        avg_loss = epoch_loss / max(num_batches, 1)
        metrics = evaluate(model, val_loader, tokenizer)
        scheduler.step(metrics["f1"])

        print(f"Epoch {epoch+1} done. Avg Loss: {avg_loss:.4f}, Val Acc: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}")

        if metrics["f1"] > best_f1:
            best_f1 = metrics["f1"]
            # torch.save does not create missing directories.
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)
            print("Best model saved based on F1.")
            patience_counter = 0
        else:
            patience_counter += 1
            print(f"No F1 improvement. Patience: {patience_counter}/{TOLERANCE}")

In [26]:
# Run the fine-tuning loop on the training loader, validating on val_loader after every epoch.
train(model, train_loader, val_loader)

--- Epoch 1/14 ---
Learning rate: 3e-05


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


OutOfMemoryError: CUDA out of memory. Tried to allocate 52.00 MiB. GPU 1 has a total capacity of 14.74 GiB of which 6.12 MiB is free. Process 4099 has 14.73 GiB memory in use. Of the allocated memory 13.90 GiB is allocated by PyTorch, and 651.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)