# Replace Linear -> tanh -> Linear (2 outputs)

**Import libraries**

In [1]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_
from transformers import RobertaTokenizer, T5EncoderModel, AutoTokenizer
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import AdamW
import os

2025-05-23 16:46:34.712274: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748018794.884508      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748018794.937298      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# ==== CONFIGURATION ====
# Tunable constants read by the model, dataset, and training cells below.
MODEL_NAME = "Salesforce/codet5p-2b"  # checkpoint name passed to AutoTokenizer and T5Classifier
MAX_LEN = 2048  # tokenizer max_length; every sample is padded/truncated to this length
BATCH_SIZE = 2  # batch size used by every DataLoader in this notebook
EPOCHS = 14  # upper bound on training epochs (early stopping may end training sooner)
LEARNING_RATE = 3e-5  # initial AdamW learning rate used by train()
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
unfreeze_epoch = 2  # zero-based epoch index at which train() unfreezes the encoder
TOLERANCE = 6  # early-stopping patience: consecutive epochs without a validation-F1 improvement
STEP_SIZE = 2  # NOTE(review): not referenced anywhere in the visible notebook
HIDDEN_DIM = 1024  # default hidden width of the classification head

In [3]:
class T5ClassificationHead(nn.Module):
    """Two-layer MLP head that maps a pooled feature vector to class logits.

    Computation: dropout -> Linear(input_dim, hidden_dim) -> tanh -> dropout
    -> Linear(hidden_dim, num_labels). A single ``nn.Dropout`` module is
    applied at both dropout positions.

    Args:
        input_dim: Size of the incoming feature vector.
        hidden_dim: Width of the intermediate layer.
        dropout_prob: Dropout probability used at both dropout positions.
        num_labels: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim=HIDDEN_DIM, dropout_prob=0.1, num_labels=2):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys.
        self.dense = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(hidden_dim, num_labels)

    def forward(self, x):
        """Return logits with ``num_labels`` entries along the last dimension."""
        hidden = self.dense(self.dropout(x)).tanh()
        return self.out_proj(self.dropout(hidden))

In [4]:
class T5Classifier(nn.Module):
    """T5 encoder followed by a ``T5ClassificationHead``.

    The encoder is loaded with ``T5EncoderModel.from_pretrained`` and is
    frozen on construction; call ``unfreeze_encoder`` to make it trainable.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        hidden_dim: Hidden width of the classification head.
        num_labels: Number of output logits.
    """

    def __init__(self, model_name="Salesforce/codet5-small", hidden_dim=HIDDEN_DIM, num_labels=2):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        # The head's input width follows the encoder's model dimension.
        encoder_width = self.encoder.config.d_model
        self.classification_head = T5ClassificationHead(
            input_dim=encoder_width,
            hidden_dim=hidden_dim,
            num_labels=num_labels,
        )
        self.freeze_encoder()

    def freeze_encoder(self):
        """Disable gradients for every encoder parameter."""
        self.encoder.requires_grad_(False)

    def unfreeze_encoder(self):
        """Enable gradients for every encoder parameter."""
        self.encoder.requires_grad_(True)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and classify from the first token's hidden state.

        Returns:
            Logits produced by the classification head, one row per sample.
        """
        token_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state  # [batch_size, seq_len, d_model]

        # Position 0 is used as the sequence summary. The original author
        # labels it the `<s>` token; that depends on the tokenizer -- confirm.
        first_token_state = token_states[:, 0, :]

        return self.classification_head(first_token_state)


In [5]:
# ==== READ DATA FROM A post-hoc JSON FILE ====
def load_data(path):
    """Load classification samples from a JSON file.

    The file must contain a JSON array of objects with the keys ``"type"``,
    ``"comment"``, ``"code"`` and ``"label"``.

    Args:
        path: Path to the JSON file.

    Returns:
        A list of ``(text, label)`` tuples, where ``text`` is
        ``type + " </s> " + comment + " </s> " + code`` and ``label`` is an int.

    Raises:
        KeyError: If an item is missing one of the required keys.
        ValueError: If a label cannot be converted to ``int``.
    """
    # Decode explicitly as UTF-8 so comments/code containing non-ASCII text
    # load the same way regardless of the platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    # Fields are joined with the literal " </s> " separator (for the auto tokenizer).
    return [
        (
            item["type"] + " </s> " + item["comment"] + " </s> " + item["code"],
            int(item["label"]),  # class index stored as an int
        )
        for item in raw_data
    ]


In [6]:
# ==== DATASET CLASS FOR BINARY CLASSIFICATION ====
class CodeT5Dataset(Dataset):
    """Tokenizes ``(text, label)`` samples on access.

    Args:
        data: Indexable collection of ``(text, label)`` pairs.
        tokenizer: Callable accepting ``text`` plus the keyword arguments
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``,
            and returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every sample is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data, self.tokenizer, self.max_len = data, tokenizer, max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict of ``input_ids``, ``attention_mask``, ``labels``."""
        text, label = self.data[idx]

        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading batch axis of size 1; drop it
        # so the DataLoader can stack samples itself.
        sample = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        # The label is stored as a float scalar tensor (0.0 or 1.0).
        sample["labels"] = torch.tensor(label, dtype=torch.float)
        return sample

In [7]:
# Load data, tokenizer, model
# Training and validation samples, each a list of (text, label) tuples.
# NOTE(review): these paths are relative to the working directory, while the
# test-set cell uses absolute "/posthoc/..." paths -- confirm which is intended.
data = load_data("./posthoc/post_hoc.json")
val_data = load_data("./posthoc/posthoc_valid.json")

# Tokenizer and classifier are both built from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = T5Classifier(MODEL_NAME).to(DEVICE)


dataset = CodeT5Dataset(data, tokenizer, MAX_LEN)
validationset = CodeT5Dataset(val_data, tokenizer, MAX_LEN)

# Only the training loader shuffles; validation keeps file order.
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(validationset, batch_size=BATCH_SIZE)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/446M [00:00<?, ?B/s]

In [8]:
# Wrap the model in nn.DataParallel when more than one GPU is visible.
# After wrapping, the underlying T5Classifier is reachable as `model.module`;
# with zero or one GPU, `model` stays a plain T5Classifier.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

model = model.to(DEVICE)

Using 2 GPUs!


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

@torch.no_grad()
def evaluate(model, dataloader):
    """Compute binary classification metrics for ``model`` over ``dataloader``.

    Puts the model in eval mode (it is not switched back afterwards), predicts
    the arg-max class for every sample and scores the predictions with
    scikit-learn.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            logits of shape ``[B, num_labels]``.
        dataloader: Iterable of batches with the keys ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use ``zero_division=0``.
    """
    model.eval()
    all_preds = []
    all_labels = []

    for batch in dataloader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)

        logits = model(input_ids, attention_mask)  # shape: [B, 2]
        preds = torch.argmax(logits, dim=-1)

        # Accumulate plain Python ints. Labels never need to visit the GPU and
        # are cast to integer class ids so they match the prediction dtype.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch["labels"].long().tolist())

    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [14]:
def train(save_path="/kaggle/working/model.pt"):
    """Train the global ``model`` on ``train_loader`` with early stopping on validation F1.

    Uses the notebook-level globals ``model``, ``train_loader``, ``val_loader``,
    ``EPOCHS``, ``LEARNING_RATE``, ``TOLERANCE``, ``unfreeze_epoch`` and ``DEVICE``.

    Training scheme:
      * AdamW + ``ReduceLROnPlateau`` driven by validation F1 (``mode="max"``).
      * CUDA mixed precision via ``torch.amp`` with gradient accumulation over
        ``ACCUM_ITERS`` micro-batches and gradient-norm clipping at 1.0.
      * At epoch index ``unfreeze_epoch`` the encoder is unfrozen and the
        optimizer/scheduler are re-created with a learning rate of 1e-5.
      * After each epoch the model is evaluated on ``val_loader``; the weights
        are saved whenever F1 improves, and training stops once ``TOLERANCE``
        consecutive epochs bring no improvement.

    Args:
        save_path: Where the best ``model.state_dict()`` is written. The default
            matches the path the notebook later reloads the checkpoint from.

    Returns:
        None. Progress and metrics are printed.
    """
    # Optimizer & loss
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=2, verbose=True)
    criterion = nn.CrossEntropyLoss()
    # torch.amp.* replaces the deprecated torch.cuda.amp.* entry points.
    scaler = torch.amp.GradScaler("cuda")

    best_f1 = -1.0
    ACCUM_ITERS = 8  # micro-batches accumulated per optimizer step
    # Make sure the checkpoint's own directory exists before the first save.
    checkpoint_dir = os.path.dirname(save_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    patience_counter = 0
    for epoch in range(EPOCHS):
        if patience_counter >= TOLERANCE:
            print("Early stopping triggered due to no F1 improvement.")
            break
        if epoch == unfreeze_epoch:
            print(f"Unfreezing encoder at epoch {epoch}")
            # `model` is only wrapped in DataParallel when several GPUs are
            # present; unwrap conditionally so single-GPU/CPU runs do not fail.
            core_model = model.module if isinstance(model, nn.DataParallel) else model
            core_model.unfreeze_encoder()
            # Fresh optimizer/scheduler with a lower LR for full fine-tuning.
            optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
            scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=2, verbose=True)

        model.train()
        epoch_loss = 0.0

        print(f"--- Epoch {epoch+1}/{EPOCHS} ---")
        for param_group in optimizer.param_groups:
            print(f"[Epoch {epoch+1}] Learning rate: {param_group['lr']}")
        for step, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            # CrossEntropyLoss expects integer class indices.
            labels = batch["labels"].to(DEVICE).long()

            with torch.amp.autocast("cuda"):
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                # Scale down so the accumulated gradient averages the micro-batches.
                loss = loss / ACCUM_ITERS

            scaler.scale(loss).backward()

            # Step every ACCUM_ITERS micro-batches, and on the final batch so
            # leftover gradients are never carried into the next epoch.
            if (step + 1) % ACCUM_ITERS == 0 or (step + 1) == len(train_loader):
                # Unscale first so clipping operates on the true gradient norm.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            epoch_loss += loss.item() * ACCUM_ITERS  # undo the accumulation scaling for logging

            if (step + 1) % 200 == 0:
                print(f"  Step {step+1}/{len(train_loader)} - Loss: {loss.item() * ACCUM_ITERS:.4f}")

        avg_loss = epoch_loss / len(train_loader)
        metrics = evaluate(model, val_loader)
        f1 = metrics["f1"]

        scheduler.step(f1)

        print(f"Epoch {epoch+1} completed. Avg Loss: {avg_loss:.4f}, "
              f"Val Accuracy: {metrics['accuracy']:.4f}, "
              f"Precision: {metrics['precision']:.4f}, "
              f"Recall: {metrics['recall']:.4f}, "
              f"F1: {f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            patience_counter = 0
            # Saved from `model` itself (not the unwrapped module) so the keys
            # match what `model.load_state_dict` expects later in the notebook.
            torch.save(model.state_dict(), save_path)
            print("Best model saved based on F1.")
        else:
            patience_counter += 1
            print(f"No F1 improvement. Patience: {patience_counter}/{TOLERANCE}")

In [15]:
# Run training; progress, per-epoch validation metrics and checkpoint
# messages are printed as the loop advances.
train()

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


--- Epoch 1/1 ---
[Epoch 1] Learning rate: 3e-05
  Step 200/4124 - Loss: 0.6715
  Step 400/4124 - Loss: 0.6994
  Step 600/4124 - Loss: 0.6605
  Step 800/4124 - Loss: 0.7240
  Step 1000/4124 - Loss: 0.6883
  Step 1200/4124 - Loss: 0.7313
  Step 1400/4124 - Loss: 0.6664
  Step 1600/4124 - Loss: 0.6051
  Step 1800/4124 - Loss: 0.6999
  Step 2000/4124 - Loss: 0.6400
  Step 2200/4124 - Loss: 0.7388
  Step 2400/4124 - Loss: 0.6345
  Step 2600/4124 - Loss: 0.6362
  Step 2800/4124 - Loss: 0.7048
  Step 3000/4124 - Loss: 0.6569
  Step 3200/4124 - Loss: 0.5881
  Step 3400/4124 - Loss: 0.6606
  Step 3600/4124 - Loss: 0.6603
  Step 3800/4124 - Loss: 0.7172
  Step 4000/4124 - Loss: 0.7535
Epoch 1 completed. Avg Loss: 0.6775, Val Accuracy: 0.6089, Precision: 0.5732, Recall: 0.8530, F1: 0.6856
Best model saved based on F1.


In [16]:
# Restore saved weights into `model` before evaluating on the test sets.
# NOTE(review): this path must be the same location train() writes its best
# checkpoint to; the state_dict keys must also match how `model` is wrapped
# (DataParallel or not) at load time.
model.load_state_dict(torch.load("/kaggle/working/model.pt", map_location="cuda" if torch.cuda.is_available() else "cpu"))

<All keys matched successfully>

In [17]:
# Build loaders for the two held-out sets: the regular test set and the
# "clean" test set.
# NOTE(review): these paths are absolute ("/posthoc/..."), whereas the
# train/validation files are loaded from "./posthoc/..." -- confirm.
test = load_data("/posthoc/posthoc_test.json")
clean_test = load_data("/posthoc/posthoc_clean_test.json")

test_set = CodeT5Dataset(test, tokenizer, MAX_LEN)
test_loader = DataLoader(test_set,batch_size=BATCH_SIZE)


clean_test_set = CodeT5Dataset(clean_test, tokenizer, MAX_LEN)
clean_test_loader = DataLoader(clean_test_set,batch_size=BATCH_SIZE)

In [18]:
import json

# (printed label, key in the dict returned by evaluate), in display order.
_REPORT_ROWS = (
    ("Accuracy:", "accuracy"),
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1 Score:", "f1"),
)

# Evaluate on test_loader
test_result = evaluate(model, test_loader)
print("=== Test Set Metrics ===")
for _label, _key in _REPORT_ROWS:
    print(_label, test_result[_key])

# Evaluate on clean_test_loader
clean_result = evaluate(model, clean_test_loader)
print("\n=== Clean Test Set Metrics ===")
for _label, _key in _REPORT_ROWS:
    print(_label, clean_result[_key])

=== Test Set Metrics ===
Accuracy: 0.6272819472616633
Precision: 0.5898997134670487
Recall: 0.8351926977687627
F1 Score: 0.6914357682619647

=== Clean Test Set Metrics ===
Accuracy: 0.58
Precision: 0.5606060606060606
Recall: 0.74
F1 Score: 0.6379310344827586
