# Initial Setup

%%capture
# Install the Python packages used by this notebook and clone the VnCoreNLP
# repository (presumably the source of the segmenter JAR referenced later).
# %%capture hides the installer output.
!pip install -q datasets
!pip install -q transformers
!pip install -q emoji
!pip install -q vncorenlp
!git clone https://github.com/vncorenlp/VnCoreNLP.git
!pip install wandb

In [None]:
%%capture
# Download a zip snapshot of the ABSA-Vietnamese repository (main branch).
!wget https://github.com/Savoxism/ABSA-Vietnamese/archive/refs/heads/main.zip -O ABSA-Vietnamese.zip

In [None]:
# Extract the downloaded archive into /kaggle/working/ and list the extracted project folder.
!unzip ABSA-Vietnamese.zip -d /kaggle/working/
!ls /kaggle/working/ABSA-Vietnamese-main

In [None]:
# Work from inside the extracted project; presumably this is what makes the
# local `preprocess` module importable in the next cell — confirm.
%cd /kaggle/working/ABSA-Vietnamese-main

In [None]:
from nltk import flatten
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from vncorenlp import VnCoreNLP

import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt

from huggingface_hub import login
import wandb
from transformers import AutoTokenizer, AutoModel, AutoConfig, Trainer, TrainingArguments

from preprocess import (
    remove_HTML, 
    convert_unicode, 
    standardize_sentence_typing, 
    normalize_acronym, 
    remove_unnecessary_characters,
)

In [None]:
# Location of the VnCoreNLP JAR used for Vietnamese word segmentation.
ANNOTATOR_PATH = "/kaggle/working/VnCoreNLP/VnCoreNLP-1.1.1.jar"  # Change to correct path

# Fail early with an explicit message when the JAR is not where we expect it.
if not os.path.exists(ANNOTATOR_PATH):
    raise FileNotFoundError(f"VnCoreNLP JAR file not found at {ANNOTATOR_PATH}")

# Single shared annotator instance; word_segmentation() below relies on it.
annotator = VnCoreNLP(ANNOTATOR_PATH)

def word_segmentation(text):
    """Segment *text* with the shared VnCoreNLP annotator.

    The annotator's (possibly nested) token lists are flattened and the
    tokens are joined back into a single space-separated string.
    """
    segmented = annotator.tokenize(text)
    tokens = flatten(segmented)
    return ' '.join(tokens)

def text_preprocess(text):
    """Run the full cleaning pipeline on a raw review string.

    The steps are applied strictly in the order listed below; word
    segmentation happens before the final character clean-up. The result
    is returned with its casing unchanged (no lower-casing is applied).
    """
    pipeline = (
        remove_HTML,
        convert_unicode,
        standardize_sentence_typing,
        normalize_acronym,
        word_segmentation,
        remove_unnecessary_characters,
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Report whether PyTorch can see a CUDA device, and which one.
if not torch.cuda.is_available():
    print("GPU not available. Please enable GPU in Kaggle settings.")
else:
    print("GPU is available!")
    print("Device:", torch.cuda.get_device_name(0))

In [None]:
# Read the Hugging Face token from Kaggle's secret store and log in with it,
# so the token never appears in the notebook itself.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_AUTH_TOKEN")
login(token=hf_token)

In [None]:
# Fetch the Weights & Biases API key from the same Kaggle secrets client
# (`user_secrets` is created in the previous cell).
wb_token = user_secrets.get_secret("wandb_api_key")

# Authenticate and open a W&B run for this training session.
wandb.login(key=wb_token)
run = wandb.init(
    project='ABSA-Vietnamese', 
    job_type="training", 
    anonymous="allow"
)

# Exploratory Data Analysis

In [None]:
# CSV files for the train / dev / test splits of the VLSP2018 hotel dataset.
TRAIN_PATH = "/kaggle/input/vlsp2018-hotel/1-VLSP2018-SA-Hotel-train.csv"
VAL_PATH = "/kaggle/input/vlsp2018-hotel/2-VLSP2018-SA-Hotel-dev.csv"
TEST_PATH = "/kaggle/input/vlsp2018-hotel/3-VLSP2018-SA-Hotel-test.csv"

In [None]:
# Load the three CSV files as the splits 'train', 'val' and 'test'.
raw_datasets = load_dataset('csv', data_files={'train': TRAIN_PATH, 'val': VAL_PATH, 'test': TEST_PATH})
# Display the loaded splits.
raw_datasets

In [None]:
# Checkpoint whose tokenizer is used for all text encoding below.
PRETRAINED_MODEL = 'vinai/phobert-base'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
# Print the tokenizer's maximum input length and the input names it produces.
print("Max Model Input Size:", tokenizer.model_max_length)
print(tokenizer.model_input_names)

In [None]:
# Display the tokenizer object for inspection.
tokenizer

In [None]:
# Round-trip a sample sentence through the tokenizer.
# Unsegmented variant, kept for comparison:
# tokens = tokenizer.encode('Tôi là sinh viên trường đại học Công nghệ thông tin .') 
tokens = tokenizer.encode('Tôi là sinh_viên trường đại_học Công_nghệ thông_tin .') # Word-segmented form (syllables joined with "_"), used when working with PhoBERT
print('Encode:', tokens)
print('Decode:', tokenizer.decode(tokens))

In [None]:
# Treat every training column that is neither the review text nor a
# tokenizer output field as an aspect-label column.
label_keys = [
    column
    for column in raw_datasets['train'].column_names
    if column not in {"Review", "input_ids", "token_type_ids", "attention_mask"}
]

In [None]:
# Show the aspect-label column names.
label_keys

In [None]:
# Number of aspect-label columns (one classifier head is created per column later).
len(label_keys)

In [None]:
def encode_sentiment(sentiment):
    """Return the 4-element one-hot list for a sentiment code.

    Codes: 0 = None, 1 = Positive, 2 = Negative, 3 = Neutral. Any value
    that is not one of these codes is encoded as "None".
    """
    one_hot_by_code = {
        0: [1, 0, 0, 0],  # None
        1: [0, 1, 0, 0],  # Positive
        2: [0, 0, 1, 0],  # Negative
        3: [0, 0, 0, 1],  # Neutral
    }
    try:
        return one_hot_by_code[sentiment]
    except KeyError:
        # Unknown code: fall back to the "None" class.
        return [1, 0, 0, 0]

def clean_text(review):
    """Preprocess *review* when it is a string; otherwise return ""."""
    if not isinstance(review, str):
        # Non-string cells (e.g. missing values) become an empty review.
        return ""
    return text_preprocess(review)

In [None]:
# Look at the first raw training row.
example = raw_datasets["train"][0]
example

In [None]:
# Step 1: Clean text
# Run the cleaning pipeline on that single review to eyeball the result.
cleaned_text_result = clean_text(example["Review"])
print("Cleaned Text:", cleaned_text_result)

In [None]:
def clean_review_text(example):
    """Replace the row's "Review" field with its cleaned text and return the row."""
    cleaned_review = clean_text(example["Review"])
    example["Review"] = cleaned_review
    return example

# Apply the cleaning function to all dataset splits (train, val, test)
# Rows are processed one at a time (batched=False) because
# clean_review_text works on a single example.
cleaned_datasets = raw_datasets.map(clean_review_text, batched=False)

In [None]:
# Inspect the first cleaned training row.
cleaned_datasets['train'][0]

In [None]:
def tokenize_cleaned_example(example):
    """Tokenize one cleaned review and attach its one-hot aspect labels.

    The review is padded/truncated to 256 tokens. For every column in
    ``label_keys`` the sentiment code (0 when the column is absent) is
    one-hot encoded, giving a ``labels`` array with one 4-element row per
    aspect.
    """
    encoding = tokenizer(
        example["Review"],  # text has already been cleaned upstream
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    one_hot_rows = []
    for aspect in label_keys:
        one_hot_rows.append(encode_sentiment(example.get(aspect, 0)))

    # Shape: (len(label_keys), 4)
    encoding["labels"] = np.array(one_hot_rows, dtype=np.int64)
    return encoding

In [None]:
# Apply tokenization to the cleaned dataset
# One row at a time, since tokenize_cleaned_example expects a single example.
tokenized_dataset = cleaned_datasets.map(tokenize_cleaned_example, batched=False)

In [None]:
# Inspect the first tokenized training row.
tokenized_dataset['train'][0]

In [None]:
# Select only necessary columns
# Everything except the model inputs and labels is dropped; the list of
# columns to remove is derived from the 'train' split's column names.
columns_to_keep = ["input_ids", "attention_mask", "labels"]
tokenized_dataset = tokenized_dataset.remove_columns(
    [col for col in tokenized_dataset["train"].column_names if col not in columns_to_keep]
)

# Verify the new dataset structure
print(tokenized_dataset)

In [None]:
# Show the encoded labels of the first training row.
tokenized_dataset['train'][0]['labels']

# Data Loader

In [None]:
# Sanity-check one tokenized training row before building the DataLoaders.
example = tokenized_dataset["train"][0] 
print("Keys in example:", example.keys())  # Should include 'labels'
print("Input IDs:", example["input_ids"][:10])  
print("Labels shape:", len(example["labels"]), "x", len(example["labels"][0]))  # Expected (len(label_keys), 4); the original note assumed 34 aspects
print("Labels:", example["labels"])  # One 4-element one-hot list per aspect

In [None]:
def collate_fn(batch):
    """Collate a list of dataset rows into a dict of batched tensors.

    Returns a dict with ``input_ids`` and ``attention_mask`` as long
    tensors of shape (batch_size, seq_length) and ``labels`` as a float
    tensor of shape (batch_size, num_aspects, 4).
    """

    def _stack(field, dtype):
        # Gather one field from every row and turn it into a single tensor.
        return torch.tensor([row[field] for row in batch], dtype=dtype)

    return {
        "input_ids": _stack("input_ids", torch.long),
        "attention_mask": _stack("attention_mask", torch.long),
        "labels": _stack("labels", torch.float),
    }

# Create DataLoaders for training and evaluation
# Both use collate_fn to build tensors; only the training loader shuffles.
# PhoBERTTrainer returns these exact loaders from its dataloader hooks.
train_loader = DataLoader(tokenized_dataset["train"], batch_size=8, collate_fn=collate_fn, shuffle=True)
eval_loader = DataLoader(tokenized_dataset["val"], batch_size=8, collate_fn=collate_fn)

In [None]:
# Pull one batch from the training loader and check its structure.
batch = next(iter(train_loader))

print("Batch keys:", batch.keys())  # Should include 'labels'
print("Input IDs shape:", batch["input_ids"].shape)  # Expected (batch_size, sequence_length)
print("Labels shape:", batch["labels"].shape)  # Expected (batch_size, len(label_keys), 4)

# Initialize Model

In [None]:
class PhoBERTMultiAspectModel(nn.Module):
    """Pretrained encoder with one linear sentiment head per aspect.

    The first-token vectors of the encoder's last four hidden layers are
    concatenated, passed through dropout and fed to ``num_aspects``
    independent linear classifiers.

    Args:
        model_name: Hugging Face checkpoint name or path for the encoder.
        num_aspects: Number of aspect heads to create.
        num_classes: Number of logits per aspect (default 4, matching the
            None/Positive/Negative/Neutral encoding used in this notebook).
        dropout: Dropout probability applied to the pooled vector.
    """

    def __init__(self, model_name, num_aspects, num_classes=4, dropout=0.2):
        super().__init__()

        # output_hidden_states=True is required: forward() pools from the
        # per-layer hidden states rather than from the final output only.
        self.bert = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        self.config = AutoConfig.from_pretrained(model_name)
        # Four layers are concatenated in forward(), hence 4 x hidden size.
        self.hidden_size = self.bert.config.hidden_size * 4

        # One independent classifier per aspect.
        self.aspect_classifiers = nn.ModuleList([
            nn.Linear(self.hidden_size, num_classes) for _ in range(num_aspects)
        ])

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Return logits of shape (batch_size, num_aspects, num_classes).

        ``labels`` is accepted so that a batch dict containing labels can be
        splatted into the call, but it is not used here; the loss is
        computed by the caller.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Sequence of per-layer tensors, each (batch_size, seq_len, hidden_dim).
        hidden_states = outputs.hidden_states

        # Concatenate the first-token vector of the last four layers,
        # ordered from the last layer backwards.
        pooled_output = torch.cat(
            [layer[:, 0, :] for layer in reversed(hidden_states[-4:])],
            dim=-1,
        )
        pooled_output = self.dropout(pooled_output)

        # Apply every aspect head to the same pooled representation.
        aspect_outputs = [classifier(pooled_output) for classifier in self.aspect_classifiers]

        # Stack along a new aspect axis: (batch_size, num_aspects, num_classes)
        return torch.stack(aspect_outputs, dim=1)

In [None]:
class PhoBERTTrainer(Trainer):
    """Trainer that uses the notebook's pre-built DataLoaders and a per-aspect loss.

    The dataloader hooks return the module-level ``train_loader`` and
    ``eval_loader``, so the ``train_dataset``/``eval_dataset`` arguments of
    the Trainer are not used for batching.
    """

    def get_train_dataloader(self):
        """Return the custom training DataLoader (its batches include 'labels')."""
        print("✅ Using custom train DataLoader")
        return train_loader

    def get_eval_dataloader(self, eval_dataset=None):
        """Forces Trainer to use the custom eval DataLoader."""
        print("✅ Using custom eval DataLoader")
        return eval_loader

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Cross-entropy over all (sample, aspect) pairs.

        Args:
            model: Model returning logits of shape
                (batch_size, num_aspects, num_classes).
            inputs: Batch dict; must contain one-hot ``labels`` of the same
                shape as the logits.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra arguments passed by Trainer (ignored).

        Returns:
            The scalar loss, or ``(loss, {"logits": logits})`` when
            ``return_outputs`` is true.

        Raises:
            ValueError: If the batch has no ``labels`` entry.
        """
        if "labels" not in inputs:
            raise ValueError(f"🚨 Missing 'labels' in inputs. Available keys: {inputs.keys()}")

        labels = inputs["labels"]
        logits = model(**inputs)  # (batch_size, num_aspects, num_classes)

        # Convert one-hot encoded labels to class indices: (batch_size, num_aspects)
        targets = torch.argmax(labels, dim=-1)

        # Flatten the batch and aspect axes so every (sample, aspect) pair is
        # one classification example. The class count is read from the
        # logits instead of being hard-coded.
        num_classes = logits.size(-1)
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(logits.reshape(-1, num_classes), targets.reshape(-1))

        if return_outputs:
            # Trainer.prediction_step slices non-dict outputs with
            # ``outputs[1:]`` (to drop a leading loss element). On a bare
            # logits tensor that would drop the first sample of the batch,
            # so the logits are wrapped in a dict instead.
            return loss, {"logits": logits}
        return loss

# Finetuning

In [None]:
# Build the model with one classifier head per aspect-label column.
model = PhoBERTMultiAspectModel("vinai/phobert-base", len(label_keys))

In [None]:
# Training configuration: no evaluation or checkpointing during training,
# loss logged every 200 steps and reported to Weights & Biases.
# NOTE(review): PhoBERTTrainer returns pre-built DataLoaders (batch_size=8),
# so the per_device_*_batch_size values here do not control batching.
# NOTE(review): with save_strategy="no" and load_best_model_at_end=False,
# save_total_limit and metric_for_best_model look unused — confirm.
training_args = TrainingArguments(
    output_dir="./phobert_multilabel-V2",
    eval_strategy="no",
    save_strategy="no",
    load_best_model_at_end=False,
    num_train_epochs=10,  # Adjust based on dataset size
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=200,
    save_total_limit=2,
    metric_for_best_model="eval_loss",
    report_to="wandb",
)

In [None]:
# The datasets are passed as None because PhoBERTTrainer overrides the
# dataloader hooks and serves train_loader / eval_loader directly.
trainer = PhoBERTTrainer(
    model=model,
    args=training_args,
    train_dataset=None,  # No need for dataset since we use DataLoader
    eval_dataset=None,
    tokenizer=tokenizer,
)

# Run training
trainer.train()

In [None]:
# Evaluate on the validation loader (served by get_eval_dataloader) and print the metrics dict.
results = trainer.evaluate()
print(results)

In [None]:
# Turn the trainer's log history into a DataFrame and keep only the rows
# that carry a training-loss value.
log_history = trainer.state.log_history
df = pd.DataFrame(log_history)
df_loss = df[df['loss'].notna()]

# Plot the training loss over time
plt.figure(figsize=(10, 5))
plt.plot(df_loss['step'], df_loss['loss'], label="Training Loss", color="blue")
plt.xlabel("Training Steps")
plt.ylabel("Loss")
plt.title("PhoBERT Training Loss Over Time")
plt.legend()
plt.grid()
plt.show()

# Saving Model

In [None]:
import os
import torch

# Define save path
model_save_path = "./phobert_multilabel-V2"

# Ensure the save directory exists
os.makedirs(model_save_path, exist_ok=True)

# Save CPU copies of the weights. The tensors are copied to CPU one by one
# instead of calling model.cpu(), which would move the live model off the GPU
# and break any later evaluation or inference in this session.
cpu_state_dict = {
    name: tensor.detach().cpu() for name, tensor in model.state_dict().items()
}
torch.save(cpu_state_dict, os.path.join(model_save_path, "pytorch_model.bin"))

# Save tokenizer
tokenizer.save_pretrained(model_save_path)

# Save model config (useful when loading later)
# NOTE(review): this is the base encoder config only; the aspect heads are
# rebuilt by instantiating PhoBERTMultiAspectModel and loading the state dict.
model.config.to_json_file(os.path.join(model_save_path, "config.json"))

# Confirm saving is successful
print(f"✅ Model and tokenizer saved to {model_save_path}")

In [None]:
# Bundle the saved model directory into a single zip archive for download.
!zip -r phobert-v2-multilabel.zip /kaggle/working/ABSA-Vietnamese-main/phobert_multilabel-V2