In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw training table from a CSV file located in the working directory.
df = pd.read_csv('Train.csv')


In [3]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12


In [4]:
# Collapse the per-aspect rows into one row per (id, Sentence) pair whose
# 'Aspect Term' value is the comma-separated list of every aspect term
# recorded for that sentence. This string is the seq2seq training target.
grouped = (
    df.groupby(['id', 'Sentence'])['Aspect Term']
    .apply(', '.join)
    .reset_index()
)

# Hold out 10% of the grouped sentences for validation; the fixed seed keeps
# the split reproducible across runs.
train_df, val_df = train_test_split(grouped, test_size=0.1, random_state=42)


In [5]:
class ATE_Dataset(Dataset):
    """Torch dataset that turns (sentence, aspect-terms) rows into seq2seq tensors.

    Each item is the sentence prefixed with ``"extract aspects: "`` as model
    input and the 'Aspect Term' string as the target, both tokenized and
    padded/truncated to a fixed length.

    Args:
        df: DataFrame with a 'Sentence' column and an 'Aspect Term' column.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and exposes ``pad_token_id``.
        max_input_len: Fixed token length of the encoded input.
        max_output_len: Fixed token length of the encoded target.
    """

    def __init__(self, df, tokenizer, max_input_len=128, max_output_len=32):
        self.inputs = df['Sentence'].tolist()
        self.targets = df['Aspect Term'].tolist()
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' for row ``idx``."""
        input_text = "extract aspects: " + self.inputs[idx]
        target_text = self.targets[idx]

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_input_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_output_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also remove the sequence axis whenever the
        # configured max length is 1, yielding 0-d tensors.
        labels = target_encoding['input_ids'].squeeze(0)
        # Replace padding positions with -100 so the loss ignores them.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels
        }


In [6]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_name = 't5-base'
# Load the matching tokenizer and the conditional-generation model.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Wrap the train/validation frames in datasets using the default max lengths.
train_dataset = ATE_Dataset(train_df, tokenizer)
val_dataset = ATE_Dataset(val_df, tokenizer)

# Only the training loader is shuffled; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# `device` is read as a global by train_epoch, evaluate and predict_aspects.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# The optimizer is created after the model has been moved to `device`.
optimizer = AdamW(model.parameters(), lr=3e-5)


In [9]:
from tqdm import tqdm

def train_epoch(model, dataloader, optimizer):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        model: Model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with ``.loss``.
        dataloader: Iterable of dict batches holding those three keys.
        optimizer: Optimizer stepping the model's parameters.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    progress = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }
        loss = model(**model_inputs).loss

        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        batch_losses.append(loss_value)
        progress.set_postfix(loss=loss_value)
    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader):
    """Compute the mean batch loss over ``dataloader`` without updating weights.

    Shows a progress bar and moves batches to the module-level ``device``.

    Args:
        model: Model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with ``.loss``.
        dataloader: Iterable of dict batches holding those three keys.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    batch_losses = []
    progress = tqdm(dataloader, desc="Evaluating", leave=False)
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in progress:
            model_inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }
            loss_value = model(**model_inputs).loss.item()
            batch_losses.append(loss_value)
            progress.set_postfix(loss=loss_value)
    return sum(batch_losses) / len(dataloader)


# Number of full passes over the training data.
num_epochs = 3

# Train for `num_epochs` epochs, reporting the mean train and validation loss
# after each one.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss = train_epoch(model, train_loader, optimizer)
    val_loss = evaluate(model, val_loader)
    print(f"Epoch {epoch+1} | Train loss: {train_loss:.4f} | Val loss: {val_loss:.4f}")



Epoch 1/3


Training:   0%|                                    | 0/198 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
                                                                           

Epoch 1 | Train loss: 1.3586 | Val loss: 0.5189
Epoch 2/3


                                                                           

Epoch 2 | Train loss: 0.6253 | Val loss: 0.4016
Epoch 3/3


                                                                           

Epoch 3 | Train loss: 0.5048 | Val loss: 0.3633




In [10]:
def evaluate(model, dataloader):
    """Compute the mean batch loss over ``dataloader`` without a progress bar.

    NOTE: this rebinds the name ``evaluate``; once this cell runs, any
    earlier definition of ``evaluate`` in the notebook is replaced.

    Args:
        model: Model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with ``.loss``.
        dataloader: Iterable of dict batches holding those three keys.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    tensor_keys = ('input_ids', 'attention_mask', 'labels')
    model.eval()
    loss_sum = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            on_device = {name: batch[name].to(device) for name in tensor_keys}
            result = model(**on_device)
            loss_sum += result.loss.item()
    return loss_sum / len(dataloader)


# Number of additional passes over the training data.
num_epochs = 3

# NOTE(review): `model` and `optimizer` are not re-created in this cell, so
# this loop keeps training the same objects rather than starting fresh.
# The captured output shows this run ended with a KeyboardInterrupt.
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer)
    val_loss = evaluate(model, val_loader)
    print(f"Epoch {epoch+1} | Train loss: {train_loss:.4f} | Val loss: {val_loss:.4f}")



                                                                           

KeyboardInterrupt: 

In [11]:

# Save the model and the tokenizer into the same directory.
model.save_pretrained('./t5-ate-model')
tokenizer.save_pretrained('./t5-ate-model')

# --- 9. Trial inference function ---

def predict_aspects(sentence, model, tokenizer, max_len=128, num_beams=4, max_output_len=32):
    """Generate the aspect-term string for a single sentence.

    The sentence is prefixed with ``"extract aspects: "``, tokenized, and
    decoded with beam search.

    Args:
        sentence: Raw input sentence.
        model: Generation model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_len: Maximum number of input tokens; longer inputs are truncated.
        num_beams: Beam width passed to ``generate``.
        max_output_len: Maximum length passed to ``generate`` (previously
            hard-coded to 32).

    Returns:
        The decoded generation with special tokens removed.
    """
    model.eval()

    # Place the inputs on the device of the model that was passed in, instead
    # of relying on a notebook-level global that may point somewhere else.
    model_device = getattr(model, "device", None)
    if model_device is None:
        model_device = next(model.parameters()).device

    input_text = "extract aspects: " + sentence
    inputs = tokenizer(
        input_text, return_tensors='pt', max_length=max_len, truncation=True
    ).to(model_device)

    outputs = model.generate(
        **inputs,
        max_length=max_output_len,
        num_beams=num_beams,
        early_stopping=True
    )
    # Only the first returned sequence is decoded.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: run the model on a single hand-written sentence.
test_sentence = "The battery life is amazing but the screen is dull."
print("Predicted aspects:", predict_aspects(test_sentence, model, tokenizer))

Predicted aspects: battery life, screen


In [14]:
# Hand-written sentences, grouped by topic, used to try the model out.
test_sentences = [
    # Topic: movies
    "The acting was phenomenal, but the storyline was predictable.",
    "I loved the cinematography, but the pacing was too slow.",
    "The soundtrack was amazing and the dialogue felt natural.",
    "The plot twists were unexpected, but the ending was disappointing.",
    "The characters were well-developed and very relatable.",
    "The visual effects were stunning, but the script was weak.",
    "The movie had a great message, but the direction lacked focus.",
    "The performance by the lead actor was top-notch.",
    "The editing was smooth and the scenes transitioned well.",
    "The humor was forced, and the romance felt unnecessary.",

    # Topic: technology
    "The battery life of this smartphone is incredible, but the camera quality is mediocre.",
    "I really appreciate the fast processor, but the device heats up quickly.",
    "The screen resolution is crystal clear, yet the speaker sound is disappointing.",
    "Charging is fast but the charger cable feels fragile.",
    "The software update fixed many bugs but introduced new ones.",

    # Topic: customer service
    "The customer service was very helpful and resolved my issue quickly.",
    "Waiting time was too long, but the representative was polite.",
    "They responded promptly, but the solution was not satisfactory.",
    "The support team was unprofessional and rude.",
    "I appreciated the follow-up calls after the purchase.",

    # Topic: food
    "The pizza crust was crispy and delicious, but the toppings were sparse.",
    "Service was quick, but the waiter forgot our drinks.",
    "The dessert was heavenly, and the coffee was perfectly brewed.",
    "Portions were generous but the main dish lacked flavor.",
    "The ambiance was cozy, and the music set the perfect mood.",

    # Topic: travel
    "The hotel room was spacious and clean, but the Wi-Fi connection was poor.",
    "I loved the guided tour, but the transportation was uncomfortable.",
    "The beach was pristine and beautiful, though a bit crowded.",
    "The local food was delicious, but the prices were a bit high.",
    "The museum had an impressive collection but lacked clear explanations."
]


# Print each sentence with the aspect string generated for it, separated by
# a 60-character rule.
for sent in test_sentences:
    aspects = predict_aspects(sent,model, tokenizer)
    print(f"Sentence: {sent}")
    print(f"Extracted Aspects: {aspects}")
    print('-'*60)

Sentence: The acting was phenomenal, but the storyline was predictable.
Extracted Aspects: acting, storyline
------------------------------------------------------------
Sentence: I loved the cinematography, but the pacing was too slow.
Extracted Aspects: cinematography, pacing
------------------------------------------------------------
Sentence: The soundtrack was amazing and the dialogue felt natural.
Extracted Aspects: soundtrack, dialogue
------------------------------------------------------------
Sentence: The plot twists were unexpected, but the ending was disappointing.
Extracted Aspects: plot twists
------------------------------------------------------------
Sentence: The characters were well-developed and very relatable.
Extracted Aspects: characters
------------------------------------------------------------
Sentence: The visual effects were stunning, but the script was weak.
Extracted Aspects: visual effects, script
-------------------------------------------------------