In [32]:
import zipfile
import os
import pandas as pd
# 1. Giải nén
zip_path = 'model.zip'
extract_dir = './saved_model'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# 2. Load model và tokenizer từ thư mục đã giải nén
from transformers import BertForTokenClassification, BertTokenizerFast
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForTokenClassification.from_pretrained(extract_dir)
tokenizer = BertTokenizerFast.from_pretrained(extract_dir)

model.to(device)
model.eval()

print("Model và tokenizer đã được load thành công!")


Model và tokenizer đã được load thành công!


In [4]:
id2label = {0: 'O', 1: 'B-ASP', 2: 'I-ASP'}

def predict_aspects(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
    inputs = {k:v.to(device) for k,v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    
    tokens = tokenizer.tokenize(sentence)
    aspect_terms = []
    current_aspect = []
    
    # Lặp qua từng token và nhãn tương ứng
    for idx, pred in enumerate(predictions[1:len(tokens)+1]):  # Bỏ CLS token ở đầu và SEP token ở cuối
        label = id2label[pred]
        token = tokens[idx]
        
        if label == 'B-ASP':
            if current_aspect:
                aspect_terms.append(tokenizer.convert_tokens_to_string(current_aspect))
                current_aspect = []
            current_aspect = [token]
        elif label == 'I-ASP' and current_aspect:
            current_aspect.append(token)
        else:
            if current_aspect:
                aspect_terms.append(tokenizer.convert_tokens_to_string(current_aspect))
                current_aspect = []
    if current_aspect:
        aspect_terms.append(tokenizer.convert_tokens_to_string(current_aspect))
    
    # Loại bỏ ## và trim space
    aspect_terms = [term.replace('##', '').strip() for term in aspect_terms]
    return aspect_terms

# Đọc file test CSV
test_df = pd.read_csv('restaurants-trial.csv')

# Test và in kết quả
for idx, row in test_df.iterrows():
    sentence = row['Sentence']
    true_aspects = row['Aspect Term'].split(';')  # Giả sử các aspect term cách nhau bằng dấu ;
    pred_aspects = predict_aspects(sentence)
    
    print(f"Sentence: {sentence}")
    print(f"Ground Truth Aspects: {true_aspects}")
    print(f"Predicted Aspects: {pred_aspects}")
    print('-'*50)


Sentence: All the appetizers and salads were fabulous, the steak was mouth watering and the pasta was delicious!!!
Ground Truth Aspects: ['appetizers']
Predicted Aspects: ['appetizers', 'salads', 'steak', 'pasta']
--------------------------------------------------
Sentence: All the appetizers and salads were fabulous, the steak was mouth watering and the pasta was delicious!!!
Ground Truth Aspects: ['salads']
Predicted Aspects: ['appetizers', 'salads', 'steak', 'pasta']
--------------------------------------------------
Sentence: All the appetizers and salads were fabulous, the steak was mouth watering and the pasta was delicious!!!
Ground Truth Aspects: ['steak']
Predicted Aspects: ['appetizers', 'salads', 'steak', 'pasta']
--------------------------------------------------
Sentence: All the appetizers and salads were fabulous, the steak was mouth watering and the pasta was delicious!!!
Ground Truth Aspects: ['pasta']
Predicted Aspects: ['appetizers', 'salads', 'steak', 'pasta']
-----

In [5]:
def extract_aspect_terms(sentence):
    encoding = tokenizer(sentence, return_offsets_mapping=True, return_tensors="pt", truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    offsets = encoding["offset_mapping"][0]  

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits  
    predictions = torch.argmax(logits, dim=-1)[0].cpu().numpy()  

    id2label = {0: 'O', 1: 'B-ASP', 2: 'I-ASP'}

    aspect_terms = []
    current_aspect_tokens = []
    for idx, pred_id in enumerate(predictions):
        label = id2label.get(pred_id, 'O')
        token = tokenizer.convert_ids_to_tokens(int(input_ids[0][idx]))
        
        if label == 'B-ASP':
            if current_aspect_tokens:
                aspect = tokenizer.convert_tokens_to_string(current_aspect_tokens)
                aspect_terms.append(aspect.strip())
                current_aspect_tokens = []
            current_aspect_tokens = [token]
        elif label == 'I-ASP' and current_aspect_tokens:
            current_aspect_tokens.append(token)
        else:
            if current_aspect_tokens:
                aspect = tokenizer.convert_tokens_to_string(current_aspect_tokens)
                aspect_terms.append(aspect.strip())
                current_aspect_tokens = []

    if current_aspect_tokens:
        aspect = tokenizer.convert_tokens_to_string(current_aspect_tokens)
        aspect_terms.append(aspect.strip())

    aspect_terms = list(set(aspect_terms))

    return aspect_terms

# Test
test_sentence = "The battery life is amazing but the screen is too dim."
aspect_terms = extract_aspect_terms(test_sentence)
print("Sentence:", test_sentence)
print("Extracted Aspects:", aspect_terms)


Sentence: The battery life is amazing but the screen is too dim.
Extracted Aspects: ['battery life', 'screen']


In [6]:
print(type(sentence))      
print(type(aspect_terms))   

if isinstance(aspect_terms, list):
    aspect_term = " ".join(aspect_terms)


<class 'str'>
<class 'list'>


In [7]:
for aspect in aspect_terms:
    if not isinstance(aspect, str):
        aspect = " ".join(aspect)
    inputs = tokenizer(sentence, aspect, return_tensors='pt', truncation=True, padding=True)



In [8]:
print(type(sentence))      
print(type(aspect))

<class 'str'>
<class 'str'>


In [9]:
import pandas as pd
df=pd.read_csv('Train.csv')

In [10]:
df.head()


Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12


In [11]:
data = df[['Sentence', 'Aspect Term', 'polarity']].to_dict(orient='records')

In [12]:
print(data[:5])  

[{'Sentence': 'I charge it at night and skip taking the cord with me because of the good battery life.', 'Aspect Term': 'cord', 'polarity': 'neutral'}, {'Sentence': 'I charge it at night and skip taking the cord with me because of the good battery life.', 'Aspect Term': 'battery life', 'polarity': 'positive'}, {'Sentence': 'The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the "sales" team, which is the retail shop which I bought my netbook from.', 'Aspect Term': 'service center', 'polarity': 'negative'}, {'Sentence': 'The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the "sales" team, which is the retail shop which I bought my netbook from.', 'Aspect Term': '"sales" team', 'polarity': 'negative'}, {'Sentence': 'The tech guy then said the service center does not do 1-to-1 exchange and I have to direct my concern to the "sales" team, which is the retail shop which I bought my ne

In [13]:
from torch.utils.data import Dataset

class AspectSentimentDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = {'positive': 0,'negative': 1,'neutral': 2,'conflict': 3}
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        sentence = item['Sentence']
        aspect = item['Aspect Term']
        sentiment = item['polarity']
        
        inputs = self.tokenizer(
            sentence,
            aspect,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        label = self.label2id[sentiment]
        
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [14]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import torch
dataset = AspectSentimentDataset(data, tokenizer, max_len=128)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()
num_epochs = 1
for epoch in range(num_epochs):
    total_loss = 0
    total_correct = 0
    total_samples = 0
    
    loop = tqdm(loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits  

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        correct = (preds == labels).sum().item()
        total_correct += correct
        total_samples += labels.size(0)

        acc = total_correct / total_samples

        loop.set_postfix(loss=f"{loss.item():.4f}", accuracy=f"{acc:.4f}")

    print(f"Epoch {epoch+1} completed, average loss: {total_loss/len(loader):.4f}, accuracy: {acc:.4f}")

Epoch 1: 100%|█| 379/379 [1:21:27<00:00, 12.89s/it, accuracy=0.6895, loss=0

Epoch 1 completed, average loss: 0.7948, accuracy: 0.6895





In [58]:
sentences = [
    "The battery life of this phone is amazing.",
    "The screen is too dim to use outside.",
    "The phone comes in black and white colors."
]

aspects = [
    "battery life",
    "screen",
    "color"
]

inputs = tokenizer(
    [f"{s} [SEP] {a}" for s, a in zip(sentences, aspects)],
    return_tensors="pt",
    truncation=True,
    padding=True
).to(device)

outputs = model(**inputs)
logits = outputs.logits
pred_ids = torch.argmax(logits, dim=1)  # shape [batch_size]

id2label = {0: 'positive', 1: 'negative', 2: 'neutral', 3: 'conflict'}
pred_labels = [id2label[i.item()] for i in pred_ids]

for sent, asp, label in zip(sentences, aspects, pred_labels):
    print(f"Sentence: {sent}")
    print(f"Aspect: {asp}")
    print(f"Predicted sentiment: {label}")
    print("-" * 30)


RuntimeError: a Tensor with 3 elements cannot be converted to Scalar

In [59]:
print(f"pred_ids shape: {pred_ids.shape}")
for i, val in enumerate(pred_ids):
    print(f"pred_ids[{i}]: {val}, shape: {val.shape}, type: {type(val)}")


pred_ids shape: torch.Size([3, 3])
pred_ids[0]: tensor([7, 2, 3]), shape: torch.Size([3]), type: <class 'torch.Tensor'>
pred_ids[1]: tensor([3, 2, 9]), shape: torch.Size([3]), type: <class 'torch.Tensor'>
pred_ids[2]: tensor([ 4, 11,  9]), shape: torch.Size([3]), type: <class 'torch.Tensor'>


In [60]:
print(f"logits shape: {logits.shape}")
print(f"logits sample: {logits[0]}")

# Kiểm tra giá trị argmax ở các chiều
pred_ids_dim1 = torch.argmax(logits, dim=1)
print(f"argmax dim=1 shape: {pred_ids_dim1.shape}")

pred_ids_dim2 = torch.argmax(logits, dim=2)
print(f"argmax dim=2 shape: {pred_ids_dim2.shape}")


logits shape: torch.Size([3, 14, 3])
logits sample: tensor([[ 3.7309, -1.9212, -2.0670],
        [ 5.2071, -2.8100, -2.0040],
        [ 0.2058,  2.5244, -2.0840],
        [ 0.2199, -1.9082,  2.2136],
        [ 3.8750, -2.7296, -1.1596],
        [ 4.7836, -1.9739, -1.9726],
        [ 3.1721, -0.8119, -1.7712],
        [ 5.3619, -2.6456, -2.3874],
        [ 5.3613, -2.5043, -2.3808],
        [ 1.7437, -0.5577, -0.7162],
        [ 2.9965, -1.4894, -0.8913],
        [ 0.8897,  2.0194, -2.4748],
        [ 1.4126, -2.3025,  1.0760],
        [ 2.9953, -1.4891, -0.8906]], grad_fn=<SelectBackward0>)
argmax dim=1 shape: torch.Size([3, 3])
argmax dim=2 shape: torch.Size([3, 14])


In [53]:
print(f"logits shape: {logits.shape}")   # xem shape logits
print(f"logits: {logits}")

pred_ids = torch.argmax(logits, dim=1)
print(f"pred_ids shape: {pred_ids.shape}") 
print(f"pred_ids: {pred_ids}")

for i, val in enumerate(pred_ids):
    print(f"pred_ids[{i}]: {val}, shape: {getattr(val, 'shape', None)}, type: {type(val)}")
    try:
        scalar = val.item()
        print(f"pred_ids[{i}] as scalar: {scalar}")
    except Exception as e:
        print(f"Error converting pred_ids[{i}] to scalar:", e)


logits shape: torch.Size([3, 4])
logits: tensor([[ 0.5470,  0.5108,  1.1177, -1.6352],
        [ 0.2685, -0.3193,  2.1372, -1.9222],
        [-0.2163,  2.4923,  0.1604, -1.0454]], grad_fn=<AddmmBackward0>)
pred_ids shape: torch.Size([3])
pred_ids: tensor([2, 2, 1])
pred_ids[0]: 2, shape: torch.Size([]), type: <class 'torch.Tensor'>
pred_ids[0] as scalar: 2
pred_ids[1]: 2, shape: torch.Size([]), type: <class 'torch.Tensor'>
pred_ids[1] as scalar: 2
pred_ids[2]: 1, shape: torch.Size([]), type: <class 'torch.Tensor'>
pred_ids[2] as scalar: 1


In [19]:
# Giả sử bạn đã huấn luyện xong Model 2 (model và tokenizer là biến bạn đang dùng)
save_path = "./model2"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


('./model2\\tokenizer_config.json',
 './model2\\special_tokens_map.json',
 './model2\\vocab.txt',
 './model2\\added_tokens.json',
 './model2\\tokenizer.json')