# 1. Preparing the Dataset

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import pandas as pd
import re 

In [None]:
data_path_1 = '/kaggle/input/vietnamese-comment/comments.csv'
data_path_2 = '/kaggle/input/vietnamese-comment/extra.csv' 

In [None]:
data_1 = pd.read_csv(data_path_1)
data_1.head()

In [None]:
len(data_1) 

In [None]:
data_2 = pd.read_csv(data_path_2)
data_2.head()

In [None]:
len(data_2) 

In [None]:
# data_1 = data_1.drop(columns=['title'])
# data = pd.concat([data_1, data_2])

data = data_1

In [None]:
data.head(10) 

In [None]:
len(data) 

In [None]:
missing_values = data.isnull().sum()
print("Dữ liệu bị thiếu:\n", missing_values)

In [None]:
duplicates = data.duplicated(subset=["content"]).sum()
print("Số câu trùng lặp:", duplicates)

# 2. Data Preprocessing

## 2.1 Data Cleaning

In [None]:
# Alternative kept for reference: drop only the rows whose comment body is empty.
# data = data.dropna(subset=["content"])
# Drop every row that is missing either its title or its comment body.
data = data.dropna(subset=['title', 'content']) 

In [None]:
len(data)

In [None]:
import unicodedata

In [None]:
# Dictionary for common Vietnamese slang/abbreviations.
# clean_text() joins these keys into a single regex alternation in insertion
# order, and regex alternation is first-match-wins. A key that is a prefix of
# a longer key ("m" vs "m.n") must therefore be listed AFTER the longer one,
# otherwise "m.n" is rewritten as "mình.n" instead of "mọi người".
abbreviations = {
    "ko": "không",
    "sp": "sản phẩm",
    "k": "không",
    "m.n": "mọi người",
    "m": "mình",
    "đc": "được",
    "dc": "được",
    "h": "giờ",
    "trloi": "trả lời",
    "cg": "cũng",
    "bt": "bình thường",
    "dt": "điện thoại",
    "mt": "máy tính"
    # add more slang mappings
}

# Regex patterns
url_pattern = r"http\S+|www\S+"  # URLs
user_pattern = r"@\w+"  # usernames
emoji_pattern = re.compile(
    "["  # start
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+", flags=re.UNICODE)
# ASCII emoticons such as ":)" or ";-p".
# NOTE(review): "8" counts as an "eyes" character, so this also matches the
# "8/" in a score like "8/10" - confirm that is acceptable for this corpus.
emoticon_pattern = r"[:;=8][\-o\*']?[\)\]\(\[dDpP/:}\{@\|\\]"  # emoticons
# 3 or more repeats of the same character are collapsed to one ("quaaa" -> "qua").
# Digits are excluded so that numbers such as "1000" are not turned into "10".
repeat_pattern = re.compile(r"([^\d\n])\1{2,}")

In [None]:
def clean_text(text: str) -> str:
    """Normalise one raw comment into lowercase, punctuation-free text.

    Steps, in order: NFC Unicode normalisation, lowercasing, removal of URLs
    and @usernames, replacement of emojis/emoticons by a space, expansion of
    the slang listed in ``abbreviations``, collapsing of characters matched by
    ``repeat_pattern``, replacement of punctuation by a space, and whitespace
    squeezing.

    Args:
        text: The raw comment. Non-string values are converted with ``str()``.

    Returns:
        The cleaned text; may be an empty string.
    """
    # Unicode normalization (str() first so non-string cells do not crash)
    text = unicodedata.normalize('NFC', str(text))

    # Lowercase
    text = text.lower()

    # Remove URLs and usernames
    text = re.sub(url_pattern, '', text)
    text = re.sub(user_pattern, '', text)

    # Remove emojis and emoticons
    text = emoji_pattern.sub(' ', text)
    text = re.sub(emoticon_pattern, ' ', text)

    # Expand common abbreviations
    if abbreviations:
        # Regex alternation is first-match-wins, so list the longest keys
        # first; otherwise "m" would win over "m.n" and "m.n" would never be
        # expanded.
        longest_first = sorted(abbreviations, key=len, reverse=True)
        pattern = re.compile(r"\b(" + "|".join(map(re.escape, longest_first)) + r")\b")
        text = pattern.sub(
            lambda match: abbreviations.get(match.group(0), match.group(0)),
            text,
        )

    # Remove repeated characters (e.g., "quaaa" -> "qua")
    text = repeat_pattern.sub(r"\1", text)

    # Remove punctuation (keep letters, digits and underscore)
    text = re.sub(r"[^\w\s\u00C0-\u024F]", ' ', text)

    # Remove extra whitespace
    text = re.sub(r"\s+", ' ', text).strip()

    return text

In [None]:
sample = "Tui thik qááá!!! 😊😊, thanks @ban http://example.com"
print(clean_text(sample))  # Prints "tui thik qá thanks": "thik"/"thanks" are not keys of `abbreviations`, and diacritics are kept

In [None]:
data["text"] = data["content"].apply(clean_text)

In [None]:
data.head()

In [None]:
# Merge comments whose cleaned text is identical into one row whose rating is
# the mean of the merged ratings. Only the `text` and `rating` columns survive.
data = data.groupby('text', as_index=False)['rating'].mean()  

In [None]:
data['rating'] = np.floor(data['rating']).astype(int) 

In [None]:
duplicates = data.duplicated(subset=["text"]).sum()
print("Số câu trùng lặp sau xử lý:", duplicates) 

In [None]:
def sentiment(r):
    """Map a star rating to its Vietnamese sentiment name.

    A rating of exactly 3 is neutral, 4 and above is positive, and anything
    else is negative.
    """
    if r == 3:
        return "bình thường"
    return "tích cực" if r >= 4 else "tiêu cực"
def label(r):
    """Map a star rating to a class id: 2 = positive, 1 = neutral, 0 = negative.

    A rating of exactly 3 is neutral, 4 and above is positive, and anything
    else is negative.
    """
    if r == 3:
        return 1
    return 2 if r >= 4 else 0
data["sentiment"] = data["rating"].apply(sentiment)
data["label"] = data["rating"].apply(label)

In [None]:
data.head()

## 2.2 Statistical Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
print("Số lượng mẫu:", data.shape[0])

In [None]:
# Bar chart of how many samples carry each sentiment label.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=data["sentiment"], palette="coolwarm", ax=ax)
ax.set_title("Phân phối nhãn cảm xúc")
ax.set_xlabel("Cảm xúc")
ax.set_ylabel("Số lượng mẫu")
plt.show()

In [None]:
missing_values = data.isnull().sum()
print("Dữ liệu bị thiếu:\n", missing_values)

In [None]:
label_counts = data["sentiment"].value_counts()
print("Số lượng mỗi nhãn:\n", label_counts)

In [None]:
!pip install underthesea 

In [None]:
from underthesea import word_tokenize
import nltk
import wordcloud

In [None]:
data['corpus'] = data['text'].map(lambda text: word_tokenize(text, format="text")) 

In [None]:
data.sample(10) 

In [None]:
# Build the bag of words: every whitespace-separated token of every non-empty
# segmented comment, gathered into one flat list.
all_words_flat = [
    word
    for segmented in data['corpus'].tolist()
    if segmented and segmented != ''
    for word in segmented.split()
]

# Frequency distribution over the flattened token list
all_words_dist = nltk.FreqDist(all_words_flat)

In [None]:
# Print the total number of words and the 15 most common words
print('Tổng số từ: {}'.format(len(all_words_dist)))
print('Từ xuất hiện nhiều: {}'.format(all_words_dist.most_common(15)))

In [None]:
corpus = " ".join(all_words_flat)
plt.figure(figsize=(12,8))
word_cloud = wordcloud.WordCloud(max_words=100, background_color ="black", width=2000, height=1000, mode="RGB").generate(corpus)
plt.axis("off")
plt.imshow(word_cloud)

## 2.3 Oversampling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    stratify=data['label'],
    random_state=42
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# TF-IDF features. The vocabulary and IDF weights are learned from the
# training split only; the test split is merely transformed with them.
# The input is the cleaned `text` column, not the word-segmented `corpus` column.
vectorizer = TfidfVectorizer()
# vectorizer = CountVectorizer()

train_sentences_tfidf = vectorizer.fit_transform(train_sentences)
test_sentences_tfidf = vectorizer.transform(test_sentences) 

### 2.3.1 RandomOverSampler 

In [None]:
!pip install scikit-learn==1.3.0 

In [None]:
from imblearn.over_sampling import RandomOverSampler 

In [None]:
from collections import Counter

In [None]:
ros = RandomOverSampler(random_state=42)

In [None]:
# NOTE: this rebinds both names to the resampled data. Whenever the sampler
# adds rows, `train_labels` no longer lines up row-for-row with the original
# `train_sentences` Series, and re-running this cell resamples the result again.
train_sentences_tfidf, train_labels = ros.fit_resample(train_sentences_tfidf, train_labels)

In [None]:
Counter(train_labels)

### 2.3.2 SMOTE 

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN 

In [None]:
train_labels.value_counts()

In [None]:
# Seeded so the synthetic samples are the same on every run.
# NOTE: this resamples whatever the previous cells left in
# train_sentences_tfidf / train_labels, so running several oversampling cells
# in a row stacks their effects - pick one strategy per experiment.
smote = SMOTE(sampling_strategy='not majority', random_state=42)
train_sentences_tfidf, train_labels = smote.fit_resample(train_sentences_tfidf, train_labels)

In [None]:
# Seeded so the synthetic samples are the same on every run.
# NOTE: this resamples whatever the previous cells left in
# train_sentences_tfidf / train_labels, so running several oversampling cells
# in a row stacks their effects - pick one strategy per experiment.
adasyn = ADASYN(sampling_strategy='not majority', random_state=42)
train_sentences_tfidf, train_labels = adasyn.fit_resample(train_sentences_tfidf, train_labels)

In [None]:
Counter(train_labels) 

# Extract aspects 

In [None]:
# Aspect name -> keyword phrases that signal the aspect.
# extract_aspects() splits each phrase on whitespace and compares the pieces
# with consecutive lowercased tokens: a phrase written with a space
# ("giao hàng") matches only two separate tokens, while one written with an
# underscore ("bao_bì") matches only a single token spelled exactly like that.
seed_aspects = {
    'vận_chuyển': ['giao hàng', 'tiki giao', 'nhận hàng'],
    'đóng_gói': ['đóng gói', 'bao_bì'],
    'sản_phẩm': ['cuốn sách', 'với giá', 'chất_lượng', 'sản_phẩm'] 
}

def extract_aspects(text, seed_aspects, vocab):
    """
    Find seed keyword phrases inside a sentence.

    The sentence is tokenized with ``vocab.tokenize_corpus``; each keyword is
    lowercased, split on whitespace and compared with consecutive lowercased
    tokens. Only the first occurrence of each keyword is reported.

    Returns ``(tokenized, found)`` where ``found`` is a list of tuples
    ``(aspect_key, aspect_phrase, start_idx, end_idx)``; start/end are token
    indices into ``tokenized`` (inclusive).
    """
    tokenized = vocab.tokenize_corpus([text])[0]  # expected to be a list of tokens
    lowered = [token.lower() for token in tokenized]
    found = []
    for asp_key, keywords in seed_aspects.items():
        for keyword in keywords:
            needle = keyword.lower().split()
            width = len(needle)
            if not width:
                continue
            # Position of the first window equal to the keyword, if any.
            start = next(
                (pos for pos in range(len(lowered) - width + 1)
                 if lowered[pos:pos + width] == needle),
                None,
            )
            if start is not None:
                phrase = " ".join(tokenized[start:start + width])
                found.append((asp_key, phrase, start, start + width - 1))
    return tokenized, found

def get_context_string(tokens, start, end, window=3):
    """Join the tokens from ``window`` before ``start`` to ``window`` after ``end``.

    Both ends are clipped to the token list; ``start`` and ``end`` are
    inclusive token indices.
    """
    first = start - window if start > window else 0
    last = min(end + window, len(tokens) - 1)
    return " ".join(tokens[first:last + 1])

# 3. ML Model

## 3.1 Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
nb = MultinomialNB()
nb.fit(train_sentences_tfidf, train_labels)

In [None]:
pred = nb.predict(test_sentences_tfidf)

In [None]:
print("Accuracy:", accuracy_score(test_labels, pred))
print("Classification Report:")
print(classification_report(test_labels, pred))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, pred))

In [None]:
label2sen = {0: "Tiêu cực", 1: "Bình thường", 2: "Tích cực"} 

In [None]:
class _AspectTokenizer:
    """Stand-in that offers the `tokenize_corpus` interface extract_aspects needs.

    The `Vocabulary` object is only created further down the notebook, so this
    cell cannot rely on it when the notebook is run top to bottom.
    """

    @staticmethod
    def tokenize_corpus(corpus):
        # Multi-syllable words are joined with "_" so one word is one token.
        return [[token.replace(" ", "_") for token in word_tokenize(document)]
                for document in corpus]


print("Chạy Naive Bayes (gõ 'thoát' để dừng).")
while True:
    input_text = input("Nhập câu cần kiểm tra: ").strip()
    if input_text.lower() == "thoát":
        print("Chúc một ngày tốt lành !")
        break

    # tokenization + aspect extraction
    cleaned_text = clean_text(input_text)
    tokenized, aspects = extract_aspects(cleaned_text, seed_aspects, _AspectTokenizer)

    if len(aspects) == 0:
        # fallback: classify the whole sentence
        vec = vectorizer.transform([cleaned_text])
        # `nb` is the classifier fitted in this section; `model` is not
        # defined here (and later names the neural network, which has no predict()).
        sentence_label = nb.predict(vec)[0]
        print("Không tìm thấy aspect. Dự đoán cảm xúc toàn câu:", label2sen[sentence_label], "\n")
    else:
        print(f"Tìm thấy {len(aspects)} aspect:")
        for asp_key, asp_phrase, s, e in aspects:
            context = get_context_string(tokenized, s, e, window=3)
            # The TF-IDF vocabulary was fitted on un-segmented text, so undo
            # the "_" joins before vectorising; otherwise segmented words are
            # out of vocabulary.
            vec = vectorizer.transform([context.replace("_", " ")])
            aspect_label = nb.predict(vec)[0]
            print(f" - Aspect '{asp_key}' (\"{asp_phrase}\") => {label2sen[aspect_label]} (context: {context})")
        print()

## 3.2 Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(max_depth=200, random_state=0)

In [None]:
clf.fit(train_sentences_tfidf, train_labels)

In [None]:
pred = clf.predict(test_sentences_tfidf) 

In [None]:
print("Accuracy:", accuracy_score(test_labels, pred))
print("Classification Report:")
print(classification_report(test_labels, pred))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, pred))

# 4. DL Model

## 4.1 Word Embedding

In [None]:
!pip install torch==2.2.0 

In [None]:
!pip install torchtext==0.17.0 

In [None]:
import torch 
import torchtext.vocab as vocab

In [None]:
input_path = '/kaggle/input/vietnamese-comment/vi_word2vec.txt'
output_path = '/kaggle/working/vi_word2vec_reduced.txt' 
max_lines = 100000  # Số dòng bạn muốn giữ lại 

In [None]:
# Copy the first lines of the embedding file (line indexes 0..max_lines, i.e.
# max_lines + 1 lines in total) into the reduced file.
with open(input_path, 'r', encoding='utf-8') as infile, \
        open(output_path, 'w', encoding='utf-8') as outfile:
    for line_no, line in enumerate(infile):
        if line_no <= max_lines:
            outfile.write(line)
        else:
            break

In [None]:
word_embedding = vocab.Vectors(name = '/kaggle/working/vi_word2vec_reduced.txt', unk_init = torch.Tensor.normal_)
word_embedding.vectors.shape

In [None]:
def get_vector(embeddings, word):
    """Return the embedding row stored for ``word``.

    Raises:
        AssertionError: if ``word`` is not a key of ``embeddings.stoi``.
    """
    index_by_word = embeddings.stoi
    assert word in index_by_word, f'*{word}* is not in the vocab!'
    row = index_by_word[word]
    return embeddings.vectors[row]

def closest_words(embeddings, vector, n=10):
    """Return the ``n`` vocabulary words whose vectors are nearest to ``vector``.

    Distances are Euclidean (L2) and are computed for the whole vocabulary in
    a single tensor operation instead of one ``torch.dist`` call per word.

    Args:
        embeddings: Object exposing ``itos`` (list of words), ``stoi``
            (word -> row index) and ``vectors`` (2-D tensor of embeddings).
        vector: 1-D tensor with the same width as ``embeddings.vectors``.
        n: How many neighbours to return.

    Returns:
        A list of ``(word, distance)`` tuples sorted by increasing distance;
        words at equal distance keep their ``itos`` order.
    """
    words = embeddings.itos
    if len(words) == 0:
        return []

    # Look rows up through stoi so the result does not depend on itos and
    # vectors happening to share the same ordering.
    rows = torch.tensor([embeddings.stoi[word] for word in words], dtype=torch.long)
    distances = torch.linalg.norm(embeddings.vectors[rows] - vector, dim=1)

    # Stable sort keeps the original tie-breaking (first word in itos wins).
    nearest = torch.argsort(distances, stable=True)[:n]
    return [(words[i], distances[i].item()) for i in nearest.tolist()]

In [None]:
word_vector = get_vector(word_embedding, "Lạc_Long_Quân")

closest_words(word_embedding, word_vector, n=20)

## 4.2 Vocabulary Class

In [None]:
from tqdm import tqdm

In [None]:
class Vocabulary:
    """Two-way mapping between words and integer ids.

    Id 0 is reserved for ``<pad>`` and id 1 for ``<unk>``; every other word
    receives the next free id when it is added. Looking up a word that was
    never added returns the ``<unk>`` id.
    """

    def __init__(self):
        self.word2id = dict()
        self.word2id['<pad>'] = 0   # Pad Token
        self.word2id['<unk>'] = 1   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        # Reverse lookup table (id -> word), kept in sync by add().
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """Return the id of ``word``, or the ``<unk>`` id if it is unknown."""
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def word_from_id(self, word_index):
        """Return the word stored under ``word_index``.

        This used to be a method called ``id2word``, but the ``id2word`` dict
        assigned in ``__init__`` hid it on every instance, so calling it
        raised ``TypeError``. The dict keeps its name because other methods
        index it directly; the method is renamed so it can actually be called.
        """
        return self.id2word[word_index]

    def add(self, word):
        """Register ``word`` if needed and return its id."""
        if word not in self:
            word_index = self.word2id[word] = len(self.word2id)
            self.id2word[word_index] = word
            return word_index
        else:
            return self[word]

    @staticmethod
    def tokenize_corpus(corpus):
        """Word-segment every document; spaces inside a token become ``_``."""
        print("Tokenize the corpus...")
        tokenized_corpus = list()
        for document in tqdm(corpus):
            tokenized_document = [word.replace(" ", "_") for word in word_tokenize(document)]
            tokenized_corpus.append(tokenized_document)

        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """Convert documents into a list of 1-D int64 tensors of word ids.

        Args:
            corpus: Raw strings, or lists of tokens when ``is_tokenized`` is True.
            is_tokenized: Skip tokenization when the corpus is already tokenized.
        """
        if is_tokenized:
            tokenized_corpus = corpus
        else:
            tokenized_corpus = self.tokenize_corpus(corpus)
        indicies_corpus = list()
        for document in tqdm(tokenized_corpus):
            indicies_document = torch.tensor(list(map(lambda word: self[word], document)),
                                             dtype=torch.int64)
            indicies_corpus.append(indicies_document)

        return indicies_corpus

    def tensor_to_corpus(self, tensor):
        """Convert an iterable of id tensors back into lists of words."""
        corpus = list()
        for indicies in tqdm(tensor):
            document = list(map(lambda index: self.id2word[index.item()], indicies))
            corpus.append(document)

        return corpus

    # def add_words_from_corpus(self, corpus, is_tokenized=False):
    #     print("Add words from the corpus...")
    #     if is_tokenized:
    #         tokenized_corpus = corpus
    #     else:
    #         tokenized_corpus = self.tokenize_corpus(corpus)
    #     word_freq = Counter(chain(*tokenized_corpus))
    #     non_singletons = [w for w in word_freq if word_freq[w] > 1]
    #     print(f"Number of words in the corpus: {len(word_freq)}")
    #     print(f"Number of words with frequency > 1: {len(non_singletons)}")
    #     for word in non_singletons:
    #         self.add(word)

In [None]:
corpus_sample = ["Đẹp lắm mn ơi k ngờ fahasa bán alb thơ này của Lana lun, bh khó mua lắm",
                 "Shop giao hàng nhanh, đóng gói hàng cẩn thận. Mặc dù sách có bé hơn mình nghĩ nhưng shop rất chu đáo. Vì mình mua gần tết nên có đc tặng thêm cả lì xì nữa. Rất đáng tiền. Mn mua ủng hộ shop nhé.",
                 "lần đầu mua nhưng ok lắm luôn sắp tết nên đc tặng tập lì xì sách nhỏ nhưng bọc hộp đầy đủ đặc biệt tặng cả voucher cho lần sau chỉ có cái sách được bọc bằng màng thực phẩm"]

Vocabulary.tokenize_corpus(corpus_sample)

In [None]:
# NOTE: `vocab` was previously the alias of the `torchtext.vocab` module; this
# assignment replaces it, so re-running the cell that calls `vocab.Vectors(...)`
# after this one will fail until the import cell is run again.
vocab = Vocabulary()

# create vocabulary from pretrained word2vec
# Ids 0 and 1 are already taken by <pad>/<unk>, so each new word receives the
# next free id; a word that is already present keeps its existing id.
words_list = list(word_embedding.stoi.keys())
for word in words_list:
    vocab.add(word)

# test the vocabulary: round-trip the sample corpus through ids and back
tensor = vocab.corpus_to_tensor(corpus_sample)
corpus = vocab.tensor_to_corpus(tensor)
" ".join(corpus[0])

## 4.3 CommentDataset Class

In [None]:
from scipy.linalg.special_matrices import dft
from torch.utils.data import Dataset

In [None]:
class CommentDataset(Dataset):
    """Torch dataset of (token-id tensor, label) pairs built from a DataFrame.

    Documents that tokenize to zero tokens are dropped together with their
    label, because an empty sequence cannot be packed for the recurrent layer.
    """

    def __init__(self, vocab, df, tokenized_fpath=None, sentiment2id=None):
        """
        Args:
            vocab: Object providing ``vocab["<pad>"]``, ``tokenize_corpus`` and
                ``corpus_to_tensor``.
            df: Frame with a ``text`` column and a ``label`` column.
            tokenized_fpath: Optional path to a pre-tokenized corpus saved
                with ``torch.save``; when given, tokenization is skipped.
            sentiment2id: Optional label -> class-id mapping. Pass the same
                mapping to every split so the ids stay consistent even when a
                split lacks one of the classes. When omitted, the mapping is
                derived from the sorted labels found in ``df``.
        """
        self.vocab = vocab
        self.pad_idx = vocab["<pad>"]
        self.sentiments_list = list(df.label)
        self.reviews_list = list(df.text)

        if sentiment2id is None:
            sentiments_type = sorted(set(self.sentiments_list))
            sentiment2id = {sentiment: i for i, sentiment in enumerate(sentiments_type)}
        self.sentiment2id = sentiment2id

        if tokenized_fpath:
            # NOTE(security): torch.load unpickles the file, which can execute
            # arbitrary code - only load files you created yourself.
            self.tokenized_reviews = torch.load(tokenized_fpath)
        else:
            self.tokenized_reviews = self.vocab.tokenize_corpus(self.reviews_list)

        tensor_data = self.vocab.corpus_to_tensor(self.tokenized_reviews, is_tokenized=True)
        # Kept as float64 for compatibility with existing callers, which cast
        # the labels to long themselves before computing the loss.
        tensor_label = torch.tensor([self.sentiment2id[sentiment] for sentiment in self.sentiments_list],
                                    dtype=torch.float64)

        # Keep only the non-empty documents (and their labels).
        kept = [(data, label) for data, label in zip(tensor_data, tensor_label) if len(data) > 0]
        self.tensor_data = [data for data, _ in kept]
        if kept:
            self.tensor_label = torch.stack([label for _, label in kept])
        else:
            # Every document was empty: expose an empty dataset instead of
            # failing while unpacking an empty zip().
            self.tensor_label = tensor_label[:0]

    def __len__(self):
        return len(self.tensor_data)

    def __getitem__(self, idx):
        return self.tensor_data[idx], self.tensor_label[idx]

    def collate_fn(self, examples):
        """Batch examples, longest first, padding the reviews with ``pad_idx``.

        Returns:
            ``{"reviews": (padded, lengths), "sentiments": labels}`` where
            ``padded`` has shape ``[max_len, batch]`` (``batch_first=False``).
        """
        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)

        reviews = [e[0] for e in examples]
        reviews = torch.nn.utils.rnn.pad_sequence(reviews,
                                                  batch_first=False,
                                                  padding_value=self.pad_idx)
        reviews_lengths = torch.tensor([len(e[0]) for e in examples])
        sentiments = torch.tensor([e[1] for e in examples])

        return {"reviews": (reviews, reviews_lengths), "sentiments": sentiments}

In [None]:
from sklearn.utils import resample 

In [None]:
# Pair every training sentence with its ORIGINAL label.
# `train_labels` cannot be used here: the oversampling cells above rebind it
# to the resampled labels, which no longer share an index (or a length) with
# `train_sentences`, so building the frame from it would misalign text and
# labels. `train_sentences` still carries the row index of `data`, so the
# labels are looked up from there.
df_train = pd.DataFrame({
    'text': train_sentences,
    'label': data.loc[train_sentences.index, 'label']
})

df_pos = df_train[df_train.label == 2]   # positive
df_neg = df_train[df_train.label == 0]   # negative
df_neu = df_train[df_train.label == 1]   # neutral

# Size of the largest class: the target size for the two upsampled classes.
max_n = df_train.label.value_counts().max()

# Upsample (with replacement) the negative and neutral classes to max_n rows.
df_neg_up = resample(df_neg,
                     replace=True,
                     n_samples=max_n,
                     random_state=42)
df_neu_up = resample(df_neu,
                     replace=True,
                     n_samples=max_n,
                     random_state=42)

# The positive class is kept as is.
# NOTE(review): this assumes positive is the largest class - confirm.
df_pos_up = df_pos

# Concatenate, then shuffle with a fixed seed and renumber the rows 0..n-1.
train_balanced = pd.concat([df_pos_up, df_neg_up, df_neu_up])
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Hold out 20% of the balanced training rows for validation.
# The sampled rows must be removed from the training frame using their
# ORIGINAL index labels, i.e. before reset_index() renumbers them; dropping
# after the reset removes rows 0..m-1 instead and leaves most validation rows
# inside the training set.
# NOTE: train_balanced contains oversampled duplicates, so the same text can
# still appear on both sides of this split.
valid_sample = train_balanced.sample(frac=0.2, random_state=42)
train_df = train_balanced.drop(valid_sample.index).reset_index()
valid_df = valid_sample.reset_index()
test_df = pd.DataFrame({
    'text': test_sentences,
    'label': test_labels
}).reset_index()

In [None]:
valid_df['label'].value_counts()

In [None]:
 # Remove the helper 'index' column added by reset_index().
 # errors='ignore' plus reassignment keeps this cell safe to re-run: the
 # in-place version raised KeyError once the column was already gone.
 valid_df = valid_df.drop(columns=['index'], errors='ignore')
 train_df = train_df.drop(columns=['index'], errors='ignore')
 test_df = test_df.drop(columns=['index'], errors='ignore')

In [None]:
train_dataset = CommentDataset(vocab, train_df)
valid_dataset = CommentDataset(vocab, valid_df)
test_dataset = CommentDataset(vocab, test_df)

## 4.4 Create DataLoader 

In [None]:
from torch.utils.data import DataLoader

In [None]:
batch_size = 32
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=train_dataset.collate_fn)
valid_dataloader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size, collate_fn=valid_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, collate_fn=test_dataset.collate_fn)

## 4.5 RNN Model

In [None]:
import torch.nn as nn

In [None]:
class RNN(nn.Module):
    """Embedding -> (optionally bidirectional, multi-layer) LSTM -> linear classifier.

    The final hidden state of the top LSTM layer (both directions concatenated
    when bidirectional) is passed through dropout and a linear layer that
    yields one logit per class.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers,
                 bidirectional, dropout, pad_idx, n_classes):
        super().__init__()
        n_directions = 2 if bidirectional else 1
        # Dropout between LSTM layers is only applied when there are several layers.
        lstm_dropout = dropout if n_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=lstm_dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * n_directions, n_classes)

    def forward(self, text, text_lengths):
        """Return class logits.

        ``text`` holds token ids laid out ``[seq_len, batch]`` (the LSTM is not
        batch-first); ``text_lengths`` holds the unpadded length of each sequence.
        """
        embedded = self.dropout(self.embedding(text))
        # Lengths must live on the CPU for packing; sequences need not be sorted.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), enforce_sorted=False
        )
        _, (hidden, _) = self.rnn(packed)

        if self.rnn.bidirectional:
            # Last layer: forward state is hidden[-2], backward state is hidden[-1].
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]
        return self.fc(self.dropout(final_state))

In [None]:
input_dim = word_embedding.vectors.shape[0] 
embedding_dim = 100
hidden_dim = 8  
n_layers = 2
bidirectional = False 
dropout = 0.3 
pad_idx = vocab["<pad>"]
unk_idx = vocab["<unk>"]
n_classes = 3  # positive, neutral, negative

model = RNN(input_dim, embedding_dim, hidden_dim, n_layers, bidirectional, dropout, pad_idx, n_classes)

In [None]:
model.embedding.weight.data.copy_(word_embedding.vectors)
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

## 4.6 Train the model

In [None]:
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device) 

model = model.to(device)

In [None]:
def compute_metrics_1(preds, labels):
    """Return the accuracy of ``preds`` measured against the true ``labels``."""
    return accuracy_score(labels, preds)

In [None]:
def compute_metrics_2(preds, labels):
    """Return ``(accuracy, precision, recall, f1)`` for the predictions.

    Precision, recall and F1 use the 'weighted' average; a class without
    predictions counts as 0 instead of triggering a warning.
    """
    accuracy = accuracy_score(labels, preds)
    weighted_scores = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    precision, recall, f1, _support = weighted_scores
    return accuracy, precision, recall, f1

In [None]:
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch.

    Args:
        model: Network called as ``model(reviews, lengths)`` and returning logits.
        dataloader: Iterable of batches shaped like
            ``{"reviews": (reviews, lengths), "sentiments": labels}``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean loss per batch, accuracy over all samples of the epoch)``.
    """
    model.train()
    epoch_loss = 0
    all_preds, all_labels = [], []

    for batch in dataloader:
        optimizer.zero_grad()
        reviews, lengths = batch['reviews']
        reviews, lengths = reviews.to(device), lengths.to(device)
        logits = model(reviews, lengths)
        # reshape(-1), not squeeze(-1): squeezing a batch that holds a single
        # label yields a 0-d tensor, which breaks both the loss and the
        # list-extension below (e.g. on a final batch of size 1).
        labels = batch['sentiments'].long().reshape(-1).to(device)

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

        preds = logits.argmax(dim=1).cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().tolist())

    acc = compute_metrics_1(all_preds, all_labels)
    return epoch_loss / len(dataloader), acc

In [None]:
def evaluate_1(model, dataloader, criterion, device):
    """Evaluate the model without updating it and report loss and accuracy.

    Args:
        model: Network called as ``model(reviews, lengths)`` and returning logits.
        dataloader: Iterable of batches shaped like
            ``{"reviews": (reviews, lengths), "sentiments": labels}``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean loss per batch, accuracy over all samples)``.
    """
    model.eval()
    epoch_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            reviews, lengths = batch['reviews']
            reviews, lengths = reviews.to(device), lengths.to(device)
            logits = model(reviews, lengths)
            # reshape(-1), not squeeze(-1): squeezing a batch that holds a
            # single label yields a 0-d tensor, which breaks both the loss
            # and the list-extension below.
            labels = batch['sentiments'].long().reshape(-1).to(device)

            loss = criterion(logits, labels)
            epoch_loss += loss.item()

            preds = logits.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().tolist())

    acc = compute_metrics_1(all_preds, all_labels)
    return epoch_loss / len(dataloader), acc

In [None]:
def evaluate_2(model, dataloader, criterion, device):
    """Evaluate the model without updating it and report the full metric set.

    Args:
        model: Network called as ``model(reviews, lengths)`` and returning logits.
        dataloader: Iterable of batches shaped like
            ``{"reviews": (reviews, lengths), "sentiments": labels}``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean loss per batch, accuracy, precision, recall, f1)``.
    """
    model.eval()
    epoch_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            reviews, lengths = batch['reviews']
            reviews, lengths = reviews.to(device), lengths.to(device)
            logits = model(reviews, lengths)
            # reshape(-1), not squeeze(-1): squeezing a batch that holds a
            # single label yields a 0-d tensor, which breaks both the loss
            # and the list-extension below.
            labels = batch['sentiments'].long().reshape(-1).to(device)

            loss = criterion(logits, labels)
            epoch_loss += loss.item()

            preds = logits.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().tolist())

    acc, precision, recall, f1 = compute_metrics_2(all_preds, all_labels)
    return epoch_loss / len(dataloader), acc, precision, recall, f1

In [None]:
import time

In [None]:
def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps (in seconds) into
    whole minutes and the remaining whole seconds."""
    total_seconds = end_time - start_time
    minutes = int(total_seconds / 60)
    seconds = int(total_seconds - (minutes * 60))
    return minutes, seconds

In [None]:
# Number of passes over the training data.
n_epochs = 5

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float("inf")

for epoch in range(n_epochs):
    start_time = time.time()

    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate_1(model, valid_dataloader, criterion, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save the weights only when the validation loss improves, so 'model.pt'
    # always holds the best epoch rather than the last one.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"  Train - loss: {train_loss:.3f}| acc: {train_acc:.2f}")
    print(f"  Valid - loss: {valid_loss:.3f}| acc: {valid_acc:.2f}")

## 4.7 Test the model 

In [None]:
# Evaluate the checkpoint with the lowest validation loss rather than whatever
# weights the last epoch left in memory: the training loop saves that
# checkpoint to 'model.pt' but nothing loaded it back before testing.
if os.path.exists('model.pt'):
    model.load_state_dict(torch.load('model.pt', map_location=device))

test_loss, test_acc, test_prec, test_rec, test_f1 = evaluate_2(model, test_dataloader, criterion, device)

print(f"Test - loss: {test_loss:.3f}| acc: {test_acc:.2f}| prec: {test_prec:.2f}| rec: {test_rec:.2f}| f1: {test_f1:.2f}")

In [None]:
import torch.nn.functional as F 

In [None]:
def predict_sentiment(model, sentence, vocab, device, label_mapping=None):
    """Classify a single sentence.

    Args:
        model: Network called as ``model(token_ids, lengths)`` and returning logits.
        sentence: Text to classify (already cleaned by the caller).
        vocab: Object providing ``corpus_to_tensor`` and ``vocab["<unk>"]``.
        device: Device on which inference is run.
        label_mapping: Optional class-index -> label-name mapping; without it
            the index is returned as a string.

    Returns:
        ``(predicted_label, probabilities)``, where ``probabilities`` is the
        softmax distribution over the classes as a list of floats.
    """
    model.eval()

    # Convert sentence to tensor of token indices
    tensor = vocab.corpus_to_tensor([sentence])[0]               # [seq_len]
    if tensor.numel() == 0:
        # A sentence that tokenizes to nothing (e.g. only punctuation was
        # typed) would make sequence packing fail on a zero length; score a
        # single <unk> token instead.
        tensor = torch.tensor([vocab["<unk>"]], dtype=torch.int64)
    tensor = tensor.to(device).unsqueeze(1)                      # [seq_len, 1]
    length_tensor = torch.LongTensor([tensor.size(0)]).to(device)

    # Forward pass
    with torch.no_grad():
        logits = model(tensor, length_tensor).squeeze(0)         # [n_classes]
        probs = F.softmax(logits, dim=-1)                        # [n_classes]

    # Predicted class index and optional label name
    pred_idx = probs.argmax().item()
    pred_label = label_mapping[pred_idx] if label_mapping is not None else str(pred_idx)

    # Return the label and the full probability distribution
    return pred_label, probs.cpu().tolist()

In [None]:
label_map = {0: 'tiêu cực', 1: 'bình thường', 2: 'tích cực'}

In [None]:
# Interactive demo: read sentences until the user types "thoát", then report
# either one sentiment per detected aspect or one for the whole sentence.
print("Chạy BiLSTM (gõ 'thoát' để dừng).")
while True:
    input_text = input("Nhập câu cần kiểm tra: ").strip()
    if input_text.lower() == "thoát":
        print("Chúc một ngày tốt lành!")
        break

    tokenized, aspects = extract_aspects(clean_text(input_text), seed_aspects, vocab)

    if aspects:
        print(f"Tìm thấy {len(aspects)} aspect:")
        for asp_key, asp_phrase, s, e in aspects:
            # Classify a small window of tokens around the aspect phrase.
            context = get_context_string(tokenized, s, e, window=3)
            sent, probs = predict_sentiment(
                model=model,
                sentence=context,
                vocab=vocab,
                device=device,
                label_mapping=label_map,
            )
            print(f" - Aspect '{asp_key}' (\"{asp_phrase}\") → {sent} (context: {context})")
        print()
    else:
        # No aspect keyword found: classify the whole sentence instead.
        sent, probs = predict_sentiment(
            model=model,
            sentence=clean_text(input_text),
            vocab=vocab,
            device=device,
            label_mapping=label_map,
        )
        print(f"Dự đoán cảm xúc toàn câu: {sent}\n")