# 1. Preparing the Dataset

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/vietnamese-comment/comments.csv
/kaggle/input/vietnamese-comment/vi_word2vec.txt
/kaggle/input/vietnamese-comment/extra.csv


In [3]:
import pandas as pd
import re 

In [4]:
data_path_1 = '/kaggle/input/vietnamese-comment/comments.csv'
data_path_2 = '/kaggle/input/vietnamese-comment/extra.csv' 

In [5]:
data_1 = pd.read_csv(data_path_1)
data_1.head()

Unnamed: 0,title,content,rating
0,C·ª±c k√¨ h√†i l√≤ng,ƒê∆∞·ª£c h·∫πn giao h√†ng th·ª© 4 m√† th·ª© 2 ƒë√£ c√≥ s√°ch t...,5
1,R·∫•t kh√¥ng h√†i l√≤ng,H√†ng gi·∫£. Gi·∫•y s√°ch qu√° t·ªá. Ch·ªØ in ko r√µ r√†ng...,1
2,C·ª±c k√¨ h√†i l√≤ng,"S√°ch ƒë·∫πp, ch·∫•t l∆∞·ª£ng gi·∫•y tuy·ªát v·ªùi, kh·ªï to, n...",5
3,C·ª±c k√¨ h√†i l√≤ng,"Giao h√†ng nhanh, ƒë√≥ng g√≥i c·∫©n th·∫≠n. C√≥ b√≥ng k√≠...",5
4,C·ª±c k√¨ h√†i l√≤ng,Giao nhanh v√† ƒë√≥ng g√≥i c·∫©n th·∫≠n. Thank shop,5


In [None]:
len(data_1) 

In [None]:
data_2 = pd.read_csv(data_path_2)
data_2.head()

In [None]:
len(data_2) 

In [6]:
# data_1 = data_1.drop(columns=['title'])
# data = pd.concat([data_1, data_2])

data = data_1

In [None]:
data.head(10) 

In [None]:
len(data) 

In [None]:
missing_values = data.isnull().sum()
print("D·ªØ li·ªáu b·ªã thi·∫øu:\n", missing_values)

In [None]:
duplicates = data.duplicated(subset=["content"]).sum()
print("S·ªë c√¢u tr√πng l·∫∑p:", duplicates)

# 2. Data Preprocessing

## 2.1 Data Cleaning

In [7]:
# data = data.dropna(subset=["content"])           # b·ªè comment r·ªóng
data = data.dropna(subset=['title', 'content']) 

In [None]:
len(data)

In [8]:
import unicodedata

In [9]:
# Dictionary for common Vietnamese slang/abbreviations
abbreviations = {
    "ko": "kh√¥ng",
    "sp": "s·∫£n ph·∫©m",
    "k": "kh√¥ng",
    "m": "m√¨nh",
    "ƒëc": "ƒë∆∞·ª£c",
    "dc": "ƒë∆∞·ª£c",
    "h": "gi·ªù",
    "trloi": "tr·∫£ l·ªùi",
    "cg": "c≈©ng",
    "bt": "b√¨nh th∆∞·ªùng",
    "dt": "ƒëi·ªán tho·∫°i",
    "mt": "m√°y t√≠nh",
    "m.n": "m·ªçi ng∆∞·ªùi"
    # add more slang mappings
}

# Regex patterns
url_pattern = r"http\S+|www\S+"  # URLs
user_pattern = r"@\w+"  # usernames
emoji_pattern = re.compile(
    "["  # start
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+", flags=re.UNICODE)
emoticon_pattern = r"[:;=8][\-o\*']?[\)\]\(\[dDpP/:}\{@\|\\]"  # emoticons
repeat_pattern = re.compile(r"(.)\1{2,}")  # 3 or more repeats

In [10]:
def clean_text(text: str) -> str:
    # Unicode normalization
    text = str(text)
    text = unicodedata.normalize('NFC', text)  #¬†Chu·∫©n ho√° Unicode r√µ r√†ng¬†(cƒÉn b·∫£n)

    # Lowercase
    text = text.lower()

    # Remove URLs and usernames
    text = re.sub(url_pattern, '', text)
    text = re.sub(user_pattern, '', text)

    # Remove emojis and emoticons
    text = emoji_pattern.sub(' ', text)
    text = re.sub(emoticon_pattern, ' ', text)

    # Expand common abbreviations
    def expand(match):
        word = match.group(0)
        return abbreviations.get(word, word)

    if abbreviations:
        pattern = re.compile(r"\b(" + "|".join(map(re.escape, abbreviations.keys())) + r")\b")
        text = pattern.sub(expand, text)

    # Remove repeated characters (e.g., "quaaa" -> "qua" )
    text = repeat_pattern.sub(r"\1", text)

    # Remove punctuation (keep Vietnamese letters & numbers)
    text = re.sub(r"[^\w\s\u00C0-\u024F]", ' ', text)

    # Remove extra whitespace
    text = re.sub(r"\s+", ' ', text).strip()

    return text

In [None]:
sample = "Tui thik q√°√°√°!!! üòäüòä, thanks @ban http://example.com"
print(clean_text(sample))  # Expected: "tui th√≠ch qua c·∫£m ∆°n"

In [11]:
data["text"] = data["content"].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text"] = data["content"].apply(clean_text)


In [None]:
data.head()

In [12]:
data = data.groupby('text', as_index=False)['rating'].mean()  

In [13]:
data['rating'] = np.floor(data['rating']).astype(int) 

In [14]:
duplicates = data.duplicated(subset=["text"]).sum()
print("S·ªë c√¢u tr√πng l·∫∑p sau x·ª≠ l√Ω:", duplicates) 

S·ªë c√¢u tr√πng l·∫∑p sau x·ª≠ l√Ω: 0


In [15]:
def sentiment(r):
    if r >= 4: return "t√≠ch c·ª±c"
    if r == 3: return "b√¨nh th∆∞·ªùng"
    return "ti√™u c·ª±c"
def label(r):
    if r >= 4: return 2
    if r == 3: return 1
    return 0
data["sentiment"] = data["rating"].apply(sentiment)
data["label"] = data["rating"].apply(label)

In [None]:
data.head()

## 2.2 Statistical Analysis

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
print("S·ªë l∆∞·ª£ng m·∫´u:", data.shape[0])

In [None]:
plt.figure(figsize=(6, 4))
sns.countplot(x=data["sentiment"], palette="coolwarm")
plt.title("Ph√¢n ph·ªëi nh√£n c·∫£m x√∫c")
plt.xlabel("C·∫£m x√∫c")
plt.ylabel("S·ªë l∆∞·ª£ng m·∫´u")
plt.show()

In [None]:
missing_values = data.isnull().sum()
print("D·ªØ li·ªáu b·ªã thi·∫øu:\n", missing_values)

In [None]:
label_counts = data["sentiment"].value_counts()
print("S·ªë l∆∞·ª£ng m·ªói nh√£n:\n", label_counts)

In [17]:
!pip install underthesea 



In [18]:
from underthesea import word_tokenize
import nltk
import wordcloud

In [19]:
data['corpus'] = data['text'].map(lambda text: word_tokenize(text, format="text")) 

In [None]:
data.sample(10) 

In [None]:
# Create bag of words
# Flatten the list of lists into a single list of words
all_words_flat = []
for tokens in data['corpus'].tolist():
    if tokens and tokens != '':
        all_words_flat.extend(tokens.split())

# Create FreqDist from the flattened list
all_words_dist = nltk.FreqDist(all_words_flat)

In [None]:
# Print the total number of words and the 15 most common words
print('T·ªïng s·ªë t·ª´: {}'.format(len(all_words_dist)))
print('T·ª´ xu·∫•t hi·ªán nhi·ªÅu: {}'.format(all_words_dist.most_common(15)))

In [None]:
corpus = " ".join(all_words_flat)
plt.figure(figsize=(12,8))
word_cloud = wordcloud.WordCloud(max_words=100, background_color ="black", width=2000, height=1000, mode="RGB").generate(corpus)
plt.axis("off")
plt.imshow(word_cloud)

## 2.3 Oversampling

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    stratify=data['label'],
    random_state=42
)

In [22]:
from sklearn.utils import resample

In [23]:
df_train = pd.DataFrame({
    'text': train_sentences,
    'label': train_labels
})

df_pos = df_train[df_train.label == 2]   # positive
df_neg = df_train[df_train.label == 0]   # negative
df_neu = df_train[df_train.label == 1]   # neutral

max_n = df_train.label.value_counts().max()

df_neg_up = resample(df_neg,
                     replace=True,
                     n_samples=max_n,
                     random_state=42)
df_neu_up = resample(df_neu,
                     replace=True,
                     n_samples=max_n,
                     random_state=42)

df_pos_up = df_pos

train_balanced = pd.concat([df_pos_up, df_neg_up, df_neu_up])
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

train_sentences = train_balanced['text']
train_labels    = train_balanced['label']

In [24]:
print(train_balanced['label'].value_counts())

label
0    27976
2    27976
1    27976
Name: count, dtype: int64


# 3. ML Model

## 3.1 Multinomial Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
vectorizer = TfidfVectorizer()

train_sentences_tfidf = vectorizer.fit_transform(train_sentences)
test_sentences_tfidf = vectorizer.transform(test_sentences)

In [None]:
model = MultinomialNB()
model.fit(train_sentences_tfidf, train_labels)

In [None]:
pred = model.predict(test_sentences_tfidf)

In [None]:
print("Accuracy:", accuracy_score(test_labels, pred))
print("Classification Report:")
print(classification_report(test_labels, pred))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, pred))

In [None]:
label2sen = {
    0: "Ti√™u c·ª±c",
    1: "B√¨nh th∆∞·ªùng",
    2: "T√≠ch c·ª±c"
}
while True:
  input_text = input("Nh·∫≠p c√¢u c·∫ßn ki·ªÉm tra: ")
  if input_text.strip().lower() != "tho√°t":
    input_text_tfidf = vectorizer.transform([clean_text(input_text)])
    prediction = model.predict(input_text_tfidf)
    print("K·∫øt qu·∫£ d·ª± ƒëo√°n: " + label2sen[prediction[0]] + "\n")
  else:
    print("Ch√∫c m·ªôt ng√†y t·ªët l√†nh !")
    break

# 4. DL Model

## 4.1 Word Embedding

In [25]:
!pip install torch==2.2.0 



In [26]:
!pip install torchtext==0.17.0 



In [27]:
import torch 
import torchtext.vocab as vocab

In [28]:
# input_path = '/kaggle/input/vietnamese-comment/vi_word2vec.txt'
# output_path = '/kaggle/working/vi_word2vec_reduced.txt' 
# max_lines = 100000  # S·ªë d√≤ng b·∫°n mu·ªën gi·ªØ l·∫°i 

In [29]:
# with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
#     for i, line in enumerate(infile):
#         if i > max_lines:
#             break
#         outfile.write(line) 

In [30]:
word_embedding = vocab.Vectors(name = '/kaggle/working/vi_word2vec_reduced.txt', unk_init = torch.Tensor.normal_)
word_embedding.vectors.shape

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100000/100000 [00:04<00:00, 23040.19it/s]


torch.Size([100000, 100])

In [31]:
def get_vector(embeddings, word):
    assert word in embeddings.stoi, f'*{word}* is not in the vocab!'
    return embeddings.vectors[embeddings.stoi[word]]

def closest_words(embeddings, vector, n=10):
    distances = [(word, torch.dist(vector, get_vector(embeddings, word)).item())
                 for word in embeddings.itos]

    return sorted(distances, key = lambda w: w[1])[:n]

In [32]:
word_vector = get_vector(word_embedding, "L·∫°c_Long_Qu√¢n")

closest_words(word_embedding, word_vector, n=20)

[('L·∫°c_Long_Qu√¢n', 0.0),
 ('√Çu_C∆°', 0.44687357544898987),
 ('An_D∆∞∆°ng_V∆∞∆°ng', 0.5421355962753296),
 ('Ho√†ng_Hoa_Th√°m', 0.5765705704689026),
 ('Thu·ªµ_Khu√™', 0.6254850029945374),
 ('L√™_ƒê·∫°i_H√†nh', 0.6597759127616882),
 ('Nguy·ªÖn_Ho√†ng_T√¥n', 0.6894202828407288),
 ('L√Ω_Th√°i_T·ªï', 0.690030574798584),
 ('Xu√¢n_La', 0.6982914805412292),
 ('Nguy·ªÖn_Tr√£i', 0.7008039355278015),
 ('Kinh_D∆∞∆°ng_V∆∞∆°ng', 0.7009497880935669),
 ('Ho√†ng_VƒÉn_Th·ª•', 0.7065233588218689),
 ('Tr∆∞∆°ng_ƒê·ªãnh', 0.71026211977005),
 ('Nghi_T√†m', 0.711430013179779),
 ('T√¢y_H·ªì', 0.714965283870697),
 ('Li·ªÖu_Giai', 0.7252614498138428),
 ('Thu·ªµ_Kh√™', 0.7305408716201782),
 ('Nguy·ªÖn_VƒÉn_C·ª´', 0.7348453402519226),
 ('L√Ω_Th∆∞·ªùng_Ki·ªát', 0.7370755672454834),
 ('Linh_Lang', 0.7411515712738037)]

## 4.2 Vocabulary Class

In [33]:
from tqdm import tqdm

In [34]:
class Vocabulary:
    def __init__(self):
        self.word2id = dict()
        self.word2id['<pad>'] = 0   # Pad Token
        self.word2id['<unk>'] = 1   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def id2word(self, word_index):
        return self.id2word[word_index]

    def add(self, word):
        if word not in self:
            word_index = self.word2id[word] = len(self.word2id)
            self.id2word[word_index] = word
            return word_index
        else:
            return self[word]

    @staticmethod
    def tokenize_corpus(corpus):
        print("Tokenize the corpus...")
        tokenized_corpus = list()
        for document in tqdm(corpus):
            tokenized_document = [word.replace(" ", "_") for word in word_tokenize(document)]
            tokenized_corpus.append(tokenized_document)

        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        if is_tokenized:
            tokenized_corpus = corpus
        else:
            tokenized_corpus = self.tokenize_corpus(corpus)
        indicies_corpus = list()
        for document in tqdm(tokenized_corpus):
            indicies_document = torch.tensor(list(map(lambda word: self[word], document)),
                                             dtype=torch.int64)
            indicies_corpus.append(indicies_document)

        return indicies_corpus

    def tensor_to_corpus(self, tensor):
        corpus = list()
        for indicies in tqdm(tensor):
            document = list(map(lambda index: self.id2word[index.item()], indicies))
            corpus.append(document)

        return corpus

    # def add_words_from_corpus(self, corpus, is_tokenized=False):
    #     print("Add words from the corpus...")
    #     if is_tokenized:
    #         tokenized_corpus = corpus
    #     else:
    #         tokenized_corpus = self.tokenize_corpus(corpus)
    #     word_freq = Counter(chain(*tokenized_corpus))
    #     non_singletons = [w for w in word_freq if word_freq[w] > 1]
    #     print(f"Number of words in the corpus: {len(word_freq)}")
    #     print(f"Number of words with frequency > 1: {len(non_singletons)}")
    #     for word in non_singletons:
    #         self.add(word)

In [35]:
corpus_sample = ["ƒê·∫πp l·∫Øm mn ∆°i k ng·ªù fahasa b√°n alb th∆° n√†y c·ªßa Lana lun, bh kh√≥ mua l·∫Øm",
                 "Shop giao h√†ng nhanh, ƒë√≥ng g√≥i h√†ng c·∫©n th·∫≠n. M·∫∑c d√π s√°ch c√≥ b√© h∆°n m√¨nh nghƒ© nh∆∞ng shop r√¢ÃÅt chu ƒëaÃÅo. V√¨ m√¨nh mua g·∫ßn t·∫øt n√™n c√≥ ƒëc t·∫∑ng th√™m c·∫£ l√¨ x√¨ n·ªØa. R·∫•t ƒë√°ng ti·ªÅn. Mn mua ·ªßng h·ªô shop nheÃÅ.",
                 "l·∫ßn ƒë·∫ßu mua nh∆∞ng ok l·∫Øm lu√¥n s·∫Øp t·∫øt n√™n ƒëc t·∫∑ng t·∫≠p l√¨ x√¨ s√°ch nh·ªè nh∆∞ng b·ªçc h·ªôp ƒë·∫ßy ƒë·ªß ƒë·∫∑c bi·ªát t·∫∑ng c·∫£ voucher cho l·∫ßn sau ch·ªâ c√≥ c√°i s√°ch ƒë∆∞·ª£c b·ªçc b·∫±ng m√†ng th·ª±c ph·∫©m"]

Vocabulary.tokenize_corpus(corpus_sample)

Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 382.40it/s]


[['ƒê·∫πp',
  'l·∫Øm',
  'mn',
  '∆°i',
  'k',
  'ng·ªù',
  'fahasa',
  'b√°n',
  'alb_th∆°',
  'n√†y',
  'c·ªßa',
  'Lana_lun',
  ',',
  'bh',
  'kh√≥',
  'mua',
  'l·∫Øm'],
 ['Shop',
  'giao',
  'h√†ng',
  'nhanh',
  ',',
  'ƒë√≥ng_g√≥i',
  'h√†ng',
  'c·∫©n_th·∫≠n',
  '.',
  'M·∫∑c_d√π',
  's√°ch',
  'c√≥',
  'b√©',
  'h∆°n',
  'm√¨nh',
  'nghƒ©',
  'nh∆∞ng',
  'shop',
  'r·∫•t',
  'chu_ƒë√°o',
  '.',
  'V√¨',
  'm√¨nh',
  'mua',
  'g·∫ßn',
  't·∫øt',
  'n√™n',
  'c√≥',
  'ƒëc',
  't·∫∑ng',
  'th√™m',
  'c·∫£',
  'l√¨_x√¨',
  'n·ªØa',
  '.',
  'R·∫•t',
  'ƒë√°ng',
  'ti·ªÅn',
  '.',
  'Mn',
  'mua',
  '·ªßng_h·ªô',
  'shop',
  'nh√©',
  '.'],
 ['l·∫ßn',
  'ƒë·∫ßu',
  'mua',
  'nh∆∞ng',
  'ok',
  'l·∫Øm',
  'lu√¥n',
  's·∫Øp',
  't·∫øt',
  'n√™n',
  'ƒëc',
  't·∫∑ng',
  't·∫≠p_l√¨',
  'x√¨',
  's√°ch',
  'nh·ªè',
  'nh∆∞ng',
  'b·ªçc_h·ªôp',
  'ƒë·∫ßy_ƒë·ªß',
  'ƒë·∫∑c_bi·ªát',
  't·∫∑ng',
  'c·∫£',
  'voucher',
  'cho',
  'l·∫ßn',
  'sau',
  'ch·ªâ',
  'c√≥',
  'c√°i',
  's√°ch',
  

In [36]:
vocab = Vocabulary()

# create vocabulary from pretrained word2vec
words_list = list(word_embedding.stoi.keys())
for word in words_list:
    vocab.add(word)

# test the vocabulary
tensor = vocab.corpus_to_tensor(corpus_sample)
corpus = vocab.tensor_to_corpus(tensor)
" ".join(corpus[0])

Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 500.31it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 12658.87it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 3850.34it/s]


'ƒê·∫πp l·∫Øm <unk> ∆°i k ng·ªù <unk> b√°n <unk> n√†y c·ªßa <unk> , <unk> kh√≥ mua l·∫Øm'

## 4.3 CommentDataset Class

In [37]:
from scipy.linalg.special_matrices import dft
from torch.utils.data import Dataset

  from scipy.linalg.special_matrices import dft


In [38]:
class CommentDataset(Dataset):

    def __init__(self, vocab, df, tokenized_fpath=None):
        self.vocab = vocab
        self.pad_idx = vocab["<pad>"]
        df = df
        self.sentiments_list = list(df.label)
        self.reviews_list = list(df.text)

        sentiments_type = list(set(self.sentiments_list))
        sentiments_type.sort()

        self.sentiment2id = {sentiment: i for i, sentiment in enumerate(sentiments_type)}

        if tokenized_fpath:
            self.tokenized_reviews = torch.load(tokenized_fpath)
        else:
            self.tokenized_reviews = self.vocab.tokenize_corpus(self.reviews_list)

        self.tensor_data = self.vocab.corpus_to_tensor(self.tokenized_reviews, is_tokenized=True)
        self.tensor_label = torch.tensor([self.sentiment2id[sentiment] for sentiment in self.sentiments_list],
                                         dtype=torch.float64)

        self.tensor_data, self.tensor_label = zip(*[(data, label) for data, label in zip(self.tensor_data, self.tensor_label) if len(data) > 0])
        self.tensor_data = list(self.tensor_data)
        self.tensor_label = torch.tensor(self.tensor_label, dtype=torch.float64) # Convert back to tensor

    def __len__(self):
        return len(self.tensor_data)

    def __getitem__(self, idx):
        return self.tensor_data[idx], self.tensor_label[idx]

    def collate_fn(self, examples):
        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)

        reviews = [e[0] for e in examples]
        reviews = torch.nn.utils.rnn.pad_sequence(reviews,
                                                  batch_first=False,
                                                  padding_value=self.pad_idx)
        reviews_lengths = torch.tensor([len(e[0]) for e in examples])
        sentiments = torch.tensor([e[1] for e in examples])

        return {"reviews": (reviews, reviews_lengths), "sentiments": sentiments}

In [39]:
valid_df = train_balanced.sample(frac=0.2, random_state=42).reset_index()
train_df = train_balanced.drop(valid_df.index).reset_index()
test_df = pd.DataFrame({
    'text': test_sentences,
    'label': test_labels
}).reset_index()

In [40]:
valid_df['label'].value_counts()

label
0    5644
2    5588
1    5554
Name: count, dtype: int64

In [41]:
 valid_df.drop(columns=['index'], inplace=True)
 train_df.drop(columns=['index'], inplace=True)
 test_df.drop(columns=['index'], inplace=True)

In [42]:
train_dataset = CommentDataset(vocab, train_df)
valid_dataset = CommentDataset(vocab, valid_df)
test_dataset = CommentDataset(vocab, test_df)

Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 67142/67142 [01:52<00:00, 596.02it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 67142/67142 [00:01<00:00, 52349.41it/s]


Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16786/16786 [00:31<00:00, 535.20it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16786/16786 [00:00<00:00, 69039.77it/s]


Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8756/8756 [00:16<00:00, 527.51it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8756/8756 [00:00<00:00, 63752.27it/s]


## 4.4 Create DataLoader from IMDBDataset

In [43]:
from torch.utils.data import DataLoader

In [44]:
batch_size = 32
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=train_dataset.collate_fn)
valid_dataloader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size, collate_fn=valid_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, collate_fn=test_dataset.collate_fn)

## 4.5 RNN Model

In [45]:
import torch.nn as nn

In [46]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers,
                 bidirectional, dropout, pad_idx, n_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), n_classes)

    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), enforce_sorted=False
        )
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))
        else:
            hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

In [47]:
input_dim = word_embedding.vectors.shape[0] 
embedding_dim = 100
hidden_dim = 8  
n_layers = 2
bidirectional = False 
dropout = 0.3 
pad_idx = vocab["<pad>"]
unk_idx = vocab["<unk>"]
n_classes = 3  # positive, neutral, negative

model = RNN(input_dim, embedding_dim, hidden_dim, n_layers, bidirectional, dropout, pad_idx, n_classes)

In [48]:
model.embedding.weight.data.copy_(word_embedding.vectors)
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

In [49]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,004,123 trainable parameters


## 4.6 Train the model

In [50]:
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [51]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device) 

model = model.to(device)

In [52]:
def compute_metrics_1(preds, labels):
    acc = accuracy_score(labels, preds)
    return acc 

In [53]:
def compute_metrics_2(preds, labels):
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    return acc, precision, recall, f1

In [54]:
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    all_preds, all_labels = [], []

    for batch in dataloader:
        optimizer.zero_grad()
        reviews, lengths = batch['reviews']
        reviews, lengths = reviews.to(device), lengths.to(device)
        logits = model(reviews, lengths)
        labels = batch['sentiments'].long().squeeze(-1).to(device)

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

        preds = logits.argmax(dim=1).cpu().tolist()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().tolist())

    acc = compute_metrics_1(all_preds, all_labels)
    return epoch_loss / len(dataloader), acc 

In [55]:
def evaluate_1(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            reviews, lengths = batch['reviews']
            reviews, lengths = reviews.to(device), lengths.to(device)
            logits = model(reviews, lengths)
            labels = batch['sentiments'].long().squeeze(-1).to(device)

            loss = criterion(logits, labels)
            epoch_loss += loss.item()

            preds = logits.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().tolist())

    acc = compute_metrics_1(all_preds, all_labels)
    return epoch_loss / len(dataloader), acc

In [56]:
def evaluate_2(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            reviews, lengths = batch['reviews']
            reviews, lengths = reviews.to(device), lengths.to(device)
            logits = model(reviews, lengths)
            labels = batch['sentiments'].long().squeeze(-1).to(device)

            loss = criterion(logits, labels)
            epoch_loss += loss.item()

            preds = logits.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().tolist())

    acc, precision, recall, f1 = compute_metrics_2(all_preds, all_labels)
    return epoch_loss / len(dataloader), acc, precision, recall, f1

In [57]:
import time

In [58]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [59]:
n_epochs = 5

best_valid_loss = float("inf")

for epoch in range(n_epochs):
    start_time = time.time()

    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate_1(model, valid_dataloader, criterion, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"  Train - loss: {train_loss:.3f}| acc: {train_acc:.2f}")
    print(f"  Valid - loss: {valid_loss:.3f}| acc: {valid_acc:.2f}")

Epoch: 01 | Time: 0m 31s
  Train - loss: 0.711| acc: 0.71
  Valid - loss: 0.443| acc: 0.84
Epoch: 02 | Time: 0m 31s
  Train - loss: 0.447| acc: 0.85
  Valid - loss: 0.299| acc: 0.90
Epoch: 03 | Time: 0m 31s
  Train - loss: 0.336| acc: 0.90
  Valid - loss: 0.234| acc: 0.93
Epoch: 04 | Time: 0m 31s
  Train - loss: 0.279| acc: 0.92
  Valid - loss: 0.181| acc: 0.95
Epoch: 05 | Time: 0m 31s
  Train - loss: 0.239| acc: 0.93
  Valid - loss: 0.153| acc: 0.96


## 4.7 Test the model 

In [60]:
test_loss, test_acc, test_prec, test_rec, test_f1 = evaluate_2(model, test_dataloader, criterion, device)

print(f"Test - loss: {test_loss:.3f}| acc: {test_acc:.2f}| prec: {test_prec:.2f}| rec: {test_rec:.2f}| f1: {test_f1:.2f}")

Test - loss: 0.655| acc: 0.83| prec: 0.85| rec: 0.83| f1: 0.84


In [61]:
import torch.nn.functional as F 

In [62]:
def predict_sentiment(model, sentence, vocab, device, label_mapping=None):
    model.eval()

    # Convert sentence to tensor of token indices
    corpus = [sentence]
    tensor = vocab.corpus_to_tensor(corpus)[0].to(device)        # [seq_len]
    tensor = tensor.unsqueeze(1)                                 # [seq_len, 1]
    length_tensor = torch.LongTensor([tensor.size(0)]).to(device)

    # Forward pass
    with torch.no_grad():
        logits = model(tensor, length_tensor).squeeze(0)         # [n_classes]
        probs = F.softmax(logits, dim=-1)                       # [n_classes]

    # Predicted class index and optional label name
    pred_idx = probs.argmax().item()
    pred_label = label_mapping[pred_idx] if label_mapping is not None else str(pred_idx)

    # Return index, label, and full probability distribution
    return pred_label, probs.cpu().tolist()

In [63]:
label_map = {0: 'ti√™u c·ª±c', 1: 'b√¨nh th∆∞·ªùng', 2: 't√≠ch c·ª±c'}

while True:
  input_text = input("Nh·∫≠p c√¢u c·∫ßn ki·ªÉm tra: ")
  if input_text.strip().lower() != "tho√°t":
    label, probs = predict_sentiment(
        model=model,
        sentence=clean_text(input_text),
        vocab=vocab,
        device=device,
        label_mapping=label_map
    )
    print(f"\nD·ª± ƒëo√°n c·∫£m x√∫c: {label}")
  else:
    print("Ch√∫c m·ªôt ng√†y t·ªët l√†nh!")
    break

Nh·∫≠p c√¢u c·∫ßn ki·ªÉm tra:  ·ªïn


Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 3504.01it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 6223.00it/s]



D·ª± ƒëo√°n c·∫£m x√∫c: t√≠ch c·ª±c


Nh·∫≠p c√¢u c·∫ßn ki·ªÉm tra:  ok


Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 3809.54it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 7463.17it/s]



D·ª± ƒëo√°n c·∫£m x√∫c: t√≠ch c·ª±c


Nh·∫≠p c√¢u c·∫ßn ki·ªÉm tra:  c≈©ng ƒë∆∞·ª£c


Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 2562.19it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 6584.46it/s]



D·ª± ƒëo√°n c·∫£m x√∫c: t√≠ch c·ª±c


Nh·∫≠p c√¢u c·∫ßn ki·ªÉm tra:  end


Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 3615.78it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 5932.54it/s]



D·ª± ƒëo√°n c·∫£m x√∫c: t√≠ch c·ª±c


Nh·∫≠p c√¢u c·∫ßn ki·ªÉm tra:  k·∫øt th√∫c


Tokenize the corpus...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 2970.47it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 6864.65it/s]



D·ª± ƒëo√°n c·∫£m x√∫c: t√≠ch c·ª±c


Nh·∫≠p c√¢u c·∫ßn ki·ªÉm tra:  tho√°t


Ch√∫c m·ªôt ng√†y t·ªët l√†nh!
