In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

# Xử lý dữ liệu

In [3]:
df_train = pd.read_csv('/content/drive/MyDrive/BERT-SentimentAnalysis/data/twitter_training.csv')
df_val = pd.read_csv('/content/drive/MyDrive/BERT-SentimentAnalysis/data/twitter_validation.csv')

# Kết hợp 2 data
df = pd.concat([df_train, df_val], ignore_index=False)
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,",3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,2401.0,Borderlands,Positive,I am coming to the borders and I will kill you...,,,,
1,2401.0,Borderlands,Positive,im getting on borderlands and i will kill you ...,,,,
2,2401.0,Borderlands,Positive,im coming on borderlands and i will murder you...,,,,
3,2401.0,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,,,,
4,2401.0,Borderlands,Positive,im getting into borderlands and i can murder y...,,,,
...,...,...,...,...,...,...,...,...
994,,,,,4891.0,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,,,,,4359.0,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,,,,,2652.0,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,,,,,8069.0,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [4]:
# Chỉ giữ lại các cột cần
columns_to_drop = ['2401', 'Borderlands', '3364', 'Facebook', 'Irrelevant',
                   'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣']
df = df.drop(columns_to_drop, axis=1)
df = df.rename(columns={'im getting on borderlands and i will murder you all ,': 'Tweet', 'Positive': 'Sentiment'})
df

Unnamed: 0,Sentiment,Tweet
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
994,,
995,,
996,,
997,,


In [5]:
# Xóa các hàng trống
df.dropna(inplace=True)
df.isnull().sum()

Unnamed: 0,0
Sentiment,0
Tweet,0


In [6]:
# Xóa các hàng trùng lặp
df = df.drop_duplicates()
df.duplicated().sum()

0

In [7]:
df

Unnamed: 0,Sentiment,Tweet
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


# Huấn luyện mô hình

In [8]:
# Chuẩn bị dữ liệu
X = df['Tweet']
Y = df['Sentiment']

In [None]:
X

Unnamed: 0,Tweet
0,I am coming to the borders and I will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...
...,...
74676,Just realized that the Windows partition of my...
74677,Just realized that my Mac window partition is ...
74678,Just realized the windows partition of my Mac ...
74679,Just realized between the windows partition of...


In [None]:
Y

Unnamed: 0,Sentiment
0,Positive
1,Positive
2,Positive
3,Positive
4,Positive
...,...
74676,Positive
74677,Positive
74678,Positive
74679,Positive


In [None]:
# Ánh xạ nhãn thành số
label_dict = {'Positive': 0, 'Negative': 1, 'Neutral': 2, 'Irrelevant': 3}
labels = np.array([label_dict[label] for label in Y])
labels

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Chia tập dữ liệu
x_train, x_val, y_train, y_val = train_test_split(X, labels, test_size=0.1, random_state=42)

In [None]:
# Khởi tạo tokenizer và mô hình
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize và chuẩn bị dữ liệu
def tokenize_and_encode(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

train_encodings = tokenize_and_encode(x_train)
val_encodings = tokenize_and_encode(x_val)

In [None]:
train_encodings

{'input_ids': tensor([[  101,  1030, 14826,  ...,     0,     0,     0],
        [  101,  1030,  8879,  ...,     0,     0,     0],
        [  101,  2069,  1037,  ...,     0,     0,     0],
        ...,
        [  101,  1045, 16755,  ...,     0,     0,     0],
        [  101,   100,  6402,  ...,     0,     0,     0],
        [  101,  2026, 16437,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
val_encodings

{'input_ids': tensor([[  101,  2064,  2062,  ...,     0,     0,     0],
        [  101,  1030, 23564,  ...,     0,     0,     0],
        [  101,  2524,  4061,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  2123,  ...,     0,     0,     0],
        [  101,  1045,  1521,  ...,     0,     0,     0],
        [  101,  1030,  2377,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# Tạo DataLoader
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(y_val))

batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [None]:
# Huấn luyện mô hình
optimizer = AdamW(model.parameters(), lr=2e-5)

# Chuyển mô hình sang GPU nếu có
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 3

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print('-' * 40)

    # Training
    model.train()
    train_loss = 0
    train_steps = 0
    train_preds = []
    train_true = []

    for batch in tqdm(train_dataloader, desc="Training"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        logits = outputs.logits

        train_loss += loss.item()
        loss.backward()
        optimizer.step()

        train_steps += 1
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        train_preds.extend(preds)
        train_true.extend(inputs['labels'].detach().cpu().numpy())

    avg_train_loss = train_loss / train_steps
    train_accuracy = (np.array(train_preds) == np.array(train_true)).mean()

    print(f"Average training loss: {avg_train_loss:.4f}")
    print(f"Training accuracy: {train_accuracy:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    val_steps = 0
    val_preds = []
    val_true = []

    for batch in tqdm(val_dataloader, desc="Validation"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()
        val_steps += 1
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        val_preds.extend(preds)
        val_true.extend(inputs['labels'].detach().cpu().numpy())

    avg_val_loss = val_loss / val_steps
    val_accuracy = (np.array(val_preds) == np.array(val_true)).mean()

    print(f"Average validation loss: {avg_val_loss:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")

    print("\nClassification Report (Validation):")
    print(classification_report(val_true, val_preds, target_names=list(label_dict.keys())))
    print("\n")

# Lưu mô hình
print("Saving model...")
save_path = '/content/drive/MyDrive/BERT-SentimentAnalysis/model'
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")



Epoch 1/3
----------------------------------------


Training: 100%|██████████| 1996/1996 [20:36<00:00,  1.61it/s]


Average training loss: 0.8161
Training accuracy: 0.6761


Validation: 100%|██████████| 222/222 [00:46<00:00,  4.79it/s]


Average validation loss: 0.4985
Validation accuracy: 0.8176

Classification Report (Validation):
              precision    recall  f1-score   support

    Positive       0.79      0.86      0.82      1951
    Negative       0.85      0.88      0.86      2132
     Neutral       0.79      0.78      0.79      1737
  Irrelevant       0.85      0.70      0.77      1276

    accuracy                           0.82      7096
   macro avg       0.82      0.81      0.81      7096
weighted avg       0.82      0.82      0.82      7096



Epoch 2/3
----------------------------------------


Training: 100%|██████████| 1996/1996 [20:41<00:00,  1.61it/s]


Average training loss: 0.3186
Training accuracy: 0.8859


Validation: 100%|██████████| 222/222 [00:46<00:00,  4.80it/s]


Average validation loss: 0.2786
Validation accuracy: 0.9040

Classification Report (Validation):
              precision    recall  f1-score   support

    Positive       0.86      0.94      0.90      1951
    Negative       0.93      0.90      0.92      2132
     Neutral       0.90      0.91      0.91      1737
  Irrelevant       0.95      0.84      0.89      1276

    accuracy                           0.90      7096
   macro avg       0.91      0.90      0.90      7096
weighted avg       0.91      0.90      0.90      7096



Epoch 3/3
----------------------------------------


Training: 100%|██████████| 1996/1996 [20:40<00:00,  1.61it/s]


Average training loss: 0.1387
Training accuracy: 0.9495


Validation: 100%|██████████| 222/222 [00:46<00:00,  4.80it/s]


Average validation loss: 0.2429
Validation accuracy: 0.9208

Classification Report (Validation):
              precision    recall  f1-score   support

    Positive       0.89      0.94      0.91      1951
    Negative       0.93      0.94      0.93      2132
     Neutral       0.93      0.92      0.92      1737
  Irrelevant       0.95      0.87      0.91      1276

    accuracy                           0.92      7096
   macro avg       0.92      0.92      0.92      7096
weighted avg       0.92      0.92      0.92      7096



Saving model...
Model saved to /content/drive/MyDrive/BERT-SentimentAnalysis/model


# Thử nghiệm mô hình

In [None]:
model_path = '/content/drive/MyDrive/BERT-SentimentAnalysis/model'

# Tải tokenizer và mô hình vừa huấn luyện
loaded_tokenizer = BertTokenizer.from_pretrained(model_path)
loaded_model = BertForSequenceClassification.from_pretrained(model_path)

print("Model loaded successfully!")

Model loaded successfully!


In [None]:
# Hàm dự đoán
def predict_sentiment(text):
    # Tokenize input
    inputs = loaded_tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Dự đoán
    loaded_model.eval()
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Ánh xạ ngược lại từ số thành nhãn
    label_dict = {0: 'Positive', 1: 'Negative', 2: 'Neutral', 3: 'Irrelevant'}
    return label_dict[predicted_class]

In [None]:
sample_texts = [
    "I love this product! It's amazing!",
    "This is the worst experience I've ever had.",
    "The weather is nice today.",
    "I don't have any strong feelings about this.",
    "Breaking news: Major event happened in the city center."
    "I'm extremely satisfied with my purchase, it's even better than I expected.",
    "This product is a complete waste of money and time.",
    "The customer service was excellent and exceeded my expectations.",
    "The sky is blue and the grass is green.",
    "I'm planning to go grocery shopping tomorrow morning."
]

# Thử nghiệm mô hình
for text in sample_texts:
    sentiment = predict_sentiment(text)
    print(f"Text: {text}")
    print(f"Predicted sentiment: {sentiment}")
    print("-" * 50)

Text: I love this product! It's amazing!
Predicted sentiment: Positive
--------------------------------------------------
Text: This is the worst experience I've ever had.
Predicted sentiment: Negative
--------------------------------------------------
Text: The weather is nice today.
Predicted sentiment: Positive
--------------------------------------------------
Text: I don't have any strong feelings about this.
Predicted sentiment: Negative
--------------------------------------------------
Text: Breaking news: Major event happened in the city center.I'm extremely satisfied with my purchase, it's even better than I expected.
Predicted sentiment: Positive
--------------------------------------------------
Text: This product is a complete waste of money and time.
Predicted sentiment: Negative
--------------------------------------------------
Text: The customer service was excellent and exceeded my expectations.
Predicted sentiment: Positive
-------------------------------------------