In [1]:
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.preprocessing import label_binarize
from itertools import cycle
from underthesea import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

def get_device():
    """Pick the best available torch device.

    Preference order is CUDA, then Apple MPS, then CPU.

    Returns:
        torch.device: a ``cuda``, ``mps`` or ``cpu`` device.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    # Look the MPS backend up defensively: on PyTorch builds that do not ship
    # torch.backends.mps a direct attribute access would raise AttributeError
    # instead of falling through to the CPU branch.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")

# Device selected once for the notebook; the classifier is moved onto it after loading.
device = get_device()
print(f"ƒêang ch·∫°y tr√™n thi·∫øt b·ªã: {device}")

# Hugging Face checkpoint name used to load both the tokenizer and the classifier.
MODEL_NAME = "vinai/phobert-large" 
# Token length every example is padded/truncated to during tokenization.
MAX_LENGTH = 128

ƒêang ch·∫°y tr√™n thi·∫øt b·ªã: cpu


In [2]:
# Load the raw comments, derive a 3-class sentiment label from the star rating,
# word-segment the text and build stratified train/test datasets.
file_path = 'data/comments.csv'
if not os.path.exists(file_path):
    # Nothing below is defined in this case; later cells need train_dataset/test_dataset.
    print(f"Kh√¥ng t√¨m th·∫•y file {file_path}. Vui l√≤ng ki·ªÉm tra l·∫°i!")
else:
    # 'utf-8-sig' decodes plain UTF-8 and also strips a leading BOM, so a single
    # read handles both kinds of file. (A BOM file read as plain 'utf-8' does not
    # raise; it silently produces a '\ufeff'-prefixed first column name, which a
    # try/except around read_csv can never catch.)
    df = (
        pd.read_csv(file_path, encoding='utf-8-sig')[['rating', 'content']]
        .dropna()
        # Non-numeric ratings become NaN and are dropped on the next step.
        .assign(rating=lambda frame: pd.to_numeric(frame['rating'], errors='coerce'))
        .dropna(subset=['rating'])
    )

    def map_label(rating):
        """Map a numeric rating to a sentiment class id.

        Returns 2 for ratings 4 or 5, 1 for rating 3, and 0 for every other value.
        NOTE(review): any value other than exactly 3, 4 or 5 (e.g. 3.5 or 0) lands
        in class 0 - confirm the data only contains whole ratings 1-5.
        """
        if rating in [4, 5]:
            return 2
        if rating == 3:
            return 1
        return 0

    df['label'] = df['rating'].apply(map_label).astype(int)

    print("ƒêang t√°ch t·ª´ (Word Segmentation)...")
    df['content_seg'] = df['content'].apply(lambda x: word_tokenize(str(x), format="text"))

    # Stratify on the label so both splits keep the same class proportions.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

    # preserve_index=False keeps the shuffled pandas index from being stored as an
    # extra '__index_level_0__' column in the datasets.
    train_dataset = Dataset.from_pandas(train_df[['content_seg', 'label']], preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df[['content_seg', 'label']], preserve_index=False)

    print(f"ƒê√£ x·ª≠ l√Ω xong. Train: {len(train_df)} - Test: {len(test_df)}")

ƒêang t√°ch t·ª´ (Word Segmentation)...


KeyboardInterrupt: 

In [5]:
print(f"ƒêang t·∫£i Tokenizer: {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Encode a batch of examples into fixed-length tokenizer outputs.

    Reads the "content_seg" column of the batch and runs the tokenizer on it
    with truncation enabled and padding to MAX_LENGTH tokens.
    """
    segmented_texts = examples["content_seg"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(segmented_texts, **encoding_options)


print("ƒêang m√£ h√≥a d·ªØ li·ªáu...")
# Apply the encoder batch-wise to each split, train first and then test.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)
print("M√£ h√≥a xong!")

ƒêang t·∫£i Tokenizer: vinai/phobert-large...
ƒêang m√£ h√≥a d·ªØ li·ªáu...


Map:   0%|          | 0/82610 [00:00<?, ? examples/s]

Map:   0%|          | 0/20653 [00:00<?, ? examples/s]

M√£ h√≥a xong!


In [6]:
print(f"ƒêang t·∫£i Model: {MODEL_NAME}...")
# Three output classes: 0, 1 and 2.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)


def compute_metrics(eval_pred):
    """Compute accuracy and macro-F1 for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair as unpacked below.

    Returns:
        dict with keys "accuracy" and "f1_macro".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average='macro'),
    }


training_args = TrainingArguments(
    output_dir="./results_phobert",
    num_train_epochs=5,
    # 4 samples per step x 4 accumulation steps = 16 samples per optimizer update per device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none",
    # Force CPU only when no accelerator was detected. The previous
    # `no_cuda=True` whenever CUDA was missing also disabled MPS, so training ran
    # on CPU even though get_device() had selected "mps"; `no_cuda` is also the
    # deprecated spelling of this option.
    use_cpu=(device.type == "cpu"),
)

# NOTE(review): the test split is used both for per-epoch checkpoint selection
# here and for the final reported metrics later in the notebook, so those
# metrics are optimistic. Consider carving a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)
print("ƒê√£ c·∫•u h√¨nh xong Trainer!")

ƒêang t·∫£i Model: vinai/phobert-large...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ƒê√£ c·∫•u h√¨nh xong Trainer!


In [None]:
# Run the fine-tuning loop configured on `trainer`; the prints bracket the call.
print("B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN...")
trainer.train()
print("Hu·∫•n luy·ªán ho√†n t·∫•t!")

B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN...


Epoch,Training Loss,Validation Loss


In [None]:
print("ƒêang t√≠nh to√°n v√† v·∫Ω bi·ªÉu ƒë·ªì...")

# --- Training history --------------------------------------------------------
# log_history is a list of dicts: entries with 'loss' are training logs, entries
# with 'eval_f1_macro' are evaluation logs.
history = trainer.state.log_history
train_logs = [x for x in history if 'loss' in x]
eval_logs = [x for x in history if 'eval_f1_macro' in x]
train_loss = [x['loss'] for x in train_logs]
eval_f1 = [x['eval_f1_macro'] for x in eval_logs]
# Plot against the logged step/epoch rather than the list position so the axis
# labels ("Steps", "Epochs") are truthful; fall back to position if a key is absent.
train_steps = [x.get('step', i) for i, x in enumerate(train_logs)]
eval_epochs = [x.get('epoch', i + 1) for i, x in enumerate(eval_logs)]

# --- Predictions on the test split --------------------------------------------
preds_output = trainer.predict(tokenized_test)
logits = preds_output.predictions
y_true = preds_output.label_ids
y_pred = np.argmax(logits, axis=-1)
y_prob = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()

target_names = ['Negative üò°', 'Neutral üòê', 'Positive üòç']
# Fixed class ids, in the same order as target_names. Passing them explicitly
# keeps the confusion matrix 3x3 and the report aligned with target_names even
# if a class is missing from y_true/y_pred.
class_ids = [0, 1, 2]
n_classes = len(class_ids)
# One colour per class, reused by the ROC and precision-recall panels.
colors = ['#ff7f0e', '#2ca02c', '#1f77b4']

fig, axes = plt.subplots(2, 3, figsize=(20, 12))
ax_loss, ax_f1, ax_cm, ax_roc, ax_pr, ax_conf = axes.ravel()

# Panel 1: training loss.
ax_loss.plot(train_steps, train_loss, label='Train Loss', color='tab:blue')
ax_loss.set_title('Training Loss')
ax_loss.set_xlabel('Steps')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

# Panel 2: validation macro-F1.
ax_f1.plot(eval_epochs, eval_f1, label='Val F1-Macro', color='tab:green', marker='o')
ax_f1.set_title('Validation F1 Score')
ax_f1.set_xlabel('Epochs')
ax_f1.set_ylabel('F1 Score')
ax_f1.legend()

# Panel 3: confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names, ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('Th·ª±c t·∫ø')
ax_cm.set_xlabel('D·ª± ƒëo√°n')

# Panel 4: one-vs-rest ROC curve per class.
y_test_bin = label_binarize(y_true, classes=class_ids)
for i, color in zip(range(n_classes), colors):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc = auc(fpr, tpr)
    ax_roc.plot(fpr, tpr, color=color, lw=2, label=f'{target_names[i]} (AUC={roc_auc:.2f})')
ax_roc.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
ax_roc.set_title('ROC Curve')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.legend(loc="lower right")

# Panel 5: one-vs-rest precision-recall curve per class.
for i, color in zip(range(n_classes), colors):
    precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_prob[:, i])
    ax_pr.plot(recall, precision, color=color, lw=2, label=f'{target_names[i]}')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.legend()

# Panel 6: top-class probability, split by whether the prediction was correct.
max_probs = np.max(y_prob, axis=1)
correct_indices = np.where(y_pred == y_true)[0]
incorrect_indices = np.where(y_pred != y_true)[0]
ax_conf.hist(max_probs[correct_indices], bins=20, color='green', alpha=0.5, label='ƒê√∫ng')
ax_conf.hist(max_probs[incorrect_indices], bins=20, color='red', alpha=0.5, label='Sai')
ax_conf.set_title('Confidence Distribution')
ax_conf.set_xlabel('Max softmax probability')
ax_conf.set_ylabel('Count')
ax_conf.legend()

fig.tight_layout()
plt.show()

print("\n--- CLASSIFICATION REPORT ---")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=target_names))

In [None]:
# Write the model held by `trainer` and the tokenizer into the same directory.
save_path = "./my-phobert-sentiment-final"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"ƒê√£ l∆∞u model th√†nh c√¥ng t·∫°i: {save_path}")
print("B·∫°n c√≥ th·ªÉ ch·∫°y file 'app.py' ƒë·ªÉ s·ª≠ d·ª•ng model n√†y!")