### **Chuẩn bị dữ liệu**

In [1]:
import pandas as pd
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Đọc dữ liệu từ file CSV
data = pd.read_csv('data.csv', encoding='utf-8')
data['label'] = data['label'].str.replace('"', '', regex=False)
data['label'] = data['label'].str.strip()
data['label'] = data['label'].replace({9})

### **Tiền xử lí dữ liệu**


In [2]:
import emoji

# loại bỏ emoji trong comment
def remove_emoji(text):
    return emoji.replace_emoji(text, replace='')

data['comment'] = data['comment'].apply(remove_emoji)

### **Chia dữ liệu ra tập Train và Test**

In [3]:
from sklearn.model_selection import train_test_split

# Chia dữ liệu thành tập train (80%) và test (20%)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
from datasets import Dataset, DatasetDict

# Chuyển đổi DataFrame sang Dataset của Hugging Face
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Kết hợp chúng thành một DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


### **Tokenize dữ liệu văn bản**

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
tokenizer.save_pretrained("./sentiment-analysis-base-phobert")


def tokenize_function(examples):
    return tokenizer(examples["comment"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/15082 [00:00<?, ? examples/s]

Map:   0%|          | 0/3771 [00:00<?, ? examples/s]

In [6]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

### **Huấn luyện mô hình với PyTorch Trainer**

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=3)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

In [8]:
import numpy as np
import evaluate

# Sử dụng độ lỗi F1
metric = evaluate.load("accuracy")

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(num_train_epochs=3,
                                per_device_train_batch_size=16,
                                per_device_eval_batch_size=16,
                                learning_rate=5e-5,
                                eval_strategy="epoch",
                                output_dir="sentiment-analysis-base-pho-bert")

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.727059,0.655
2,No log,0.596945,0.777
3,No log,0.582565,0.774


TrainOutput(global_step=189, training_loss=0.5985075128141535, metrics={'train_runtime': 5896.2324, 'train_samples_per_second': 0.509, 'train_steps_per_second': 0.032, 'total_flos': 394670126592000.0, 'train_loss': 0.5985075128141535, 'epoch': 3.0})

In [13]:
trainer.model.save_pretrained("./sentiment-analysis-base-phobert")

In [14]:
from huggingface_hub import HfApi

repo_name = "lamsytan/sentiment-analysis-product-comment" 
token = "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"  

api = HfApi()

api.upload_folder(
    folder_path="./sentiment-analysis-base-phobert", 
    repo_id=repo_name, 
    repo_type="model", 
    token=token
)

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lamsytan/sentiment-analysis-product-comment/commit/709db99f20152c6012bc4c705c63a71b9dd2ac00', commit_message='Upload folder using huggingface_hub', commit_description='', oid='709db99f20152c6012bc4c705c63a71b9dd2ac00', pr_url=None, repo_url=RepoUrl('https://huggingface.co/lamsytan/sentiment-analysis-product-comment', endpoint='https://huggingface.co', repo_type='model', repo_id='lamsytan/sentiment-analysis-product-comment'), pr_revision=None, pr_num=None)

<a id='pytorch_native'></a>