### **Cài đặt những thư viện cần thiết**

In [None]:
!pip install langchain langchain-community langchain-huggingface chromadb pymupdf transformers accelerate torch datasets
!pip install --upgrade transformers

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json
from huggingface_hub import login
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import os


KeyboardInterrupt: 

### **Đăng nhập vào Hugging Face Hub**

In [None]:
# Authenticate against the Hugging Face Hub without embedding the secret in
# the notebook. The token is taken from the HF_TOKEN environment variable when
# it is set; otherwise the user is prompted for it (input is not echoed).
# SECURITY: an access token used to be hardcoded on this line. Treat it as
# leaked and revoke it in the Hugging Face account settings.
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(hf_token)

### **Chuyển file thành dạng JSON Lines (.jsonl)**

In [None]:

# Convert the raw training file, which holds one "sentence;label" pair per
# line, into a JSON Lines file with one {"text": ..., "label": ...} object
# per line.

# Read every raw line (newline characters included) into memory.
documents = []
with open("/content/train.txt", "r", encoding="utf-8") as f:
    for line in f:
        documents.append(line)

with open("/content/train.jsonl", "w", encoding="utf-8") as f:
    for line_no, line in enumerate(documents, start=1):
        record = line.strip()
        if not record:
            # Blank lines (for example a trailing newline at the end of the
            # file) carry no example and are skipped.
            continue
        # Split on the LAST ';' only, so a sentence that itself contains ';'
        # is still separated correctly from its emotion label.
        sentence, separator, label = record.rpartition(";")
        if not separator:
            raise ValueError(
                f"/content/train.txt line {line_no}: expected 'sentence;label', got {record!r}"
            )
        json.dump({"text": sentence, "label": label}, f)
        f.write("\n")



### **Load LLM**

In [None]:

# Checkpoint to fine-tune (alternative: "bert-base-multilingual-cased").
model_name = "distilbert-base-uncased"

# Tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The same checkpoint loaded for sequence classification with six output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
)


### **Load dataset**

In [None]:

# Load the JSON Lines training file. The absolute path is the same location
# the conversion step writes to, so this cell does not depend on the kernel's
# current working directory.
dataset = load_dataset("json", data_files="/content/train.jsonl")

# Map each emotion name to the integer class id used for training.
label_mapping = {"sadness": 0, "joy": 1, "anger": 2, "love" : 3, "surprise" : 4, "fear" : 5}

def tokenize_function(examples):
    """Tokenize a batch of examples and convert their emotion names to class ids.

    Args:
        examples: A batch with a "text" entry (the sentences) and a "label"
            entry (the emotion names), as passed by `dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch, padded/truncated to 128 tokens,
        with an added "label" entry holding the integer id of each example.

    Raises:
        KeyError: If a label is not one of the keys of `label_mapping`.
    """
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    # Translate every emotion name of the batch into its integer class id.
    tokens["label"] = [label_mapping[label] for label in examples["label"]]
    return tokens

tokenized_datasets = dataset.map(tokenize_function, batched=True)



### **TRAIN TIME!!!**

In [None]:
# Fine-tune the classifier on the tokenized training split, then save the
# model and its tokenizer to ./fine_tuned_sentiment.
from transformers import DataCollatorWithPadding

training_args = TrainingArguments(
    output_dir="./fine_tuned_sentiment",
    # No evaluation-strategy argument is passed: there is no validation set,
    # and "no" is already the default. The old `evaluation_strategy` keyword
    # was renamed to `eval_strategy` and is rejected by recent transformers
    # releases, so omitting it keeps the cell working across versions.
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    logging_dir="./logs",
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    # Disable every experiment-tracking integration (Weights & Biases
    # included). This replaces the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Trainer's `tokenizer=` keyword was deprecated in favour of
    # `processing_class=`. Passing the padding collator explicitly gives the
    # same batching as handing the tokenizer to Trainer, with either keyword.
    data_collator=DataCollatorWithPadding(tokenizer),
)

trainer.train()
trainer.save_model("./fine_tuned_sentiment")
# Trainer no longer receives the tokenizer, so store it next to the model
# weights explicitly to keep the output directory self-contained.
tokenizer.save_pretrained("./fine_tuned_sentiment")


### **Kiểm tra**

In [None]:
# Evaluate the fine-tuned model on the test file ("sentence;label" per line)
# and report how many predictions match the expected emotion.

# Map a class id back to its emotion name.
label_mapping = {0: "sadness", 1: "joy", 2: "anger", 3: "love", 4: "surprise", 5: "fear"}

# Build the inference pipeline from the saved fine-tuned model.
from transformers import pipeline

classifier = pipeline("text-classification", model="./fine_tuned_sentiment", tokenizer=tokenizer)

true_answers = 0
questions = 0
with open("/content/test.txt", "r", encoding = "utf-8") as f:
  for line_no, line in enumerate(f, start=1):
    record = line.strip()
    if not record:
      # Blank lines carry no test example.
      continue
    # Split on the LAST ';' so sentences containing ';' are parsed correctly.
    test_sentence, separator, emotion = record.rpartition(';')
    if not separator:
      raise ValueError(f"/content/test.txt line {line_no}: expected 'sentence;label', got {record!r}")
    result = classifier(test_sentence)[0]  # take the first (top) prediction
    # The pipeline label ends in the class id (text after the last "_");
    # turn it back into the emotion name.
    result["label"] = label_mapping[int(result["label"].split("_")[-1])]
    if result["label"] == emotion : true_answers += 1
    questions += 1

print(true_answers, ' ', questions)
# Also report the ratio, guarding against an empty test file.
if questions:
  print(f"accuracy: {true_answers / questions:.4f}")

In [None]:
# Baseline: run the same evaluation with the original, not fine-tuned
# "distilbert-base-uncased" checkpoint for comparison.
# NOTE(review): this checkpoint presumably has no trained classification head
# (and the default number of labels may differ from six), so these
# predictions are expected to be close to chance - confirm.

# Map a class id back to its emotion name.
label_mapping = {0: "sadness", 1: "joy", 2: "anger", 3: "love", 4: "surprise", 5: "fear"}

# Build the inference pipeline from the base checkpoint.
from transformers import pipeline

classifier = pipeline("text-classification", model="distilbert-base-uncased", tokenizer=tokenizer)

true_answers = 0
questions = 0
with open("/content/test.txt", "r", encoding = "utf-8") as f:
  for line_no, line in enumerate(f, start=1):
    record = line.strip()
    if not record:
      # Blank lines carry no test example.
      continue
    # Split on the LAST ';' so sentences containing ';' are parsed correctly.
    test_sentence, separator, emotion = record.rpartition(';')
    if not separator:
      raise ValueError(f"/content/test.txt line {line_no}: expected 'sentence;label', got {record!r}")
    result = classifier(test_sentence)[0]  # take the first (top) prediction
    # The pipeline label ends in the class id (text after the last "_");
    # turn it back into the emotion name.
    result["label"] = label_mapping[int(result["label"].split("_")[-1])]
    if result["label"] == emotion : true_answers += 1
    questions += 1

print(true_answers, ' ', questions)
# Also report the ratio, guarding against an empty test file.
if questions:
  print(f"accuracy: {true_answers / questions:.4f}")