In [None]:
!pip install -q simpletransformers

In [None]:
import logging

# Keep the "transformers" logger at WARNING and above, while the root
# logger is configured to report INFO-level messages.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [None]:
from datasets import load_dataset

# Fetch the dataset from the Hugging Face Hub; `ds` is indexed by split
# name ('train', 'test') in the cells below.
ds = load_dataset(path="pythainlp/wisesight_sentiment")

In [None]:
# Display the loaded dataset object to inspect what it contains.
ds

In [None]:
import pandas as pd

# Reference table: label id -> short description of the class.
label_ids = list(range(4))
label_names = ["pos", "neu", "neg", "question"]
data = {"label": label_ids, "description": label_names}

display(pd.DataFrame(data))

In [None]:
import pandas as pd


def _frame_from_split(split):
    """Return a DataFrame with 'text' and 'labels' columns built from the
    'texts' and 'category' fields of one dataset split."""
    return pd.DataFrame({'text': split['texts'], 'labels': split['category']})


# Training data comes from the 'train' split; the evaluation frame is built
# from the 'test' split.
train_df = _frame_from_split(ds['train'])
eval_df = _frame_from_split(ds['test'])


In [None]:
# Preview the first 10 training rows (raw text and integer label).
train_df.head(10)

In [None]:
# Drop every row whose label is 3 (the "question" class in the label table
# above), leaving labels 0, 1 and 2.
train_df = train_df.loc[train_df['labels'].ne(3)]
eval_df = eval_df.loc[eval_df['labels'].ne(3)]

# Word tokenization

In [None]:
!pip install -qq pythainlp

In [None]:
!pip install -q deepcut

In [None]:
import re
from pythainlp import word_tokenize

def th_words_tokenize(sentence):
    """Clean a Thai sentence and return its tokens joined by single spaces.

    Processing steps:
      1. Strip every character that is not in the Thai range ``ก-๙``, an
         ASCII letter or digit, whitespace, or one of ``? . ; : ! "``.
      2. Replace a doubled ``เ`` with the single character ``แ``.
      3. Tokenize with PyThaiNLP's ``longest`` engine.
      4. Replace each ``ๆ`` token with a copy of the nearest preceding
         non-blank token that was already emitted. A ``ๆ`` with no such
         token before it is dropped.

    Args:
        sentence: The raw input string.

    Returns:
        The processed tokens joined with ``" "``.
    """
    pattern = r"[^ก-๙a-zA-Z0-9\s\?\.\;\:\!\"ๆฯ]+"
    sentence = re.sub(pattern, '', sentence)
    sentence = sentence.replace('เเ', 'แ')

    tokens = word_tokenize(sentence, engine="longest")

    processed_tokens = []
    for token in tokens:
        if token != 'ๆ':
            processed_tokens.append(token)
            continue
        # Look back through the tokens already emitted rather than the raw
        # token list, so consecutive 'ๆ' markers repeat the actual word
        # instead of re-emitting a literal 'ๆ'.
        previous_word = next(
            (emitted for emitted in reversed(processed_tokens) if emitted.strip()),
            None,
        )
        if previous_word is not None:
            processed_tokens.append(previous_word)

    return " ".join(processed_tokens)


In [None]:
# tqdm must be imported before tqdm.pandas() can register `progress_apply`
# on pandas objects.
from tqdm.auto import tqdm

tqdm.pandas()
# train_df / eval_df are filtered views of the original frames; `.assign`
# returns new frames instead of writing into those views in place.
train_df = train_df.assign(text=train_df['text'].progress_apply(th_words_tokenize))
eval_df = eval_df.assign(text=eval_df['text'].progress_apply(th_words_tokenize))

In [None]:
import numpy as np

# Number of distinct label values left in the training frame; this is
# passed to the classifier as its output size.
distinct_labels = np.unique(train_df['labels'])
num_labels = len(distinct_labels)
print(num_labels)

In [None]:
import plotly.graph_objects as go

def plot_histogram_labels(df):
    """Show a histogram of the values in ``df['labels']``.

    Args:
        df: Frame with a ``labels`` column.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    layout_options = dict(
        title="Distribution of Labels",
        xaxis_title="Labels",
        yaxis_title="Count",
        bargap=0.2,
    )
    label_histogram = go.Histogram(x=df['labels'])
    fig = go.Figure(data=[label_histogram])
    fig.update_layout(**layout_options)
    fig.show()

plot_histogram_labels(train_df)

# Fine tune

In [None]:
from transformers import AutoModel
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
# Load the pretrained checkpoint and print the concrete class AutoModel
# resolved it to.
# NOTE(review): presumably a sanity check for the "camembert" model type
# passed to ClassificationModel in the next cell — confirm. `model` is
# reassigned there.
model = AutoModel.from_pretrained(model_name)
print(type(model))

In [None]:
from transformers import AutoTokenizer
from simpletransformers.classification import ClassificationModel
import torch

model_name = "airesearch/wangchanberta-base-att-spm-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Training configuration handed to simpletransformers.
model_args = {
    "sliding_window": True,
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 10,
    "train_batch_size": 16,
    "eval_batch_size": 32,
    "fp16": False,
    "learning_rate": 2e-05,
    "optimizer": "AdamW",
    "adam_betas": (0.9, 0.999),
    "adam_epsilon": 1e-08,
    # simpletransformers selects the schedule through "scheduler"; a
    # "lr_scheduler_type" key is not one of its options and would leave the
    # default schedule in place.
    "scheduler": "cosine_schedule_with_warmup",
    # The seeding option is "manual_seed"; a plain "seed" key has no effect.
    "manual_seed": 42,
    "gradient_accumulation_steps": 2
}

# Wrap the checkpoint as a "camembert"-type sequence classifier with one
# output per remaining label; run on GPU only when one is available.
model = ClassificationModel(
    "camembert",
    model_name,
    num_labels=num_labels,
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

In [None]:
# ClassificationModel.train_model takes the evaluation frame as `eval_df`;
# any other keyword is interpreted as an extra metric function.
model.train_model(train_df, eval_df=eval_df)

In [None]:
# Rebuild a frame from the 'test' split: 'texts' becomes the 'text' column
# and 'category' becomes the 'labels' column.
test_split = ds['test']
test_df = pd.DataFrame({'text': test_split['texts'], 'labels': test_split['category']})

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Drop label 3 using a mask built from test_df itself; a mask taken from
# train_df has a different index and does not describe these rows.
test_df = test_df[test_df['labels'] != 3]

In [None]:
# tqdm must be imported before tqdm.pandas() can register `progress_apply`
# on pandas objects.
from tqdm.auto import tqdm

tqdm.pandas()
# `.assign` returns a new frame rather than writing into a filtered view.
test_df = test_df.assign(text=test_df['text'].progress_apply(th_words_tokenize))

In [None]:
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

texts = eval_df['text'].tolist()
true_labels = eval_df['labels'].tolist()

predictions, raw_outputs = model.predict(texts)

# Derive the class ids from the data instead of hard-coding [0, 1, 2, 3]:
# label 3 was filtered out earlier, so the tick labels must match the
# classes that are actually present in the matrix.
class_ids = sorted({int(label) for label in true_labels} | {int(pred) for pred in predictions})

cm = confusion_matrix(true_labels, predictions, labels=class_ids)

print("Classification Report:\n", classification_report(true_labels, predictions))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_ids, yticklabels=class_ids)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Inference

In [None]:
# Sample Thai text that the prediction cell below classifies.
input_text = """
    พนักงานบริการดีมาก สัญญาณก็ดี แต่ร้านอยู่ที่ไหน อยากได้ข้อมูลเพิ่มเติม จะได้ประกาศบนเว็บถูก
"""

In [None]:
# Label id -> description for the three classes the model was trained on.
id2label = {
    0: "pos",
    1: "neu",
    2: "neg",
}

# The training and evaluation text went through th_words_tokenize, so the
# input is cleaned and tokenized the same way before prediction.
prepared_text = th_words_tokenize(input_text)

predictions, raw_outputs = model.predict([prepared_text])
predicted_id = int(predictions[0])
predicted_label = id2label[predicted_id]

print("Predicted Label (ID):", predicted_id)
print("Predicted Label (Description):", predicted_label)

# Push to huggingface

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub before the upload below; no token
# is hardcoded in the notebook.
login()

In [None]:
from huggingface_hub import HfApi, create_repo, upload_folder

# Target repository on the Hub and the local folder whose contents are
# uploaded to the repository root.
repo_name = "Pongsathorn/wangchanberta-base-sentiment"
output_dir = "/kaggle/working/outputs"

# exist_ok=True lets this cell be re-run after the repository has already
# been created.
create_repo(repo_name, exist_ok=True)
upload_folder(repo_id=repo_name, folder_path=output_dir, path_in_repo="")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model back from the
# Hub repository.
hub_model_id = "Pongsathorn/wangchanberta-base-sentiment"
tokenizer2 = AutoTokenizer.from_pretrained(hub_model_id)
model2 = AutoModelForSequenceClassification.from_pretrained(hub_model_id)


In [None]:
import torch
import torch.nn.functional as F

# Label id -> description for the three classes the model was trained on.
id2label = {
    0: "pos",
    1: "neu",
    2: "neg",
}

input_text = "พนักงานบริการดีมาก สัญญาณก็ดี แต่ร้านอยู่ที่ไหน อยากได้ข้อมูลเพิ่มเติม จะได้ประกาศบนเว็บถูก"

# The training text went through th_words_tokenize, so the input is cleaned
# and tokenized the same way before it reaches the model tokenizer.
prepared_text = th_words_tokenize(input_text)

inputs = tokenizer2(prepared_text, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model2(**inputs)
    logits = outputs.logits

# Class probabilities over the last dimension; the prediction is the most
# probable class.
probs = F.softmax(logits, dim=-1)

predicted_class = torch.argmax(probs, dim=-1).item()

predicted_label = id2label[predicted_class]

print("Predicted Label (ID):", predicted_class)
print("Predicted Label (Description):", predicted_label)