In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [19]:
# Load the raw corpus; later cells read its 'text' and 'label' columns.
# NOTE(review): the path is relative, so the CSV has to be in the notebook's
# working directory for this cell to succeed.
df = pd.read_csv('dataset.csv')

In [None]:
# Preview the first rows as a quick sanity check on what was loaded.
df.head()

In [None]:
# Fit a LabelEncoder on the 'label' column and replace the column with the
# encoded values; the fitted encoder stays available as `label_encoder`.
# NOTE(review): df['label'] is overwritten in place, so re-running this cell
# re-fits the encoder on the already-encoded values — reload df before re-running.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

In [None]:
# Class balance: share of rows per label value (fractions rather than counts).
df['label'].value_counts(normalize=True)

In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
model_name = "jcblaise/bert-tagalog-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The fine-tuning task supplies one label per text, so the checkpoint is loaded
# with a sequence-classification head sized to the number of encoded classes.
# A masked-LM head expects per-token labels and emits vocabulary-sized logits,
# which matches neither the per-example labels fed by the dataset nor the
# argmax-over-classes done in compute_metrics.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_encoder.classes_),
)

In [None]:
# Tokenize both splits with the same settings: over-long texts are truncated
# and every text in a split is padded to a common length.
train_encodings = tokenizer(
    train_df['text'].tolist(),
    padding=True,
    truncation=True,
)
test_encodings = tokenizer(
    test_df['text'].tolist(),
    padding=True,
    truncation=True,
)

In [None]:
class SpamDataset(Dataset):
    """Torch dataset pairing per-example encodings with one label each.

    Args:
        encodings: Mapping of feature name to a per-example sequence; it only
            needs to expose ``.items()`` and positional indexing on each value.
        labels: Labels aligned positionally with the encodings. May be a list
            or any object exposing ``.tolist()`` (pandas Series, NumPy array,
            tensor).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise into a plain list so __getitem__ always indexes by
        # position. A pandas Series taken from a shuffled split keeps its
        # original integer index, so ``series[idx]`` is a label lookup that
        # raises KeyError or silently returns another row's label.
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

In [None]:
# Pass the labels as plain lists: after train_test_split the Series keep their
# original (shuffled) index, while the dataset looks labels up by position, so
# indexing the Series directly would hit the wrong row or raise KeyError.
train_dataset = SpamDataset(train_encodings, train_df['label'].tolist())
test_dataset = SpamDataset(test_encodings, test_df['label'].tolist())

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Run phases and length.
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same 500-step cadence, keeping at most
    # two checkpoints on disk.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Log every 10 steps.
    logging_steps=10,
)

In [None]:
def compute_metrics(p):
    """Score predictions against the reference labels.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over the last axis) and ``label_ids``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``, each
        computed by the matching scikit-learn scorer with its default settings.
    """
    y_true = p.label_ids
    y_pred = p.predictions.argmax(-1)
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

In [None]:
# Wire the model, the training configuration, both datasets and the metric
# callback into one Trainer.
# NOTE(review): the held-out split is also the eval_dataset used during
# training — confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune, then score the held-out split; the metrics are kept in `results`.
trainer.train()
results = trainer.evaluate(eval_dataset=test_dataset)