# In [2]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Checkpoint identifier of the pre-trained Tagalog BERT model; the encoder and
# its tokenizer are both loaded from this same identifier so they stay matched.
model_name = "jcblaise/bert-tagalog-base-cased"
bert_model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classification head stacked on top of a BERT encoder
class SpamClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        bert_model: Encoder called with ``input_ids`` and ``attention_mask``
            keyword arguments; its ``config.hidden_size`` sizes the linear
            layer and its output must expose ``pooler_output``.
        num_classes: Number of logits produced per example.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        hidden_size = bert_model.config.hidden_size
        # Registration order (bert, dropout, classifier) is kept stable so
        # parameter ordering and state-dict keys do not change.
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of encoded inputs."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout is applied to the pooled representation before the linear layer.
        return self.classifier(self.dropout(encoded.pooler_output))

# End-to-end fine-tuning script: load the CSV, tokenize, split into
# train/validation sets, train the classifier, and print validation accuracy.

# Load your spam/ham dataset (e.g., from a CSV file)
data = pd.read_csv("dataset.csv")  # Make sure your CSV file has "text" and "label" columns

# Tokenize and encode your dataset.
# The length cap is passed explicitly: with truncation=True alone the cap comes
# from the tokenizer's own config, and if that config declares none, nothing is
# truncated and over-long texts would exceed the encoder's position embeddings.
max_length = getattr(bert_model.config, "max_position_embeddings", 512)
encoded_data = tokenizer(
    list(data["text"]),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="pt",
)
input_ids = encoded_data.input_ids
attention_mask = encoded_data.attention_mask

# Convert labels to PyTorch tensors (CrossEntropyLoss expects integer class
# indices, so the dtype is pinned to long).
labels = torch.tensor(data["label"].values, dtype=torch.long)

# Split the dataset into train and validation sets.
# All three tensors go through ONE call so they are guaranteed to be split
# with the same indices, instead of relying on two calls sharing a seed.
(
    train_inputs,
    val_inputs,
    train_attention_mask,
    val_attention_mask,
    train_labels,
    val_labels,
) = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2, random_state=42
)

# Create DataLoader for training and validation data
train_dataset = TensorDataset(train_inputs, train_attention_mask, train_labels)
val_dataset = TensorDataset(val_inputs, val_attention_mask, val_labels)

# Training batches are reshuffled every epoch; the seeded generator keeps the
# shuffle order reproducible. Validation order does not matter.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
val_dataloader = DataLoader(val_dataset, batch_size=8)

# Define your SpamClassifier model and place it on the GPU when one is available
num_classes = 2  # 2 classes: spam and not spam
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
spam_classifier = SpamClassifier(bert_model, num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(spam_classifier.parameters(), lr=2e-5)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    spam_classifier.train()
    for batch in train_dataloader:
        # Batch-local names: do not overwrite the full-dataset tensors above.
        batch_ids, batch_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = spam_classifier(batch_ids, batch_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

# Evaluation
spam_classifier.eval()
predicted_labels = []
true_labels = []

# No gradients are needed anywhere in the evaluation pass.
with torch.no_grad():
    for batch in val_dataloader:
        batch_ids, batch_mask, batch_labels = (t.to(device) for t in batch)
        outputs = spam_classifier(batch_ids, batch_mask)
        predicted = outputs.argmax(dim=1)  # index of the highest logit per row
        predicted_labels.extend(predicted.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Calculate accuracy
# NOTE(review): if this reports a perfect score, check the CSV for duplicate
# texts that land on both sides of the split before trusting the number.
accuracy = np.mean(np.array(predicted_labels) == np.array(true_labels))
print("Validation Accuracy:", accuracy)


# Output: Validation Accuracy: 1.0
