In [2]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import time

In [3]:
# Load the pre-trained Tagalog BERT model and tokenizer
# Both objects are resolved from the same checkpoint identifier, so the
# tokenizer and the encoder weights come from one source.
model_name = "jcblaise/bert-tagalog-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

In [4]:
# Define a custom classifier on top of BERT
class SpamClassifier(nn.Module):
    """Sequence classifier: an encoder followed by dropout and a linear head.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids`` and ``attention_mask`` keywords,
            return an object with a ``pooler_output`` attribute.
        num_classes: Number of output classes (width of the returned logits).
        dropout_prob: Dropout probability applied to the pooled output before
            the linear head. Defaults to 0.1.
    """

    def __init__(self, bert_model, num_classes, dropout_prob=0.1):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_prob)
        # The head consumes the encoder's pooled vector, so its input width
        # is taken from the encoder configuration rather than hard-coded.
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head applied to the (dropped-out)
            ``pooler_output`` of the encoder.
        """
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(encoder_outputs.pooler_output)
        return self.classifier(pooled_output)

In [5]:
# Load the dataset
# Later cells read the "text" and "label" columns of this frame.
# NOTE(review): the preview recorded below lists an "Unnamed: 0" column, which
# looks like a saved index -- consider read_csv(..., index_col=0); confirm
# against the CSV before changing.
data = pd.read_csv("dataset.csv")  # Make sure your CSV file has "text" and "label" columns

In [6]:
# Preview the first rows to check which columns were loaded
data.head()

Unnamed: 0,label,text
0,1,"#1 Sa aming oferta, makakatanggap ka ng 100% l..."
1,1,"Mga kasamahan, ang iyong 777 ay maaari nang i-..."
2,1,Congratulations! Ikaw ay isa sa mga napiling 9...
3,1,Gusto mo bang kumita ng 100% pa? Huwag palampa...
4,1,Eksklusibong alok para sa iyo: Libreng iPhone ...


In [11]:
# Encode every message of the "text" column in a single tokenizer call.
# Padding and truncation are enabled and PyTorch tensors are requested, so the
# ids and the mask can be fed straight into tensor datasets.
encoded_data = tokenizer(
    data["text"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
input_ids = encoded_data["input_ids"]
attention_mask = encoded_data["attention_mask"]

In [12]:
# Build the label tensor from the "label" column
labels = torch.tensor(data["label"].to_numpy())

In [13]:
# Split the dataset into train and validation sets.
# Token ids, attention masks and labels are row-aligned, so all three go
# through ONE train_test_split call: a single permutation is applied to every
# array, which guarantees they stay aligned without relying on two separate
# calls happening to share the same random_state.
(
    train_inputs,
    val_inputs,
    train_attention_mask,
    val_attention_mask,
    train_labels,
    val_labels,
) = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2, random_state=42
)

In [14]:
# Create DataLoader for training and validation data
BATCH_SIZE = 8  # shared by both loaders so they cannot drift apart

train_dataset = TensorDataset(train_inputs, train_attention_mask, train_labels)
val_dataset = TensorDataset(val_inputs, val_attention_mask, val_labels)

# Reshuffle the training set at every epoch so the model does not see the
# batches in the same fixed order each time; the seeded generator keeps the
# shuffling reproducible. Validation order is irrelevant and stays fixed.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [15]:
# Instantiate the classifier: the pre-trained encoder plus a two-way head
num_classes = 2  # 2 classes: spam and not spam
spam_classifier = SpamClassifier(bert_model=bert_model, num_classes=num_classes)

In [16]:
# Loss and optimizer: cross-entropy on the logits, and Adam over every
# parameter exposed by the classifier, with a learning rate of 2e-5.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=spam_classifier.parameters(), lr=2e-5)

In [17]:
# Training loop
# Number of full passes the training cell makes over the training DataLoader.
num_epochs = 6

In [18]:
# Running total of wall-clock training seconds; each epoch adds its duration.
total_training_time = 0

In [19]:
# Reset the timer inside this cell so that re-running the cell reports only
# the time of this run instead of adding to a previous run's total.
total_training_time = 0

for epoch in range(num_epochs):
    print(f"Starting Epoch {epoch + 1}")
    spam_classifier.train()

    # Monotonic clock: suited to measuring durations
    epoch_start_time = time.perf_counter()

    total_loss = 0  # Sum of per-batch losses for this epoch
    for batch_idx, batch in enumerate(train_dataloader):
        # Batch-local names on purpose: the notebook-level `input_ids`,
        # `attention_mask` and `labels` hold the full dataset and are needed
        # again if the split cell is re-run, so they must not be overwritten.
        batch_input_ids, batch_attention_mask, batch_labels = batch
        optimizer.zero_grad()
        outputs = spam_classifier(batch_input_ids, batch_attention_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Print training progress every 100 batches
        if (batch_idx + 1) % 100 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_idx + 1}/{len(train_dataloader)}], Loss: {loss.item()}")

    # Calculate and print the average loss for the epoch
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch [{epoch + 1}/{num_epochs}] - Average Loss: {average_loss}")

    # Calculate and print the training time for the epoch
    epoch_training_time = time.perf_counter() - epoch_start_time
    total_training_time += epoch_training_time
    print(f"Epoch [{epoch + 1}/{num_epochs}] - Training Time: {epoch_training_time:.2f} seconds")

Starting Epoch 1
Epoch [1/6] - Average Loss: 0.32014664020785627
Epoch [1/6] - Training Time: 445.15 seconds
Starting Epoch 2
Epoch [2/6] - Average Loss: 0.009993328975434316
Epoch [2/6] - Training Time: 354.86 seconds
Starting Epoch 3
Epoch [3/6] - Average Loss: 0.002329936580474865
Epoch [3/6] - Training Time: 400.56 seconds
Starting Epoch 4
Epoch [4/6] - Average Loss: 0.001452005052186073
Epoch [4/6] - Training Time: 391.65 seconds
Starting Epoch 5
Epoch [5/6] - Average Loss: 0.0009139720115196356
Epoch [5/6] - Training Time: 397.28 seconds
Starting Epoch 6
Epoch [6/6] - Average Loss: 0.0006773410900000562
Epoch [6/6] - Training Time: 389.31 seconds


In [20]:
# Print the total training time accumulated over all epochs, in seconds
print(f"Total Training Time: {total_training_time:.2f} seconds")

Total Training Time: 2378.81 seconds


In [21]:
# Evaluation
# Collect the predicted and the true class index of every validation example.
spam_classifier.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    # Batch-local names on purpose: the notebook-level `input_ids`,
    # `attention_mask` and `labels` hold the full dataset and must not be
    # replaced by the last validation batch.
    for batch_input_ids, batch_attention_mask, batch_labels in val_dataloader:
        logits = spam_classifier(batch_input_ids, batch_attention_mask)
        # The predicted class is the index of the largest logit in each row
        predicted = logits.argmax(dim=1)
        predicted_labels.extend(predicted.tolist())
        true_labels.extend(batch_labels.tolist())

In [22]:
# Validation metrics, each computed from the same (true, predicted) pair:
# accuracy, recall, precision and F1, in that order.
accuracy, recall, precision, f1 = (
    metric(true_labels, predicted_labels)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)

In [23]:
# Report each validation metric on its own line
for metric_label, metric_value in (
    ("Validation Accuracy:", accuracy),
    ("Validation Recall:", recall),
    ("Validation Precision:", precision),
    ("Validation F1-score:", f1),
):
    print(metric_label, metric_value)

Validation Accuracy: 1.0
Validation Recall: 1.0
Validation Precision: 1.0
Validation F1-score: 1.0


In [24]:
# Confusion matrix of true vs. predicted validation labels, printed under a heading
confusion = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:", confusion, sep="\n")

Confusion Matrix:
[[51  0]
 [ 0 43]]


In [25]:
# Per-class precision/recall/F1 summary for the validation set, printed under a heading
report = classification_report(true_labels, predicted_labels)
print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       1.00      1.00      1.00        43

    accuracy                           1.00        94
   macro avg       1.00      1.00      1.00        94
weighted avg       1.00      1.00      1.00        94



In [26]:
# Define a function for making predictions
def predict_spam(input_text, model, tokenizer):
    """Classify a single message as ``"Spam"`` or ``"Ham"``.

    Args:
        input_text: The message to classify.
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits for the message.
        tokenizer: Callable that accepts the text together with the
            ``truncation``, ``padding`` and ``return_tensors`` keywords and
            returns an object exposing ``input_ids`` and ``attention_mask``
            tensors.

    Returns:
        ``"Ham"`` when the highest-scoring class index is 0, ``"Spam"``
        otherwise.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    # Tokenize and encode the input text
    encoded_text = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")
    input_ids = encoded_text.input_ids
    attention_mask = encoded_text.attention_mask

    # Run the inputs on whatever device the model's weights live on, so the
    # function keeps working after the model has been moved to an accelerator.
    # A model without parameters gives no device hint; inputs stay where they are.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    # Make predictions using the model
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
    predicted_class = logits.argmax(dim=1).item()

    # Decode the predicted label (0 for not spam, 1 for spam)
    return "Ham" if predicted_class == 0 else "Spam"

In [28]:
# Save the model weights
# state_dict() is taken from the whole SpamClassifier, so the file holds the
# encoder stored in `self.bert` as well as the linear head.
torch.save(spam_classifier.state_dict(), "spam_classifier_weights.pth")

# Save the tokenizer
# This is the cell's last expression, so its return value is shown as the cell output.
tokenizer.save_pretrained("tokenizer_directory")

('tokenizer_directory\\tokenizer_config.json',
 'tokenizer_directory\\special_tokens_map.json',
 'tokenizer_directory\\vocab.txt',
 'tokenizer_directory\\added_tokens.json',
 'tokenizer_directory\\tokenizer.json')

In [42]:
# Probe: English message that mentions a site URL
input_text = "where did you get that? is it from dog site wwww.cahah.com"
predicted_label = predict_spam(input_text=input_text, model=spam_classifier, tokenizer=tokenizer)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Spam


In [56]:
# Probe: Tagalog message that mentions the same site URL
input_text = "san mo nakuha yan? galing ba sa dog site wwww.cahah.com"
predicted_label = predict_spam(input_text=input_text, model=spam_classifier, tokenizer=tokenizer)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Ham


In [58]:
# Probe: English message asking the reader to open the URL
input_text = "where did you get that? open this wwww.cahah.com"
predicted_label = predict_spam(input_text=input_text, model=spam_classifier, tokenizer=tokenizer)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Ham


In [1]:
# Probe: longer Tagalog message that contains the URL.
# NOTE: this cell carries execution count In [1] and its recorded output is a
# NameError -- it was run on a fresh kernel. Run the cells that define
# `predict_spam`, `spam_classifier` and `tokenizer` before this one.
input_text = "san mo nakuha yan? buksan mo to wwww.cahah.com, salamat, punta ko dyan mamaya"
predicted_label = predict_spam(input_text=input_text, model=spam_classifier, tokenizer=tokenizer)
print(f"Predicted Label: {predicted_label}")

NameError: name 'predict_spam' is not defined