In [57]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from transformers import AutoModel, BertTokenizerFast
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import time  # Add this line to import the time module

In [58]:
device = torch.device("cpu")

In [59]:
df = pd.read_csv("dataset.csv")

In [60]:
# check class distribution
df['label'].value_counts(normalize=True)

label
0    0.534188
1    0.465812
Name: proportion, dtype: float64

In [61]:
train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['label'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3, 
                                                                    stratify=df['label'])

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [62]:
# Import BERT-base pretrained model
bert = AutoModel.from_pretrained('jcblaise/bert-tagalog-base-cased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('jcblaise/bert-tagalog-base-cased')

In [63]:
max_seq_len = 25

In [64]:
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length=max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length=max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)



In [65]:
# Create tensors for train and validation sets
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

In [66]:
batch_size = 32

In [67]:
# Create data loaders
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [68]:
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        _, cls_hs = self.bert(sent_id, attention_mask=mask)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Create the model and move it to the device
model = BERT_Arch(bert).to(device)


In [69]:
# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [70]:
# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [71]:
criterion = nn.NLLLoss(weight=weights)

In [72]:
epochs = 10

In [75]:
def train(input_ids, attention_mask, labels):
    print("\nTraining...")
    t0 = time.time()
    total_loss = 0

    model.train()
    for step, batch in enumerate(train_dataloader):
        sent_id, mask, labels = batch
        sent_id, mask, labels = sent_id.to(device), mask.to(device), labels.to(device)  # Move labels to the same device

        optimizer.zero_grad()
        outputs = model(sent_id, mask)
        loss = cross_entropy(outputs, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Print progress every 50 batches
        if step % 50 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(f'  Batch {step} of {len(train_dataloader)}, Elapsed: {elapsed}')

    # Calculate the average training loss
    avg_train_loss = total_loss / len(train_dataloader)
    return avg_train_loss



train_losses = []
valid_losses = []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss = train(train_seq, train_mask, train_y)
    valid_loss, _ = evaluate()

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')



 Epoch 1 / 10

Training...


TypeError: linear(): argument 'input' (position 1) must be Tensor, not str

In [None]:
 model.eval()
    total_preds = []

In [None]:
with torch.no_grad():
        total_loss = 0

        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = outputs.cpu().argmax(dim=1).numpy()
            total_preds.extend(preds)

    avg_val_loss = total_loss / len(val_dataloader)
    print(f'Validation Loss: {avg_val_loss:.4f}')