In [15]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer  # Changed to AutoModel
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm import tqdm
import numpy as np

# Define the custom Dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per lookup.

    The frame must have a ``'text'`` column. When a ``'Label'`` column is
    also present, each item carries a ``'label'`` tensor; otherwise items
    hold only the tokenizer outputs (e.g. for unlabeled inference data).

    Args:
        dataframe: pandas DataFrame holding the samples.
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., max_length=..., truncation=...,
            return_tensors="pt")`` whose result can be indexed with
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: length every sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text = dataframe['text'].values
        has_labels = 'Label' in dataframe.columns
        self.labels = dataframe['Label'].values if has_labels else None

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors as a dict.

        Returns:
            dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors, plus a ``torch.long`` ``'label'`` tensor when the
            frame has labels.
        """
        # str() guards against non-string cells (numbers, etc.).
        encoded = self.tokenizer(
            str(self.text[idx]),
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer presumably returns [1, max_length] tensors; flatten
        # drops that leading batch axis so the DataLoader can stack samples.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        if self.labels is None:
            return sample
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Define the BertClassifier model
class BertClassifier(nn.Module):
    """Pretrained transformer encoder with a dropout + linear classification head.

    ``forward`` returns raw, unnormalized logits. No activation is applied
    to them: ``nn.CrossEntropyLoss`` performs log-softmax itself, and a ReLU
    on the logits would clamp every negative score to zero and block the
    gradient for those classes.

    Args:
        dropout: dropout probability applied to the sentence embedding.
        model_name: Hugging Face checkpoint loaded through ``AutoModel``.
        num_classes: number of output classes (width of the logit vector).
    """

    def __init__(self, dropout=0.5, model_name="csebuetnlp/banglabert", num_classes=7):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder config instead of hard-coding 768,
        # so other checkpoints work without editing this class.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch of tokenized sequences.

        Args:
            input_ids: token id tensor, [batch_size, seq_len].
            attention_mask: padding mask tensor, [batch_size, seq_len].

        Returns:
            Tensor of shape [batch_size, num_classes] holding raw logits.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        # The first token position serves as the sentence representation
        # (the [CLS] token for BERT/ELECTRA-style tokenizers).
        # last_hidden_state is the final layer's output, so the full tuple
        # of per-layer hidden states does not need to be requested.
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]
        return self.linear(self.dropout(cls_embedding))

# Training function
def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` and print per-epoch train/validation metrics.

    NOTE: the datasets are built with the module-level ``tokenizer``, which
    must be defined before this function is called.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns class logits of shape [batch_size, num_classes].
        train_data: DataFrame with ``'text'`` and ``'Label'`` columns.
        val_data: DataFrame with the same columns, used for validation.
        learning_rate: learning rate passed to ``Adam``.
        epochs: number of passes over ``train_data``.
        batch_size: samples per batch for both dataloaders.
    """
    # Create datasets
    train_dataset = TextDataset(train_data, tokenizer)
    val_dataset = TextDataset(val_data, tokenizer)

    # Create dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # Set device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Guard the averages so an empty split reports 0 instead of raising
    # ZeroDivisionError.
    n_train = max(len(train_dataset), 1)
    n_val = max(len(val_dataset), 1)

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0.0
        model.train()

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch_num + 1}"):
            optimizer.zero_grad()
            batch_loss, correct, n_samples = _forward_batch(model, batch, criterion, device)
            batch_loss.backward()
            optimizer.step()

            # CrossEntropyLoss returns the batch *mean*; weight it by the
            # batch size so dividing by the sample count below yields a
            # true per-sample average.
            total_loss_train += batch_loss.item() * n_samples
            total_acc_train += correct

        # Validation
        total_acc_val = 0
        total_loss_val = 0.0
        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                batch_loss, correct, n_samples = _forward_batch(model, batch, criterion, device)
                total_loss_val += batch_loss.item() * n_samples
                total_acc_val += correct

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train:.3f} '
            f'| Train Accuracy: {total_acc_train / n_train:.3f} '
            f'| Val Loss: {total_loss_val / n_val:.3f} '
            f'| Val Accuracy: {total_acc_val / n_val:.3f}')


def _forward_batch(model, batch, criterion, device):
    """Run one batch through ``model``; return (loss tensor, #correct, batch size)."""
    input_ids = batch['input_ids'].to(device)  # [batch_size, max_length]
    attention_mask = batch['attention_mask'].to(device)  # [batch_size, max_length]
    labels = batch['label'].to(device)

    outputs = model(input_ids, attention_mask)
    batch_loss = criterion(outputs, labels)
    correct = (outputs.argmax(dim=1) == labels).sum().item()
    return batch_loss, correct, labels.size(0)

# Load data
train_df = pd.read_csv('cleaned_data.csv')
test_df = pd.read_csv('test.csv')

# Remove missing values
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# Shuffle before splitting: slicing the frame in file order would give an
# unrepresentative validation set if the CSV is grouped by label or source.
# A fixed seed keeps the split reproducible across runs.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split train data into train (80%) and validation (20%) sets
train_size = int(0.8 * len(train_df))
df_train = train_df[:train_size]
df_val = train_df[train_size:]

# Initialize tokenizer (read as a module-level name by train())
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert")

# Training parameters
EPOCHS = 10  # Reduced for testing; increase to 100 for full training
LR = 1e-6

# Initialize and train the model
model = BertClassifier()
train(model, df_train, df_val, LR, EPOCHS)

  return torch.load(checkpoint_file, map_location=map_location)
Epoch 1:   8%|▊         | 125/1552 [13:41<2:36:16,  6.57s/it]


KeyboardInterrupt: 