# Bengali model


In [2]:
from dataset import FFN2Dataset
from tokenizer import FFN2Tokenizer
from torch.optim import Adam
from datasets import load_dataset
import torch
from models import MeanClassifier, CLSClassifier, LogRegCLSClassifier
from train import train, evaluate
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR

bert_map = {
    'bengali': 'google/muril-base-cased', 
    'english': 'bert-base-uncased', 
    'indonesian': 'cahya/bert-base-indonesian-522M', 
    'arabic': 'asafaya/bert-base-arabic'
}
language = "bengali"
languages = ["bengali", "indonesian", "arabic"]
bert = bert_map[language]
device = 'cuda'
input_dim = 768
hidden_dim = 50
lr = 3e-2
batch_size = 32
epochs = 3

dataset = load_dataset("copenlu/answerable_tydiqa")
language_dataset = dataset.filter(lambda row: row['language'] == language)

train_set = language_dataset["train"]
validation_set = language_dataset["validation"]

tokenizer = FFN2Tokenizer(bert)
train_set = FFN2Dataset(train_set, tokenizer)
validation_set = FFN2Dataset(validation_set, tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


## Mean FFN

In [4]:
mean_model = MeanClassifier(bert)
optimizer = Adam(mean_model.parameters(), lr=lr)
scheduler = CyclicLR(optimizer, base_lr=0., max_lr=lr, step_size_up=1, step_size_down=len(train_set)*epochs, cycle_momentum=False)
best_model = train(mean_model, optimizer, scheduler, train_set, validation_set, epochs=epochs, batch_size=batch_size, lr=lr, device=device)
torch.save(best_model, 'mean_bert_classifier.pt')

Epoch 1/3: 100%|██████████| 150/150 [03:13<00:00,  1.29s/it]


Epoch 1/3, Loss: 0.6743885838985443


100%|██████████| 7/7 [00:11<00:00,  1.66s/it]


Loss: 0.0, Accuracy: 67.41%


Epoch 2/3:  19%|█▉        | 29/150 [00:48<03:23,  1.69s/it]


KeyboardInterrupt: 

## CLS FFN

In [None]:
cls_model = CLSClassifier(bert)
optimizer = Adam(cls_model.parameters(), lr=lr)
scheduler = CyclicLR(optimizer, base_lr=0., max_lr=lr, step_size_up=1, step_size_down=len(train_set)*epochs, cycle_momentum=False)
best_model = train(cls_model, optimizer, scheduler, train_set, validation_set, epochs=epochs, batch_size=batch_size, lr=lr, device=device)
torch.save(best_model, 'cls_bert_classifier.pt')

Epoch 1/5: 100%|██████████| 150/150 [03:23<00:00,  1.36s/it]


Epoch 1/5, Loss: 0.6944307072957356


100%|██████████| 7/7 [00:09<00:00,  1.31s/it]


Loss: 0.0, Accuracy: 50.00%


Epoch 2/5: 100%|██████████| 150/150 [03:24<00:00,  1.36s/it]


Epoch 2/5, Loss: 0.6949388960997264


100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


Loss: 0.0, Accuracy: 50.00%


Epoch 3/5: 100%|██████████| 150/150 [03:25<00:00,  1.37s/it]


Epoch 3/5, Loss: 0.6928549643357594


100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


Loss: 0.0, Accuracy: 50.00%


Epoch 4/5: 100%|██████████| 150/150 [03:25<00:00,  1.37s/it]


Epoch 4/5, Loss: 0.6941517889499664


100%|██████████| 7/7 [00:09<00:00,  1.31s/it]


Loss: 0.0, Accuracy: 50.00%


Epoch 5/5: 100%|██████████| 150/150 [03:24<00:00,  1.37s/it]


Epoch 5/5, Loss: 0.6939051934083302


100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


Loss: 0.0, Accuracy: 50.00%


## CLS Log Reg

In [None]:
cls_model = LogRegCLSClassifier(bert)
optimizer = Adam(cls_model.parameters(), lr=lr)
scheduler = CyclicLR(optimizer, base_lr=0., max_lr=lr, step_size_up=1, step_size_down=len(train_set)*epochs, cycle_momentum=False)
best_model = train(cls_model, optimizer, scheduler, train_set, validation_set, epochs=epochs, batch_size=batch_size, lr=lr, device=device)
torch.save(best_model, 'cls_bert_logreg_classifier.pt')

Epoch 1/5: 100%|██████████| 150/150 [03:24<00:00,  1.36s/it]


Epoch 1/5, Loss: 0.6881467231114705


100%|██████████| 7/7 [00:09<00:00,  1.31s/it]


Loss: 0.0, Accuracy: 65.18%


Epoch 2/5: 100%|██████████| 150/150 [03:24<00:00,  1.37s/it]


Epoch 2/5, Loss: 0.6739896603425344


100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


Loss: 0.0, Accuracy: 64.73%


Epoch 3/5: 100%|██████████| 150/150 [03:25<00:00,  1.37s/it]


Epoch 3/5, Loss: 0.6637167219320933


100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


Loss: 0.0, Accuracy: 70.09%


Epoch 4/5: 100%|██████████| 150/150 [03:24<00:00,  1.37s/it]


Epoch 4/5, Loss: 0.6536384157339732


100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


Loss: 0.0, Accuracy: 65.18%


Epoch 5/5: 100%|██████████| 150/150 [03:24<00:00,  1.37s/it]


Epoch 5/5, Loss: 0.6462677776813507


100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


Loss: 0.0, Accuracy: 57.59%


# Zero-shot Bengali classification model 


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, AutoModelForQuestionAnswering
from tqdm import tqdm
import pandas as pd
from datasets import load_dataset

# 1. Create a custom dataset
class AnswerabilityDataset(Dataset):
    def __init__(self, tokenizer, data, max_length=300):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        inputs = self.tokenizer(row['question_text'], row['document_plaintext'], truncation='only_second', padding='max_length', stride=50, max_length=self.max_length, return_tensors='pt')
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        inputs['labels'] = torch.tensor(self.data[idx]["annotations"]["answer_start"][0] != -1).long()
        return inputs
    
device = 'cpu'
languages = ["bengali", "indonesian", "arabic"]
dataset = load_dataset("copenlu/answerable_tydiqa")
for language in languages:
    print("Training", language)
    language_dataset = dataset.filter(lambda row: row['language'] == language)

    train_set = language_dataset["train"]
    validation_set = language_dataset["validation"]

    # 3. Initialize tokenizer and datasets
    tokenizer = AutoModelForQuestionAnswering.from_pretrained('xlm-roberta-base')
    train_dataset = AnswerabilityDataset(tokenizer, train_set)
    val_dataset = AnswerabilityDataset(tokenizer, validation_set)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # 4. Load model and send to device
    model = AutoModelForQuestionAnswering.from_pretrained('xlm-roberta-base').to(device)

    # 5. Training loop
    optimizer = AdamW(model.parameters(), lr=2e-5)

    num_epochs = 3
    for epoch in range(num_epochs):
        # Training
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader):
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {total_loss/len(train_loader)}")

        # Validation
        model.eval()
        total_eval_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader):
                inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
                labels = batch['labels'].to(device)
                outputs = model(**inputs, labels=labels)
                total_eval_loss += outputs.loss.item()
        
        print(f"Epoch {epoch+1}/{num_epochs} | Val Loss: {total_eval_loss/len(val_loader)}")

    # 6. Save model
    torch.save(model, f'{language}_xlm-roberta-base_cls_logreg_classifier.pt')


    
# dataset = load_dataset("copenlu/answerable_tydiqa")
# for language in languages:
#     print("Training", language)
#     language_dataset = dataset.filter(lambda row: row['language'] == language)

#     train_set = language_dataset["train"]
#     validation_set = language_dataset["validation"]
        
#     bert = 'xlm-roberta-base'
#     tokenizer = AutoTokenizer.from_pretrained(bert, max_len=300)
#     train_set = AnswerabilityDataset(tokenizer, train_set)
#     validation_set = AnswerabilityDataset(validation_set, tokenizer)
    
#     model = XLMRobertaForSequenceClassification.from_pretrained(bert).to(device)
#     optimizer = Adam(model.parameters(), lr=lr)
#     scheduler = CyclicLR(optimizer, base_lr=0., max_lr=lr, step_size_up=1, step_size_down=len(train_set)*epochs, cycle_momentum=False)
#     train(model, optimizer, scheduler, train_set, validation_set, epochs=epochs, batch_size=batch_size, lr=lr, device=device)
#     torch.save(model, f'{language}_xlm-roberta-base_cls_logreg_classifier.pt')
    

Training bengali


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/1195 [00:00<?, ?it/s]


TypeError: forward() got an unexpected keyword argument 'truncation'

In [3]:
dataset = load_dataset("copenlu/answerable_tydiqa")
accuracies = {
    "indonesian": {"indonesian": 0, "bengali": 0, "arabic": 0},
    "bengali": {"indonesian": 0, "bengali": 0, "arabic": 0},
    "arabic": {"indonesian": 0, "bengali": 0, "arabic": 0},
}
import torch_directml
device = 'cpu' # torch_directml.device()
for language in languages:
    model = torch.load(f"{language}_cls_bert_logreg_classifier.pt", map_location=torch.device(device)).to(device)
    tokenizer = FFN2Tokenizer(bert_map[language])
    dataset_language = dataset.filter(lambda row: row["language"] == language)[
        "validation"
    ]
    validation_set = FFN2Dataset(dataset_language, tokenizer)

    print(f"Evaluating {language} bert for", language)
    accuracy = evaluate(model, validation_set, batch_size=batch_size, device=device)
    accuracies[language][language] = accuracy

Evaluating bengali bert for bengali


100%|██████████| 7/7 [03:03<00:00, 26.22s/it]


Loss: 0.0, Accuracy: 63.39%


Filter: 100%|██████████| 116067/116067 [00:01<00:00, 76433.28 examples/s]
Filter: 100%|██████████| 13325/13325 [00:00<00:00, 83174.25 examples/s]


Evaluating indonesian bert for indonesian


100%|██████████| 38/38 [16:07<00:00, 25.45s/it]


Loss: 0.0, Accuracy: 67.93%
Evaluating arabic bert for arabic


100%|██████████| 60/60 [25:59<00:00, 25.99s/it]

Loss: 0.0, Accuracy: 83.23%





In [1]:
# Pretty print accuracies
print("Accuracies")
print(" ".join(languages))
for language, d1 in accuracies.items():
    print(language, " ".join([str(accuracies[language][language])]))

Accuracies


NameError: name 'languages' is not defined