In [86]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from torch.optim import AdamW
from tqdm import tqdm
import os

In [88]:
# Fix symlink warning for Windows users
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Load ALBERT model for Question Answering
model_name = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
# Define dataset (Ensure correct formatting)
data = [
    {"context": "My name is Ramzan.", "question": "What is my name?", "answer": "Ramzan", "start": 11, "end": 17},
    {"context": "I live in Lahore.", "question": "Where do you live?", "answer": "Lahore", "start": 10, "end": 16},
    {"context": "My occupation is Engineer.", "question": "What is your occupation?", "answer": "Engineer", "start": 17, "end": 25},
    {"context": "I work at ML1.", "question": "Where do you work?", "answer": "ML1", "start": 10, "end": 13},
    {"context": "I study at UCP.", "question": "Where do you study?", "answer": "UCP", "start": 11, "end": 14},
]

In [92]:
# Define Dataset Class
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = self.tokenizer(
            item['question'],
            item['context'],
            truncation="only_second",
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
            return_offsets_mapping=True  # Needed for correct token mapping
        )

        # Extract token positions correctly
        offset_mapping = inputs["offset_mapping"].squeeze()
        start_pos = next((i for i, (start, end) in enumerate(offset_mapping) if start <= item['start'] < end), 0)
        end_pos = next((i for i, (start, end) in enumerate(offset_mapping) if start < item['end'] <= end), 1)

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'start_positions': torch.tensor(start_pos),
            'end_positions': torch.tensor(end_pos)
        }

In [94]:
# Prepare Dataset & DataLoader
dataset = QADataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [96]:
# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
epochs = 3
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for batch in tqdm(dataloader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_pos = batch['start_positions'].to(device)
        end_pos = batch['end_positions'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_pos,
            end_positions=end_pos
        )

        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradients
        optimizer.step()
        
        running_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs} - Loss: {running_loss / len(dataloader)}")


Training Epoch 1/3: 100%|██████████| 3/3 [00:17<00:00,  5.97s/it]


Epoch 1/3 - Loss: 5.953663508097331


Training Epoch 2/3: 100%|██████████| 3/3 [00:16<00:00,  5.44s/it]


Epoch 2/3 - Loss: 2.822128931681315


Training Epoch 3/3: 100%|██████████| 3/3 [00:16<00:00,  5.53s/it]

Epoch 3/3 - Loss: 0.8069224754969279





In [98]:
# Define Inference Function
def ask_question(question, context):
    inputs = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=384,
        padding="max_length",
        return_tensors="pt"
    ).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)

    # Get answer span
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Convert tokens to answer text
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
    )
    
    return answer.strip()

In [104]:
# Test the model with a question
context = "I live in Lahore."
question = "Where do you live?"
print(f"Question: {question}")
print(f"Answer: {ask_question(question, context)}")

Question: Where do you live?
Answer: you live
