In [2]:
pip install datasets transformers


Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset

# Fetch every split of the NewsQA dataset.
ds = load_dataset("lucadiliello/newsqa")

# Report how many examples each split holds.
split_sizes = {}
for split in ds:
    split_sizes[split] = len(ds[split])
print(split_sizes)

# Show the column layout of each split.
for split, subset in ds.items():
    print("Columns:", subset.column_names)

# Peek at the first training example; long strings are cut to 200 characters
# so the context does not flood the output.
sample = ds["train"][0]
print("Example data: ")
for k, v in sample.items():
    preview = v[:200] if isinstance(v, str) else v
    print(f"{k}: {preview}")

README.md:   0%|          | 0.00/681 [00:00<?, ?B/s]

data/train-00000-of-00001-ec54fbe500fc3b(…):   0%|          | 0.00/29.7M [00:00<?, ?B/s]

data/validation-00000-of-00001-3cf888b12(…):   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74160 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4212 [00:00<?, ? examples/s]

{'train': 74160, 'validation': 4212}
Columns: ['context', 'question', 'answers', 'key', 'labels']
Columns: ['context', 'question', 'answers', 'key', 'labels']
Example data: 
context: NEW DELHI, India (CNN) -- A high court in northern India on Friday acquitted a wealthy businessman facing the death sentence for the killing of a teen in a case dubbed "the house of horrors."



Monin
question: What was the amount of children murdered?
answers: ['19']
key: da0e6b66e04d439fa1ba23c32de07e50
labels: [{'end': [295], 'start': [294]}]


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiLSTMQA(nn.Module):
    """Bidirectional-LSTM span extractor for extractive question answering.

    Token ids are embedded, encoded by a stacked bidirectional LSTM, and two
    independent linear heads score every sequence position as a candidate
    answer start and answer end.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction; the heads therefore see
            ``2 * hidden_dim`` features per position.
        layers: Number of stacked LSTM layers.
        drop: Dropout probability, used between stacked LSTM layers and on
            the LSTM output before the heads.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, layers=2, drop=0.2):
        super().__init__()
        # Index 0 is the padding id: its embedding is not updated by training.
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # nn.LSTM applies dropout only *between* stacked layers, and warns if a
        # non-zero value is passed with a single layer. Passing 0.0 in that case
        # is behaviorally identical and keeps construction warning-free.
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=layers,
            bidirectional=True,
            batch_first=True,
            dropout=drop if layers > 1 else 0.0,
        )
        self.drop = nn.Dropout(drop)
        # Forward and backward hidden states are concatenated, hence hidden_dim * 2.
        self.start_linear = nn.Linear(hidden_dim*2, 1)
        self.end_linear = nn.Linear(hidden_dim*2, 1)

    @staticmethod
    def _mask_logits(logits, mask):
        """Push logits at positions where ``mask == 0`` to a very negative value."""
        # -1e9 does not fit in float16; clamp the fill value to what the logits'
        # dtype can represent. For float32/bfloat16 this is still exactly -1e9.
        fill = max(-1e9, torch.finfo(logits.dtype).min)
        return logits.masked_fill(mask == 0, fill)

    def forward(self, tokens, mask):
        """Score every position as answer start / answer end.

        Args:
            tokens: Integer token ids, shape ``(batch, seq_len)``.
            mask: Same shape as ``tokens``; entries equal to 0 mark positions
                that must never be selected (e.g. padding).

        Returns:
            Tuple ``(start, end)`` of logits, each of shape
            ``(batch, seq_len)``, with masked positions set to a large
            negative value so they vanish under softmax / cross-entropy.
        """
        x = self.embed(tokens)
        x, _ = self.lstm(x)
        x = self.drop(x)
        # Each head emits one score per position; drop the trailing size-1 dim.
        start = self.start_linear(x).squeeze(-1)
        end = self.end_linear(x).squeeze(-1)
        return self._mask_logits(start, mask), self._mask_logits(end, mask)


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the "lucadiliello/newsqa" dataset; the training code below reads data['train'].
data = load_dataset("lucadiliello/newsqa")
# Tokenizer shared by the preprocessing below: prep_example calls it as a
# global, and the model's vocabulary size is taken from it.
# NOTE(review): prep_example requests offset mappings, which presumably needs a
# "fast" tokenizer — confirm AutoTokenizer returns one for this checkpoint.
tok = AutoTokenizer.from_pretrained("bert-base-uncased")

def prep_example(item, max_len=128, tokenizer=None):
    """Tokenize one QA example and locate the answer span in token space.

    The question and context are encoded as a pair, padded/truncated to
    ``max_len`` tokens, and the character-level answer span from
    ``item['labels']`` is mapped onto token indices.

    Args:
        item: Mapping with ``'question'`` and ``'context'`` strings and
            ``'labels'``, a list whose first entry holds ``'start'`` and
            ``'end'`` lists of character indices into the context. The end
            index is inclusive (e.g. the answer ``'19'`` is labelled
            start=294, end=295).
        max_len: Fixed sequence length after padding/truncation.
        tokenizer: Callable tokenizer to use; defaults to the notebook-level
            ``tok``. Its result must support ``pop`` and ``sequence_ids()``.

    Returns:
        The tokenizer's encoding with ``input_ids`` renamed to ``'ids'``,
        ``attention_mask`` renamed to ``'mask'``, the offset mapping removed,
        and two added integer fields ``'stpos'`` / ``'enpos'`` holding the
        token indices of the answer's first and last token. Both are 0 when
        the answer is not fully inside the (possibly truncated) context.
    """
    if tokenizer is None:
        tokenizer = tok  # notebook-level default tokenizer

    enc = tokenizer(
        item['question'],
        item['context'],
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_offsets_mapping=True
    )
    offs = enc.pop("offset_mapping")
    # Offsets restart at 0 for each sequence of the pair, so question tokens can
    # coincidentally cover the answer's character indices. sequence_ids() lets us
    # consider context tokens only (id 1); special/padding tokens are None.
    seq_ids = enc.sequence_ids()

    schar = item['labels'][0]['start'][0]
    echar = item['labels'][0]['end'][0]
    enc["ids"] = enc.pop("input_ids")
    enc["mask"] = enc.pop("attention_mask")

    stpos = None
    enpos = None
    for i, (s, e) in enumerate(offs):
        if seq_ids[i] != 1:
            continue
        if s <= schar < e:
            stpos = i
        # echar is the index of the answer's last character (inclusive), so the
        # end token is the one whose [s, e) range contains it.
        if s <= echar < e:
            enpos = i

    # If either boundary was truncated away, label the example as "no answer in
    # window" rather than emitting an inconsistent (start, 0) pair.
    if stpos is None or enpos is None:
        stpos = enpos = 0

    enc["stpos"] = stpos
    enc["enpos"] = enpos
    return enc

class QAData(Dataset):
    """Map-style dataset over already-tokenized QA examples.

    Each stored example is a mapping providing ``'ids'``, ``'mask'``,
    ``'stpos'`` and ``'enpos'``; indexing converts those four fields to
    tensors and ignores anything else the example carries.
    """

    # Fields exposed to the DataLoader, in output order.
    _FIELDS = ('ids', 'mask', 'stpos', 'enpos')

    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        record = self.examples[idx]
        return {field: torch.tensor(record[field]) for field in self._FIELDS}

# Train on a small slice of the training split to keep the run short.
# min() guards against a split that is shorter than the requested slice.
train_subset = data['train'].select(range(min(2000, len(data['train']))))
train_tok = [prep_example(x) for x in train_subset]

TrainD = QAData(train_tok)
TL = DataLoader(TrainD, batch_size=8, shuffle=True)

# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# crashing on a hard-coded .cuda() call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vsize = tok.vocab_size
model = BiLSTMQA(vsize).to(device)
CEL = nn.CrossEntropyLoss()
ADAM = optim.Adam(model.parameters(), lr=1e-3)

# Make sure dropout is active even if the model was switched to eval mode by
# an earlier run of another cell.
model.train()

for ep in range(20):
    total_loss = 0
    for batch in TL:
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)
        stpos = batch['stpos'].to(device)
        enpos = batch['enpos'].to(device)

        ADAM.zero_grad()
        st_logits, en_logits = model(ids, mask)
        # Start and end positions are each a classification over sequence
        # positions; average the two cross-entropy losses.
        loss_st = CEL(st_logits, stpos)
        loss_en = CEL(en_logits, enpos)
        loss = (loss_st + loss_en) / 2
        loss.backward()
        ADAM.step()
        total_loss += loss.item()

    # Mean per-batch loss for the epoch.
    print(f"Epoch {ep+1}, Loss: {total_loss/len(TL):.4f}")


Epoch 1, Loss: 3.3584
Epoch 2, Loss: 2.7257
Epoch 3, Loss: 2.2923
Epoch 4, Loss: 1.9587
Epoch 5, Loss: 1.6958
Epoch 6, Loss: 1.4459
Epoch 7, Loss: 1.2496
Epoch 8, Loss: 1.1098
Epoch 9, Loss: 1.0153
Epoch 10, Loss: 0.9453
Epoch 11, Loss: 0.8931
Epoch 12, Loss: 0.8587
Epoch 13, Loss: 0.8192
Epoch 14, Loss: 0.8031
Epoch 15, Loss: 0.7798
Epoch 16, Loss: 0.7702
Epoch 17, Loss: 0.7430
Epoch 18, Loss: 0.7118
Epoch 19, Loss: 0.6956
Epoch 20, Loss: 0.6875
