In [11]:
import utils
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pretrained SpanBERT (large, cased) tokenizer and encoder.
# The "weights ... newly initialized" warning printed by this cell names only
# the pooler layers; the cells below read `last_hidden_state`, not the pooled
# output.
tokenizer = AutoTokenizer.from_pretrained('SpanBERT/spanbert-large-cased')
model = AutoModel.from_pretrained('SpanBERT/spanbert-large-cased')

# Load the QED training annotations and pick one record to experiment with.
# The integer used as the key is presumably the example id -- confirm against
# utils.load_data.
annotation_dict = utils.load_data("./Data/qed-train.jsonlines")
# NOTE(review): `M` is rebound to a tensor further down
# (`M = text_embeddings[0, 0, :]`), so every cell that reads `M.<attribute>`
# only works if it runs before that cell.
M = annotation_dict[-3193270267191507653]
text = M.passage
question = M.question
# List of (question Entity, context Entity) pairs; each Entity carries
# character offsets (start_offset / end_offset) into its source string.
coreference = M.aligned_nps

# Tokenize passage and question separately; return_tensors="pt" yields
# PyTorch tensors with a leading batch dimension of size 1.
encoded_text = tokenizer(text, return_tensors="pt")
encoded_question = tokenizer(question, return_tensors="pt")



Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
M.selected_sent

{'start': 196,
 'end': 370,
 'string': 'She married Tucker Jones in 1995 , but their marriage encountered difficulties in 2007 , due to her commitments as both a parent to son Tuck and to her career as a surgeon . '}

In [3]:
answer_ls = M.answer

In [8]:
M.sentence_starts

[0, 100, 196, 370]

In [7]:
answer_ls[0]

Entity(start_offset=208, end_offset=220, type='context', text='Tucker Jones', normalized_text='tucker jones')

In [74]:
encoded_text['input_ids'].size()

torch.Size([1, 94])

In [75]:
encoded_question['input_ids'].size()

torch.Size([1, 12])

In [76]:
# Run both forward passes with gradient tracking disabled, then keep only the
# per-token hidden states from each output.
with torch.no_grad():
    text_outputs = model(**encoded_text)
    question_outputs = model(**encoded_question)
text_embeddings = text_outputs.last_hidden_state
question_embeddings = question_outputs.last_hidden_state

In [80]:
[coreference[0]]

[(Entity(start_offset=7, end_offset=11, type='question', text='tuck', normalized_text='tuck'),
  Entity(start_offset=328, end_offset=336, type='context', text='son Tuck', normalized_text='son tuck'))]

In [83]:
import SpanBERT
# NOTE(review): this rebinds `model`. The first cell binds it to the Hugging
# Face AutoModel, and the CustomQA example cell rebinds it again, so any cell
# that calls `model(...)` or `model.tokenize(...)` depends on which of these
# cells ran last.
model = SpanBERT.Model(hidden_size=1024, device='cpu')
def get_encoded_span(sents, coreference, type=0):
    """Return the text of one mention from the first coreference pair.

    Despite the name, nothing is tokenized or encoded here: the result is a
    plain string cut out of ``sents`` by character offsets.

    Args:
        sents: The source text (or any sequence of characters) that the
            mention offsets refer to.
        coreference: Sequence of mention pairs. Only the first pair is read.
        type: Which member of the pair to use: 0 for the 'question' mention,
            1 for the 'context' mention.

    Returns:
        The characters ``sents[start_offset:end_offset]`` joined into a string.
    """
    mention = coreference[0][type]
    begin, stop = mention.start_offset, mention.end_offset
    return ''.join(list(sents)[begin:stop])
# Cut the question-side mention of the first coreference pair out of the
# question text, then tokenize the isolated span and the full question.
# NOTE(review): `encoded_question` was previously produced by `tokenizer(...)`
# in the first cell; it is overwritten here with the output of
# `model.tokenize(...)`.
question_span = get_encoded_span(question, [coreference[0]], type=0)
encoded_question_span = model.tokenize(question_span)
encoded_question = model.tokenize(question)

Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [89]:
encoded_question

{'input_ids': tensor([[  101,  1150,  1110,   189, 21515,  1401,  1107,  5583,   112,   188,
         19768,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [90]:
encoded_question_span

{'input_ids': tensor([[ 101,  189, 8474,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [86]:
# NOTE(review): for the 'tuck' example displayed in this notebook this call
# returns (-1, -1). Tokenized on its own the span becomes ids [189, 8474],
# whereas inside the full question the same characters are ids [189, 21515],
# so the span's ids never occur contiguously in the question's ids.
# Presumably find_start_end_pos does an exact id-subsequence search -- confirm
# in SpanBERT.py. Mapping the character offsets to token indices (tokenizer
# offset mappings) would not depend on the span re-tokenizing identically.
question_start, question_end = SpanBERT.find_start_end_pos(encoded_question, encoded_question_span)


In [88]:
(question_start, question_end)

(-1, -1)

In [85]:
encoded_question

{'input_ids': tensor([[  101,  1150,  1110,   189, 21515,  1401,  1107,  5583,   112,   188,
         19768,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [82]:
question_span

'tuck'

In [34]:
text_embeddings.size()

torch.Size([1, 177, 1024])

In [35]:
# Select the vector at index [0, 0, :] (first batch entry, first token
# position) of the encoder output.
# NOTE(review): this rebinds `M`, which the first cell uses for the annotation
# record; cells that read `M.passage`, `M.answer`, etc. fail after this runs.
M = text_embeddings[0, 0, :]

In [36]:
M.size()

torch.Size([1024])

In [37]:
M.size(0)

1024

In [5]:
text_embeddings.size()

torch.Size([1, 177, 1024])

In [41]:
coreference

[(Entity(start_offset=22, end_offset=30, type='question', text='fortnite', normalized_text='fortnite'),
  Entity(start_offset=268, end_offset=276, type='context', text='Fortnite', normalized_text='fortnite'))]

In [36]:
U = torch.tensor([3916,  117, 3729,  117, 3020,  118,  118, 2889,  114,  119])
tokenizer.decode(U)

'1931, 1934, 1940 - - 1942 ).'

In [33]:
text_embeddings[:, 1:5].size()

torch.Size([1, 4, 768])

In [13]:
question

'who got the first nobel prize in physics'

In [4]:
encoded_question['input_ids']

tensor([[ 101, 1150, 1400, 1103, 1148, 1185, 8511, 4716, 1107, 7094,  102]])

In [10]:
# Character offsets of the question-side mention in the first aligned pair.
question_mention = coreference[0][0]
question_start, question_end = question_mention.start_offset, question_mention.end_offset

In [11]:
# Cut the mention's characters out of the question and rebuild them as a string.
question_span = ''.join(list(question)[question_start:question_end])

In [14]:
# Tokenize the isolated span text on its own; the tokenizer wraps the ids in
# its special start/end tokens (101 ... 102 in the printed output).
encoded_question_span = tokenizer(question_span, return_tensors="pt")
print(encoded_question_span['input_ids'])

tensor([[ 101, 1103, 1148, 1185, 8511, 4716, 1107, 7094,  102]])


In [22]:
# X: all token ids of the full question (batch dimension dropped).
# Y: token ids of the span with the first and last id removed, i.e. without
#    the special tokens the tokenizer adds, so it can be searched for inside X.
X = encoded_question['input_ids'][0, :]
Y = encoded_question_span['input_ids'][0, 1:-1]
print(X)
print(Y)

tensor([ 101, 1150, 1400, 1103, 1148, 1185, 8511, 4716, 1107, 7094,  102])
tensor([1103, 1148, 1185, 8511, 4716, 1107, 7094])


In [23]:
def find_sequence_positions_tensor(list1, list2):
    """Locate the first contiguous occurrence of ``list2`` inside ``list1``.

    Both arguments are tensors compared along dimension 0 (in this notebook:
    1-D tensors of token ids).

    Args:
        list1: Tensor to search in.
        list2: Tensor to search for.

    Returns:
        A ``(start, end)`` tuple of indices into ``list1`` with ``end``
        inclusive, or ``(-1, -1)`` when ``list2`` does not occur in ``list1``
        or when ``list2`` is empty.
    """
    needle_len = list2.size(0)
    if needle_len == 0:
        # An empty needle compares equal to the empty slice at index 0, which
        # would otherwise produce the inverted span (0, -1). Report "not
        # found" instead so callers only have one sentinel to check.
        return -1, -1
    for i in range(list1.size(0) - needle_len + 1):
        if torch.equal(list1[i:i + needle_len], list2):
            return i, i + needle_len - 1
    return -1, -1

# Find where the span's token ids (Y) sit inside the full question's token
# ids (X). The returned end index is inclusive; (-1, -1) means no match.
start, end = find_sequence_positions_tensor(X, Y)
print(f"Start: {start}, End: {end}")

Start: 3, End: 9


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomQA(nn.Module):
    """Span-prediction head in which each context position attends over the question.

    Every context position receives a softmax-weighted sum of the question
    hidden states, added to its own hidden state; a final linear layer maps
    the result to a start logit and an end logit per context position.
    """

    def __init__(self, hidden_size):
        """Create the two attention projections and the 2-way output layer.

        Args:
            hidden_size: Size of the last dimension of both inputs to
                ``forward``.
        """
        super().__init__()
        self.question_att = nn.Linear(hidden_size, hidden_size)
        self.context_att = nn.Linear(hidden_size, hidden_size)
        self.qa_outputs = nn.Linear(hidden_size, 2)

    def forward(self, question_hidden, context_hidden):
        """Compute start and end logits for every context position.

        Args:
            question_hidden: Tensor whose last two dimensions are
                (question_len, hidden_size).
            context_hidden: Tensor whose last two dimensions are
                (context_len, hidden_size).

        Returns:
            A ``(start_logits, end_logits)`` tuple; each tensor has the shape
            of ``context_hidden`` without its last dimension.
        """
        # Project both sequences into the space where they are compared.
        projected_question = self.question_att(question_hidden)
        projected_context = self.context_att(context_hidden)

        # One row per context position, normalized over question positions.
        scores = projected_context @ projected_question.transpose(-2, -1)
        weights = scores.softmax(dim=-1)

        # Weighted sum of the *unprojected* question states, added to the
        # context states.
        question_summary = weights @ question_hidden
        fused = context_hidden + question_summary

        # The last dimension holds [start, end]; split it and drop the
        # singleton axis from each half.
        start_logits, end_logits = (
            column.squeeze(-1)
            for column in self.qa_outputs(fused).split(1, dim=-1)
        )
        return start_logits, end_logits

# Example usage
hidden_size = 768  # Example hidden size
model = CustomQA(hidden_size)

# Assuming question_hidden and context_hidden are your hidden states with different sequence lengths
question_hidden = torch.rand(10, hidden_size)  # Example tensor for question
context_hidden = torch.rand(20, hidden_size)   # Example tensor for context

start_logits, end_logits = model(question_hidden, context_hidden)
# Post-processing to find the best answer span
# ...


In [15]:
print(start_logits)

tensor([0.6141, 1.0137, 0.3670, 0.5621, 0.2945, 0.8588, 1.0265, 0.9420, 0.7214,
        0.4058, 0.6825, 0.7306, 0.5026, 0.6067, 0.4712, 1.0268, 0.7138, 0.7195,
        0.6596, 0.1748], grad_fn=<SqueezeBackward1>)


In [16]:
print(end_logits)

tensor([-0.3923, -0.3350, -0.3085, -0.4993, -0.3972, -0.6594, -0.6419, -0.8392,
        -0.1374, -0.4528, -0.1867, -0.3284, -0.5838, -0.1934, -0.3745, -0.3120,
        -0.5929, -0.1950, -0.1439, -0.4104], grad_fn=<SqueezeBackward1>)


In [17]:
torch.argmax(start_logits)

tensor(15)

In [18]:
torch.argmax(end_logits)

tensor(8)

In [17]:
m = [0, 1, 2, 3]

In [18]:
m = torch.tensor(m)

In [19]:
m.argmax().item()

3