In [None]:
!pip install transformers datasets torch faiss-cpu tqdm -q

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import DPRQuestionEncoder, DPRContextEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoderTokenizer
from datasets import load_dataset
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ------------------------------------------------------------
# 1️⃣ Load and split TimeQA dataset
# ------------------------------------------------------------
raw_dataset = load_dataset("hugosousa/TimeQA", split="train")

# Filter the dataset to keep only the 'hard' questions
hard_dataset = raw_dataset.filter(lambda x: x['level'] == 'hard')

# Use a shuffled subset of up to 14,000 'hard' samples and create an 80/20
# train/test split. The cap is clamped to the number of rows that survived the
# filter so `select` is never asked for indices past the end of the dataset.
subset_size = min(14000, len(hard_dataset))
subset = hard_dataset.shuffle(seed=42).select(range(subset_size))

# Same seed for shuffle and split so the partition is reproducible across runs.
train_test_split = subset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]


class TimeQADataset(Dataset):
    """Thin map-style wrapper exposing question/context/timestamp per row.

    The wrapped object only needs to support ``len()`` and integer indexing,
    with each row behaving like a mapping.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return one sample as a dict with a float ``timestamp``.

        A missing or falsy ``timestamp`` value (absent key, ``None``, ``0``,
        empty string) is reported as ``0.0``.
        """
        row = self.dataset[idx]

        # Default first, then overwrite only when a usable value is present.
        stamp = 0.0
        if "timestamp" in row and row["timestamp"]:
            stamp = float(row["timestamp"])

        sample = {}
        sample["question"] = row["question"]
        sample["context"] = row["context"]
        sample["timestamp"] = stamp
        return sample

# Wrap each split so it can be indexed by a PyTorch DataLoader.
train_qa_dataset, test_qa_dataset = (
    TimeQADataset(split) for split in (train_dataset, test_dataset)
)

# ------------------------------------------------------------
# 2️⃣ Tokenizers
# ------------------------------------------------------------
# One tokenizer per encoder, each loaded from its encoder's checkpoint name.
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
c_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)

# ------------------------------------------------------------
# 3️⃣ Collate function
# ------------------------------------------------------------
def collate_fn(batch):
    """Turn a list of sample dicts into (questions, contexts, timestamps).

    Text is left untokenized (lists of strings); only the timestamps are
    stacked, into a 1-D float32 tensor with one entry per sample.
    """
    questions, contexts, stamps = [], [], []
    for sample in batch:
        questions.append(sample["question"])
        contexts.append(sample["context"])
        stamps.append(sample["timestamp"])
    return questions, contexts, torch.tensor(stamps, dtype=torch.float32)

# Only the training loader shuffles; the test loader keeps dataset order.
# Both hand raw sample dicts to collate_fn, which leaves text untokenized.
train_loader = DataLoader(
    train_qa_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    test_qa_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_fn,
)

# ------------------------------------------------------------
# 4️⃣ Global Top-k accuracy
# ------------------------------------------------------------
def compute_global_topk(q_embs, c_embs, k=5):
    """Return top-k retrieval accuracy of questions against all contexts.

    Every question row is scored against every context row by dot product.
    Question ``i`` counts as a hit when context index ``i`` is among its ``k``
    highest-scoring contexts, i.e. rows of the two inputs are assumed to be
    aligned pairs.

    Args:
        q_embs: 2-D tensor of question embeddings, one row per question.
        c_embs: 2-D tensor of context embeddings, one row per context.
        k: Number of top-scoring contexts to inspect per question. Values
            larger than the number of contexts are clamped to that number.

    Returns:
        Fraction of questions whose paired context is in the top ``k``, as a
        Python float. ``0.0`` when there are no questions or no contexts.
    """
    scores = torch.matmul(q_embs, c_embs.T)
    num_queries, num_contexts = scores.size(0), scores.size(1)
    if num_queries == 0 or num_contexts == 0:
        return 0.0

    # torch.topk raises when k exceeds the size of the searched dimension.
    k = min(k, num_contexts)
    topk = scores.topk(k, dim=1).indices

    # Gold index for row i is i; create it directly on the scores' device.
    gold = torch.arange(num_queries, device=scores.device)
    hits = (topk == gold.unsqueeze(1)).any(dim=1)
    return hits.float().mean().item()

# ------------------------------------------------------------
# 5️⃣ Training baseline DPR
# ------------------------------------------------------------
def train_baseline(q_model, c_model, loader, epochs=1):
    """Fine-tune a question/context encoder pair with in-batch negatives.

    For every batch the questions are scored against all contexts of the same
    batch by dot product. The context at the same batch position is the
    positive class for a cross-entropy loss; the other contexts in the batch
    serve as negatives.

    Relies on the module-level ``q_tokenizer``, ``c_tokenizer`` and ``device``.

    Args:
        q_model: Question encoder; called with the tokenizer output as keyword
            arguments and expected to expose ``.pooler_output``.
        c_model: Context encoder with the same calling convention.
        loader: Iterable yielding ``(question_texts, context_texts, timestamps)``
            batches; the timestamps are ignored here.
        epochs: Number of passes over ``loader``.

    Returns:
        None. Both models are updated in place and left in training mode.
    """
    q_model.train()
    c_model.train()
    optimizer = optim.Adam(list(q_model.parameters()) + list(c_model.parameters()), lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # Stays None when the loader yields no batches, so the summary line
        # below cannot hit an unbound `loss`.
        last_loss = None
        for q_texts, c_texts, _ in tqdm(loader, desc=f"Baseline Epoch {epoch+1}"):
            q_inputs = q_tokenizer(
                q_texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)
            c_inputs = c_tokenizer(
                c_texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)

            q_emb = q_model(**q_inputs).pooler_output
            c_emb = c_model(**c_inputs).pooler_output

            # Row i of `scores` holds question i against every in-batch context;
            # the diagonal entry is the positive pair.
            # NOTE(review): identical contexts inside one batch would be treated
            # as negatives of each other - confirm whether contexts repeat.
            scores = torch.matmul(q_emb, c_emb.T)
            labels = torch.arange(scores.size(0), device=scores.device)
            loss = loss_fn(scores, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            last_loss = loss.detach()

        if last_loss is None:
            print(f"Baseline Epoch {epoch+1} done. No batches were processed.")
        else:
            print(f"Baseline Epoch {epoch+1} done. Last batch loss: {last_loss.item():.4f}")

# ------------------------------------------------------------
# 6️⃣ Temporal DPR with learned time embeddings
# ------------------------------------------------------------
class TemporalDPR(nn.Module):
    """DPR encoder pair whose outputs are fused with a learned time embedding.

    A scalar time per sample is mapped to ``time_dim`` features by a linear
    layer. The same time features are concatenated onto both the question and
    the context pooled vectors, and each is projected back to its encoder's
    hidden size.
    """

    def __init__(self, q_model, c_model, time_dim=32):
        """Store the two encoders and build the time/projection layers.

        Args:
            q_model: Question encoder exposing ``config.hidden_size``.
            c_model: Context encoder exposing ``config.hidden_size``.
            time_dim: Width of the learned time embedding.
        """
        super().__init__()
        self.q_model = q_model
        self.c_model = c_model

        q_hidden = q_model.config.hidden_size
        c_hidden = c_model.config.hidden_size

        self.time_embed = nn.Linear(1, time_dim)
        self.proj_q = nn.Linear(q_hidden + time_dim, q_hidden)
        self.proj_c = nn.Linear(c_hidden + time_dim, c_hidden)

    def forward(self, q_inputs, c_inputs, times):
        """Encode a batch and return time-fused (question, context) embeddings.

        Args:
            q_inputs: Mapping unpacked as keyword arguments into the question encoder.
            c_inputs: Mapping unpacked as keyword arguments into the context encoder.
            times: 1-D tensor with one scalar time per sample.
        """
        question_vec = self.q_model(**q_inputs).pooler_output
        context_vec = self.c_model(**c_inputs).pooler_output

        # (batch,) -> (batch, 1) so the linear layer sees one feature per sample.
        time_features = self.time_embed(times.unsqueeze(1))

        fused_question = torch.cat((question_vec, time_features), dim=-1)
        fused_context = torch.cat((context_vec, time_features), dim=-1)
        return self.proj_q(fused_question), self.proj_c(fused_context)

def train_temporal(model, loader, epochs=1):
    """Train a time-aware retriever with in-batch negatives.

    ``model`` is called as ``model(q_inputs, c_inputs, times)`` and must return
    a ``(question_embeddings, context_embeddings)`` pair. Questions are scored
    against every context in the batch by dot product, and the context at the
    same batch position is the positive class for a cross-entropy loss.

    Relies on the module-level ``q_tokenizer``, ``c_tokenizer`` and ``device``.

    Args:
        model: Module to optimise; all of ``model.parameters()`` are updated.
        loader: Iterable yielding ``(question_texts, context_texts, timestamps)``
            batches, with timestamps as a tensor.
        epochs: Number of passes over ``loader``.

    Returns:
        None. ``model`` is updated in place and left in training mode.
    """
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # Stays None when the loader yields no batches, so the summary line
        # below cannot hit an unbound `loss`.
        last_loss = None
        for q_texts, c_texts, times in tqdm(loader, desc=f"Temporal Epoch {epoch+1}"):
            q_inputs = q_tokenizer(
                q_texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)
            c_inputs = c_tokenizer(
                c_texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)
            times = times.to(device)

            q_emb, c_emb = model(q_inputs, c_inputs, times)

            # Row i of `scores` holds question i against every in-batch context;
            # the diagonal entry is the positive pair.
            scores = torch.matmul(q_emb, c_emb.T)
            labels = torch.arange(scores.size(0), device=scores.device)
            loss = loss_fn(scores, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            last_loss = loss.detach()

        if last_loss is None:
            print(f"Temporal Epoch {epoch+1} done. No batches were processed.")
        else:
            print(f"Temporal Epoch {epoch+1} done. Last batch loss: {last_loss.item():.4f}")

# ------------------------------------------------------------
# 7️⃣ Evaluation on full dataset
# ------------------------------------------------------------
def evaluate_global(q_model, c_model, loader, temporal_model=None):
    """Embed every batch of ``loader`` and score retrieval over the whole set.

    All question and context embeddings are concatenated, then each question
    is ranked against every context via ``compute_global_topk``.

    Relies on the module-level ``q_tokenizer``, ``c_tokenizer`` and ``device``.

    Args:
        q_model: Question encoder; used only when ``temporal_model`` is None.
        c_model: Context encoder; used only when ``temporal_model`` is None.
        loader: Iterable yielding ``(question_texts, context_texts, timestamps)``.
        temporal_model: Optional model called as
            ``temporal_model(q_inputs, c_inputs, times)`` and returning a
            ``(question_embeddings, context_embeddings)`` pair. When given, it
            is used instead of ``q_model``/``c_model``.

    Returns:
        ``(top1_accuracy, top5_accuracy)`` as Python floats.

    Raises:
        ValueError: If no temporal model is given and an encoder is missing,
            or if ``loader`` yields no batches.
    """
    # Compare against None explicitly: a module's truthiness can depend on
    # __len__ (e.g. container modules), which is not what is being asked here.
    use_temporal = temporal_model is not None
    if use_temporal:
        temporal_model.eval()
    else:
        if q_model is None or c_model is None:
            raise ValueError("q_model and c_model are required when temporal_model is None")
        q_model.eval()
        c_model.eval()

    all_q_embs = []
    all_c_embs = []

    with torch.no_grad():
        for q_texts, c_texts, times in tqdm(loader, desc="Embedding all batches"):
            q_inputs = q_tokenizer(
                q_texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)
            c_inputs = c_tokenizer(
                c_texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)

            if use_temporal:
                # Timestamps are only consumed by the temporal model.
                q_emb, c_emb = temporal_model(q_inputs, c_inputs, times.to(device))
            else:
                q_emb = q_model(**q_inputs).pooler_output
                c_emb = c_model(**c_inputs).pooler_output

            all_q_embs.append(q_emb)
            all_c_embs.append(c_emb)

    if not all_q_embs:
        raise ValueError("loader yielded no batches; nothing to evaluate")

    all_q_embs = torch.cat(all_q_embs, dim=0)
    all_c_embs = torch.cat(all_c_embs, dim=0)
    acc1 = compute_global_topk(all_q_embs, all_c_embs, k=1)
    acc5 = compute_global_topk(all_q_embs, all_c_embs, k=5)
    return acc1, acc5

# ------------------------------------------------------------
# 8️⃣ Instantiate models
# ------------------------------------------------------------
print("\n🧱 Initializing models...")
# Baseline: a plain question/context encoder pair, moved to the device directly.
baseline_q = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base").to(device)
baseline_c = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device)

# Temporal variant: a second, independently loaded encoder pair from the same
# checkpoints, wrapped in TemporalDPR; the wrapper is moved to the device as a whole.
temp_q = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
temp_c = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
temporal_model = TemporalDPR(temp_q, temp_c).to(device)

# ------------------------------------------------------------
# 9️⃣ Training
# ------------------------------------------------------------
# NOTE(review): the baseline is trained for 3 epochs and the temporal model
# for 6, so the two runs do not get the same training budget - confirm this
# asymmetry is intended before comparing the numbers below.
print("\n🧱 Training Baseline DPR...")
train_baseline(baseline_q, baseline_c, train_loader, epochs=3)

print("\n🕰️ Training Temporal DPR...")
train_temporal(temporal_model, train_loader, epochs=6)

# ------------------------------------------------------------
# 🔟 Evaluate
# ------------------------------------------------------------
# Both models are scored on the same held-out loader with the same metric.
print("\n📊 Global Evaluation Results on the Test Set:")
baseline_acc1, baseline_acc5 = evaluate_global(baseline_q, baseline_c, test_loader)
print(f"Baseline DPR: Top-1: {baseline_acc1:.3f} | Top-5: {baseline_acc5:.3f}")

# The encoder arguments are unused when a temporal model is supplied.
temporal_acc1, temporal_acc5 = evaluate_global(None, None, test_loader, temporal_model=temporal_model)
print(f"Temporal DPR: Top-1: {temporal_acc1:.3f} | Top-5: {temporal_acc5:.3f}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/449 [00:00<?, ?B/s]

train.json:   0%|          | 0.00/300M [00:00<?, ?B/s]

dev.json:   0%|          | 0.00/65.5M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/64.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28989 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6108 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6075 [00:00<?, ? examples/s]

Filter:   0%|          | 0/28989 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.



🧱 Initializing models...


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the 

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



🧱 Training Baseline DPR...



Baseline Epoch 1:   0%|          | 0/1400 [00:00<?, ?it/s][A
Baseline Epoch 1:   0%|          | 1/1400 [00:01<35:38,  1.53s/it][A
Baseline Epoch 1:   0%|          | 2/1400 [00:01<18:39,  1.25it/s][A
Baseline Epoch 1:   0%|          | 3/1400 [00:02<14:02,  1.66it/s][A
Baseline Epoch 1:   0%|          | 4/1400 [00:02<12:06,  1.92it/s][A
Baseline Epoch 1:   0%|          | 5/1400 [00:02<10:21,  2.24it/s][A
Baseline Epoch 1:   0%|          | 6/1400 [00:03<11:24,  2.04it/s][A
Baseline Epoch 1:   0%|          | 7/1400 [00:03<10:30,  2.21it/s][A
Baseline Epoch 1:   1%|          | 8/1400 [00:04<10:03,  2.31it/s][A
Baseline Epoch 1:   1%|          | 9/1400 [00:04<10:27,  2.22it/s][A
Baseline Epoch 1:   1%|          | 10/1400 [00:05<10:41,  2.17it/s][A
Baseline Epoch 1:   1%|          | 11/1400 [00:05<09:50,  2.35it/s][A
Baseline Epoch 1:   1%|          | 12/1400 [00:05<09:35,  2.41it/s][A
Baseline Epoch 1:   1%|          | 13/1400 [00:06<09:12,  2.51it/s][A
Baseline Epoch 1:   1%|

Baseline Epoch 1 done. Last batch loss: 0.0000


Baseline Epoch 2: 100%|██████████| 1400/1400 [09:30<00:00,  2.45it/s]


Baseline Epoch 2 done. Last batch loss: 0.0000


Baseline Epoch 3: 100%|██████████| 1400/1400 [09:30<00:00,  2.45it/s]


Baseline Epoch 3 done. Last batch loss: 0.0012

🕰️ Training Temporal DPR...


Temporal Epoch 1: 100%|██████████| 1400/1400 [09:32<00:00,  2.45it/s]


Temporal Epoch 1 done. Last batch loss: 0.1019


Temporal Epoch 2: 100%|██████████| 1400/1400 [09:35<00:00,  2.43it/s]


Temporal Epoch 2 done. Last batch loss: 0.0000


Temporal Epoch 3: 100%|██████████| 1400/1400 [09:33<00:00,  2.44it/s]


Temporal Epoch 3 done. Last batch loss: 0.0022


Temporal Epoch 4: 100%|██████████| 1400/1400 [09:33<00:00,  2.44it/s]


Temporal Epoch 4 done. Last batch loss: 0.0015


Temporal Epoch 5: 100%|██████████| 1400/1400 [09:34<00:00,  2.44it/s]


Temporal Epoch 5 done. Last batch loss: 0.0074


Temporal Epoch 6: 100%|██████████| 1400/1400 [09:34<00:00,  2.44it/s]


Temporal Epoch 6 done. Last batch loss: 0.0000

📊 Global Evaluation Results on the Test Set:


Embedding all batches: 100%|██████████| 350/350 [01:43<00:00,  3.39it/s]


Baseline DPR: Top-1: 0.700 | Top-5: 0.995


Embedding all batches: 100%|██████████| 350/350 [01:43<00:00,  3.37it/s]

Temporal DPR: Top-1: 0.600 | Top-5: 0.954





In [None]:
def compute_recall(q_embs, c_embs, k):
    """Return Recall@k of questions against all contexts.

    Every question row is scored against every context row by dot product.
    Question ``i`` counts as recalled when context index ``i`` is among its
    ``k`` highest-scoring contexts, i.e. rows of the two inputs are assumed to
    be aligned pairs.

    Args:
        q_embs: 2-D tensor of question embeddings, one row per question.
        c_embs: 2-D tensor of context embeddings, one row per context.
        k: Number of top-scoring contexts to inspect per question. Values
            larger than the number of contexts are clamped to that number.

    Returns:
        Fraction of questions whose paired context is in the top ``k``, as a
        Python float. ``0.0`` when there are no questions or no contexts.
    """
    scores = torch.matmul(q_embs, c_embs.T)
    num_queries, num_contexts = scores.size(0), scores.size(1)
    if num_queries == 0 or num_contexts == 0:
        return 0.0

    # torch.topk raises when k exceeds the size of the searched dimension.
    k = min(k, num_contexts)
    topk = scores.topk(k, dim=1).indices

    # Gold index for row i is i; create it directly on the scores' device.
    gold = torch.arange(num_queries, device=scores.device)
    recalled = (topk == gold.unsqueeze(1)).any(dim=1)
    return recalled.float().mean().item()

print("\n📊 Global Evaluation Results on the Test Set (Recall):")


def _embed_pairs(loader, q_model=None, c_model=None, temporal_model=None):
    """Embed every (question, context) pair of ``loader`` with one model.

    ``evaluate_global`` keeps its embeddings in local variables, so they are
    not available at module level; this helper recomputes them and returns
    them so different values of k can be scored on the same embeddings.

    Pass either ``temporal_model`` or both ``q_model`` and ``c_model``.
    Returns ``(question_embeddings, context_embeddings)`` with aligned rows.
    """
    use_temporal = temporal_model is not None
    for module in ([temporal_model] if use_temporal else [q_model, c_model]):
        module.eval()

    q_chunks, c_chunks = [], []
    with torch.no_grad():
        for q_texts, c_texts, times in tqdm(loader, desc="Embedding all batches"):
            q_inputs = q_tokenizer(
                q_texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)
            c_inputs = c_tokenizer(
                c_texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
            ).to(device)

            if use_temporal:
                q_emb, c_emb = temporal_model(q_inputs, c_inputs, times.to(device))
            else:
                q_emb = q_model(**q_inputs).pooler_output
                c_emb = c_model(**c_inputs).pooler_output

            q_chunks.append(q_emb)
            c_chunks.append(c_emb)

    return torch.cat(q_chunks, dim=0), torch.cat(c_chunks, dim=0)


# Calculate Recall@5 and Recall@10 for Baseline DPR on its own embeddings.
baseline_q_embs, baseline_c_embs = _embed_pairs(
    test_loader, q_model=baseline_q, c_model=baseline_c
)
baseline_recall5 = compute_recall(baseline_q_embs, baseline_c_embs, k=5)
baseline_recall10 = compute_recall(baseline_q_embs, baseline_c_embs, k=10)
print(f"Baseline DPR: Recall@5: {baseline_recall5:.3f} | Recall@10: {baseline_recall10:.3f}")

# Calculate Recall@5 and Recall@10 for Temporal DPR on separately computed
# embeddings, so the two models are never scored on the same tensors.
temporal_q_embs, temporal_c_embs = _embed_pairs(test_loader, temporal_model=temporal_model)
temporal_recall5 = compute_recall(temporal_q_embs, temporal_c_embs, k=5)
temporal_recall10 = compute_recall(temporal_q_embs, temporal_c_embs, k=10)
print(f"Temporal DPR: Recall@5: {temporal_recall5:.3f} | Recall@10: {temporal_recall10:.3f}")