### Setup

In [15]:
import os, sys, torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from omegaconf import DictConfig, OmegaConf
from tqdm import tqdm
from hydra import compose, initialize

In [2]:
# Put the parent directory (relative to the kernel's working directory) on the
# import path so the project-local `src` package imported below can be resolved.
# NOTE(review): re-running this cell appends the same entry again.
sys.path.append("../")
from src.models.model_head import HistoricalTextDatingModel, create_model_head_config
from src.utils import init_tracker, DataLoadAndFilter
import transformers

In [3]:
# Stop immediately, with an explicit message, if the defaults file is not where
# the notebook expects it to be.
config_path = "../configs/defaults.yaml"
if not os.path.exists(config_path):
    raise FileNotFoundError(f"Config path does not exist: {config_path}")

# Select the compute device: CUDA when PyTorch reports an available GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# Compose the Hydra configuration named "defaults" from the ../configs directory.
# The resulting `cfg` is what DataLoadAndFilter receives in the data-loading cell.
with initialize(version_base=None, config_path="../configs"):
    cfg = compose(config_name="defaults")


### Load model and data

In [5]:

# One checkpoint name feeds both the tokenizer and the masked-LM encoder, so
# the two can never be loaded from different models by accident.
CHECKPOINT = "google-bert/bert-base-multilingual-cased"
tokenizer = transformers.AutoTokenizer.from_pretrained(CHECKPOINT)
encoder = transformers.AutoModelForMaskedLM.from_pretrained(CHECKPOINT)

# Build the project data loader from the composed config and fetch both splits.
# base_path="../" presumably anchors the data paths at the parent directory -- confirm
# against DataLoadAndFilter.load_datasets.
data_loader = DataLoadAndFilter(cfg)
train_dataset, eval_dataset = data_loader.load_datasets(base_path="../")

Some weights of the model checkpoint at google-bert/bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Error loading schema ../data/raw/SefariaData/Sefaria-Export-master/schemas/Sheet.json: Expecting value: line 1 column 1 (char 0)
Loading Sefaria texts: 100%|██████████| 6549/6549 [00:13<00:00, 468.26it/s] 
Loading Ben Yehuda texts: 21208it [00:03, 6130.06it/s]
Loading Royal Society texts: 

### Evaluation
First we try a naive approach: completing a sentence with a single masked token, to evaluate the model one-shot.

In [6]:
def predict_masked_word(text: str, tokenizer, model, k=5):
    """
    Print and return the top-k vocabulary predictions for every mask token in `text`.

    text: input string; each occurrence of the tokenizer's mask token is predicted
    tokenizer: callable as `tokenizer(text, return_tensors="pt")`, returning a mapping
        with an "input_ids" tensor, and exposing `mask_token_id` and `decode`
    model: masked-language model called as `model(**inputs)`; its output must expose
        `.logits` indexable as [batch, position, vocab]
    k: number of highest-scoring tokens reported per mask

    Returns a list with one entry per mask token, in order of appearance. Each
    entry is the list of k decoded token strings, best first. The list is empty
    when `text` contains no mask token.
    """
    # Tokenize the input
    inputs = tokenizer(text, return_tensors="pt")
    # Column indices of every mask token; [1] selects the sequence-position axis
    mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    # Get model predictions without tracking gradients (inference only)
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits
    # For each mask token, get top-k predictions
    results = []
    for idx in mask_token_indices:
        # Only the first (and only) sequence in the batch is read
        mask_token_logits = predictions[0, idx, :]
        top_k_tokens = torch.topk(mask_token_logits, k, dim=0).indices.tolist()
        decoded = [tokenizer.decode([token_id]).strip() for token_id in top_k_tokens]
        print(f"Mask at position {idx.item()}: {decoded}")
        results.append(decoded)
    print("\n")
    # Previously the collected predictions were dropped by a bare `return`.
    return results


In [7]:
# Probe the encoder with a handful of factual prompts. Each prompt contains one
# or two mask tokens, and the helper prints its top predictions for every mask.
probe_prompts = [
    f"This text was written between the years {tokenizer.mask_token} to {tokenizer.mask_token}",
    f"Israel declared independence between the years {tokenizer.mask_token} and {tokenizer.mask_token}",
    f"The United States Of America declared independence between the years {tokenizer.mask_token} and {tokenizer.mask_token}",
    f"Barak Obama was born at {tokenizer.mask_token}",
]
for prompt in probe_prompts:
    predict_masked_word(prompt, tokenizer, encoder)

Mask at position 8: ['I', '1', '.', '2', '5']
Mask at position 10: ['.', ':', 'the', '-', 'a']


Mask at position 7: ['Israel', '1948', '1', '6', '9']
Mask at position 9: ['Israel', '1948', '1947', '1946', '1967']


Mask at position 11: ['2', '1', 'independence', '1918', '3']
Mask at position 13: ['1918', '1910', 'independence', '1919', '1914']


Mask at position 7: [':', '.', 'Obama', 'Karachi', 'Kabul']




Well, the results are bad (except, to our surprise, for the Israel prompt).   
Let's try ranking the possible dates instead.

In [14]:

def acc_at_k(preds, labels, K=1):
    """
    Windowed accuracy: a prediction is a hit when its class index lies within
    floor(K/2) positions of the true class index.

    preds: sequence of predicted class indices (ints)
    labels: sequence of true class indices (ints), aligned with `preds`
    K: window size; K=1 is exact match, K=3 tolerates one index either side, etc.

    Returns the fraction of hits as a Python float.
    """
    tolerance = K // 2
    index_gap = np.abs(np.asarray(preds) - np.asarray(labels))
    hits = index_gap <= tolerance
    return float(np.mean(hits))

In [8]:
def rank_candidates_by_mask_likelihood(sentence_with_mask, candidates, tokenizer, model):
    """
    Score candidate words for the single mask slot of a sentence and return them
    ordered from most to least likely.

    sentence_with_mask: text containing exactly one mask token (the position
        lookup calls `.item()`, which needs exactly one match)
    candidates: iterable of candidate strings
    tokenizer: tokenizer exposing `mask_token_id`, `tokenize` and `convert_tokens_to_ids`
    model: masked-language model whose output exposes `.logits`

    Returns a list of (candidate, score) tuples sorted by score, highest first;
    candidates with equal scores keep their input order. The score is the raw
    logit at the mask position for the FIRST word piece of the candidate only,
    so candidates that share a first word piece receive the same score.
    """
    encoded = tokenizer(sentence_with_mask, return_tensors="pt")
    is_mask = encoded["input_ids"] == tokenizer.mask_token_id
    # [1] picks the sequence-position axis of the (row, column) match coordinates
    mask_position = is_mask.nonzero(as_tuple=True)[1].item()

    with torch.no_grad():
        mask_logits = model(**encoded).logits[0, mask_position]

    def first_piece_score(candidate):
        # Look up the vocabulary id of the candidate's leading word piece
        leading_piece = tokenizer.tokenize(candidate)[0]
        return mask_logits[tokenizer.convert_tokens_to_ids(leading_piece)].item()

    scored = [(candidate, first_piece_score(candidate)) for candidate in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored


In [9]:

# Sanity check of the ranking helper on one prompt with five hand-picked years.
sentence = f"This text was written in {tokenizer.mask_token}."
candidates = ["1800", "1900", "2000", "1500", "2025"]
ranked = rank_candidates_by_mask_likelihood(sentence, candidates, tokenizer, encoder)
# Each entry is a (candidate, score) pair, highest score first.
print(ranked)

[('2000', 4.284444332122803), ('1900', 3.8327479362487793), ('1500', 3.5688230991363525), ('1800', 2.058945655822754), ('2025', 0.636199414730072)]


In [16]:
# Evaluate the pretrained encoder on the training split: append a templated
# sentence with one mask to each (truncated) document, rank every candidate date
# label by its logit at the mask, and compare the top-ranked label with the truth.

# Number of leading characters kept from each document so the prompt fits the
# model input size.
MAX_TEXT_CHARS = 500

# The candidate labels are the same for every document, so build the list once
# instead of on each loop iteration (assumes `unique_date_ranges` does not change
# while the dataset is being iterated).
candidates = list(map(str, data_loader.unique_date_ranges))

correct_count = 0
total_count = 0
top_preds = []    # index in `candidates` of the top-ranked label, per document
true_labels = []  # index in `candidates` of the true label, per document

for item in tqdm(train_dataset):
    text = item['text']
    year = item['comp_date']
    text = text[:MAX_TEXT_CHARS]  # Truncate to fit model input size
    sentence = f"{text}. This text was written in year {tokenizer.mask_token}."
    ranked = rank_candidates_by_mask_likelihood(sentence, candidates, tokenizer, encoder)
    top_choice = ranked[0][0]
    top_preds.append(candidates.index(top_choice))
    # Raises ValueError if the document's label is not among the candidates
    true_labels.append(candidates.index(str(year)))
    if top_choice == str(year):
        correct_count += 1
    total_count += 1

# Guard the ratio so an empty dataset reports nan instead of raising ZeroDivisionError
accuracy = correct_count / total_count if total_count else float("nan")
print(f"Accuracy: {correct_count}/{total_count} = {accuracy:.2%}")
print(f"Acc@1: {acc_at_k(top_preds, true_labels, K=1):.2%}")
print(f"Acc@3: {acc_at_k(top_preds, true_labels, K=3):.2%}")
print(f"Acc@5: {acc_at_k(top_preds, true_labels, K=5):.2%}")

100%|██████████| 11710/11710 [26:14<00:00,  7.44it/s]  

Accuracy: 511/11710 = 4.36%
Acc@1: 4.36%
Acc@3: 9.72%
Acc@5: 10.63%





Over the train dataset:   
Accuracy: 511/11710 = 4.36%

In [12]:
# Chance-level accuracy in percent: one uniform guess over all candidate date ranges.
100 / len(data_loader.unique_date_ranges)

0.625