### Setup

In [11]:
import os, sys, torch
from transformers import AutoModel, AutoTokenizer
from omegaconf import DictConfig, OmegaConf
from tqdm import tqdm
from hydra import compose, initialize
import itertools


In [2]:
sys.path.append("../")
from src.models.model_head import HistoricalTextDatingModel, create_model_head_config
from src.utils import init_tracker, DataLoadAndFilter
import transformers

In [3]:
config_path = "../configs/defaults.yaml"
if not os.path.exists(config_path):
    raise FileNotFoundError(f"Config path does not exist: {config_path}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
with initialize(version_base=None, config_path="../configs"):
    cfg = compose(config_name="defaults")


### Load model and data

In [5]:

# Load the model and tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")
encoder = transformers.AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-multilingual-cased")

# Load datasets using the data loader
data_loader = DataLoadAndFilter(cfg)
train_dataset, eval_dataset = data_loader.load_datasets(base_path="../")

Some weights of the model checkpoint at google-bert/bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Error loading schema ../data/raw/SefariaData/Sefaria-Export-master/schemas/Sheet.json: Expecting value: line 1 column 1 (char 0)
Loading Sefaria texts: 100%|██████████| 6549/6549 [00:14<00:00, 460.24it/s] 
Loading Ben Yehuda texts: 21208it [00:03, 5604.92it/s]
Loading Royal Society texts: 

### Evaluation


In [7]:
def rank_candidates_by_mask_likelihood(sentence_with_mask, candidates, tokenizer, model):
    """
    Given a sentence with one mask token and a list of candidate words,
    returns the candidates sorted by model likelihood for the mask position.
    """
    # Tokenize input
    inputs = tokenizer(sentence_with_mask, return_tensors="pt")
    mask_token_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1].item()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[0, mask_token_index]
    # Get scores for each candidate
    candidate_scores = []
    for word in candidates:
        token_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word)[0])
        score = logits[token_id].item()
        candidate_scores.append((word, score))
    # Sort by score descending
    return sorted(candidate_scores, key=lambda x: x[1], reverse=True)


In [8]:
sentence = f"This text was written in {tokenizer.mask_token}."
candidates = ["1800", "1900", "2000", "1500", "2025"]
ranked = rank_candidates_by_mask_likelihood(sentence, candidates, tokenizer, encoder)
print(ranked)

[('2000', 4.284444332122803), ('1900', 3.8327479362487793), ('1500', 3.5688230991363525), ('1800', 2.058945655822754), ('2025', 0.636199414730072)]


In [18]:

def batched(iterable, n):
    """Yield batches of n items from iterable."""
    it = iter(iterable)
    while True:
        batch = list(itertools.islice(it, n))
        if not batch:
            break
        yield batch

In [30]:
correct_count = 0
total_count = 0
candidates = list(map(str, data_loader.unique_date_ranges))
question = f"This text was written in year"
seperator = "\n"
question_format = "{text}. {question} {year}."
for items in tqdm(batched(train_dataset, 3)):
    if len(items) < 3:
        continue
    examples = items[:2]
    test = items[2]
    test['text'] = test['text'][:500 // 3]  # Truncate to fit model input size
    exapmle_strings = []
    for item in examples:
        text = item['text']
        year = item['comp_date']
        text = text[:500 // 3]  # Truncate to fit model input size
        sentence = question_format.format(text=text, question=question, year=year)
        exapmle_strings.append(sentence)
    exapmle_strings.append(question_format.format(text=test['text'], question=question, year=tokenizer.mask_token))
    ranked = rank_candidates_by_mask_likelihood(seperator.join(exapmle_strings), candidates, tokenizer, encoder)
    top_choice = ranked[0][0]
    if top_choice == str(year):
        correct_count += 1
    total_count += 1
print(f"Accuracy: {correct_count}/{total_count} = {correct_count/total_count:.2%}")

3904it [06:13, 10.45it/s]

Accuracy: 3298/3903 = 84.50%





Over train dataset:   
Accuracy: 3298/3903 = 84.50%

In [31]:
100 / (len(data_loader.unique_date_ranges) // 3)

1.8867924528301887