In [1]:
import re
import os
import unicodedata
import evaluate
import pandas as pd
import itertools

import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    BertTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForMaskedLM,
    AutoModel,
    BertForMaskedLM
)
from sentence_transformers import SentenceTransformer, models

from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def batched(iterable, n=1):
    """Yield consecutive slices of ``iterable`` containing at most ``n`` items.

    ``iterable`` has to support ``len()`` and slicing (a list, tuple, string,
    ...). The last slice is shorter when the length is not a multiple of ``n``.
    """
    total = len(iterable)
    for start in range(0, total, n):
        # Slicing already clamps the upper bound to the sequence length.
        yield iterable[start:start + n]

In [3]:
def normalize_text(text):
    """Return ``text`` with Unicode and whitespace brought to a canonical form.

    Non-breaking spaces become regular spaces, the string is NFKC-normalised,
    surrounding whitespace is trimmed, and every run of whitespace (including
    newlines and tabs) is collapsed into a single space.
    """
    # Replace non-breaking spaces first so they are treated as ordinary spaces.
    without_nbsp = text.replace('\xa0', ' ')
    # Unicode compatibility normalisation, then trim both ends.
    canonical = unicodedata.normalize('NFKC', without_nbsp).strip()
    # Collapse inner whitespace runs (spaces, tabs, newlines) into one space.
    return re.sub(r'\s+', ' ', canonical)

def _read_random_sentences(language):
    """Return the raw lines of ``data/<language>_random_sentences.txt``."""
    # Explicit UTF-8 so the (non-ASCII) text decodes identically on every
    # platform instead of depending on the locale default.
    # NOTE(review): assumes the files are UTF-8, like the benchmark files
    # that pandas reads with its UTF-8 default -- confirm.
    with open(f"data/{language}_random_sentences.txt", "r", encoding="utf-8") as f:
        return f.readlines()


def _read_benchmark(language, split):
    """Load the tab-separated benchmark file of ``language`` for ``split``."""
    suffix = "_benchmark_parallel.txt" if split == "gold" else "_benchmark.txt"
    return pd.read_csv(f"data/{language.capitalize()}{suffix}", sep="\t")


def load_data(lang, split, setting):
    """Build queries, the retrieval pool and the qrels table for one experiment.

    Args:
        lang: "latin", "greek", "latin+greek" or "greek+latin". For the
            combined values the language named first supplies the targets and
            the random sentences, while the other one supplies the queries.
        split: "silver" (``*_benchmark.txt``) or "gold"
            (``*_benchmark_parallel.txt``).
        setting: "target" (pool = the targets of every query) or
            "target+random" (pool = each query's own targets plus the random
            sentences).

    Returns:
        A tuple ``(queries, data_pool, df, name_pool)``:
        ``queries`` is the "Query" column as a list (returned as read, not
        normalised); ``data_pool`` holds the normalised pool texts;
        ``name_pool`` holds the document id of each ``data_pool`` entry, in
        the same order; ``df`` has one row per (query, document) pair with
        the columns ``system``, ``query``, ``q0``, ``docid`` and ``rel``.

    Raises:
        AssertionError: if ``setting`` is not one of the two supported values.
        ValueError: if ``lang`` or ``split`` is not supported.
    """
    assert setting in ["target", "target+random"]
    possible_langs = ["latin", "greek", "latin+greek", "greek+latin"]
    possible_splits = ["silver", "gold"]
    if lang not in possible_langs:
        raise ValueError(f"Invalid language setting. Choose from {possible_langs}.")
    # An unknown split used to surface later as an UnboundLocalError.
    if split not in possible_splits:
        raise ValueError(f"Invalid split {split!r}. Choose from {possible_splits}.")

    target_columns = [f"Target #{i}" for i in range(1, 6)]

    # "latin+greek" -> pool language "latin", query language "greek";
    # a single language leaves query_lang empty.
    pool_lang, _, query_lang = lang.partition("+")
    random_1k = _read_random_sentences(pool_lang)
    if not query_lang:
        qnr_df = _read_benchmark(pool_lang, split)
    else:
        qnr_df = pd.concat(
            [
                _read_benchmark(query_lang, split).loc[:, ["Query"]],
                _read_benchmark(pool_lang, split).loc[:, target_columns],
            ],
            axis=1,
        )

    queries = qnr_df["Query"].tolist()
    # Extracted once: one list of five targets per query row.
    target_rows = qnr_df[target_columns].values.tolist()

    system = f"{lang}_{split}_{setting}"
    with_random = setting == "target+random"

    rows = []
    data_pool = []
    name_pool = []
    for idx in range(len(queries)):
        if with_random:
            # Only the query's own targets are candidates in this setting.
            candidates = [(idx, target_rows[idx])]
        else:
            candidates = enumerate(target_rows)
        # In "target" every query shares the same pool, so it is filled once;
        # in "target+random" each query contributes its own targets.
        fill_pool = with_random or idx == 0

        for jdx, targets in candidates:
            for tdx, target in enumerate(targets):
                item_name = f"{jdx}_{tdx}_target"
                rows.append({
                    "system": system,
                    "query": idx,
                    "q0": 0,
                    "docid": item_name,
                    "rel": 1 if jdx == idx else 0,
                })
                if fill_pool:
                    data_pool.append(normalize_text(target))
                    name_pool.append(item_name)

        if with_random:
            for jdx, text in enumerate(random_1k):
                item_name = f"{jdx}_random"
                rows.append({
                    "system": system,
                    "query": idx,
                    "q0": 0,
                    "docid": item_name,
                    "rel": 0,
                })
                if idx == 0:
                    # The random sentences enter the pool only once.
                    data_pool.append(normalize_text(text.strip()))
                    name_pool.append(item_name)

    df = pd.DataFrame(rows)

    return queries, data_pool, df, name_pool


In [4]:
def get_model(model_type, model_name, device=None):
    """Load a model (and its tokenizer, if any) and move the model to ``device``.

    Args:
        model_type: one of "roberta" (``AutoModel``), "sentence-transformers"
            (``SentenceTransformer``, no tokenizer), "bert"
            (``BertForMaskedLM``) or "t5" (encoder of ``AutoModelForSeq2SeqLM``).
        model_name: identifier passed to the ``from_pretrained`` /
            ``SentenceTransformer`` constructors.
        device: target device; when ``None``, "cuda" is used if available,
            otherwise "cpu".

    Returns:
        ``(model, tokenizer)``; ``tokenizer`` is ``None`` for
        "sentence-transformers".

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # load tokenizer and model
    if model_type == "roberta":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)

    elif model_type == "sentence-transformers":
        model = SentenceTransformer(model_name).to(device)
        tokenizer = None

    elif model_type == "bert":
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForMaskedLM.from_pretrained(model_name).to(device)

    elif model_type == "t5":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).get_encoder().to(device)

    else:
        # Previously an unknown type fell through to the return statement and
        # failed with an UnboundLocalError.
        raise ValueError(
            f"Model type {model_type} not recognized, must be one of "
            "'roberta', 'sentence-transformers', 'bert', 't5'."
        )
    return model, tokenizer


In [5]:
def _mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings along dim 1, weighting positions by the mask."""
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # The clamp avoids a division by zero for rows whose mask is all zeros.
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def encode(model, tokenizer, texts, model_type, pooling_type="cls", batch_size=32, max_length=512):
    """Encode ``texts`` in batches and return one embedding row per text.

    Args:
        model: a ``SentenceTransformer`` when ``model_type`` is
            "sentence-transformers"; otherwise a model that is called with the
            tokenizer output and ``output_hidden_states=True``.
        tokenizer: tokenizer used for "roberta"; unused for
            "sentence-transformers".
        texts: sequence of strings (must support ``len()`` and slicing).
        model_type: "sentence-transformers" or "roberta".
        pooling_type: "cls" (first token of the last hidden state) or "mean"
            (attention-mask-weighted mean of the last hidden state). Only used
            for "roberta".
        batch_size: number of texts per forward pass.
        max_length: truncation length passed to the tokenizer.

    Returns:
        A NumPy array with the per-batch embeddings concatenated along axis 0.

    Raises:
        AssertionError: if ``pooling_type`` is not "cls" or "mean".
        ValueError: if ``model_type`` is unsupported, ``batch_size`` is not
            positive, or ``texts`` is empty.
    """
    assert pooling_type in ["cls", "mean"]
    # Fail early: other model types used to leave `embeddings` unbound.
    if model_type not in ("sentence-transformers", "roberta"):
        raise ValueError(
            f"Model type {model_type} not supported by encode, must be one of "
            "'sentence-transformers', 'roberta'."
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")
    if len(texts) == 0:
        # np.concatenate would fail on an empty list with a less helpful message.
        raise ValueError("Cannot encode an empty collection of texts.")

    # Ceiling division, so the progress bar total is exact even when
    # len(texts) is a multiple of batch_size.
    n_batches = (len(texts) + batch_size - 1) // batch_size

    out = []
    for text_batch in tqdm(batched(texts, n=batch_size), total=n_batches, desc="Encoding texts"):
        with torch.no_grad():
            if model_type == "sentence-transformers":
                embeddings = model.encode(text_batch)
            else:
                batch = tokenizer(
                    text_batch,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=max_length,
                ).to(model.device)
                outputs = model(**batch, output_hidden_states=True)
                last_hidden = outputs.hidden_states[-1]
                if pooling_type == "cls":
                    # Embedding of the first (CLS) token.
                    pooled = last_hidden[:, 0, :]
                elif pooling_type == "mean":
                    pooled = _mean_pooling(last_hidden, batch["attention_mask"])
                else:
                    raise ValueError(f"Pooling type {pooling_type} not recognized, must be one of 'cls', 'mean'.")
                embeddings = pooled.cpu().numpy()
        out.append(embeddings)
    return np.concatenate(out, axis=0)

In [6]:
def compute_cosine_similarity(a, b):
    """Return the pairwise cosine similarity between the rows of ``a`` and ``b``.

    Args:
        a: 2-D array-like with one vector per row.
        b: 2-D array-like with one vector per row and the same number of
            columns as ``a``.

    Returns:
        Array of shape ``(len(a), len(b))``. A row with zero norm gets a
        similarity of 0 against everything instead of NaN.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    a_norm = np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = np.linalg.norm(b, axis=1, keepdims=True)
    # Dividing a zero vector by 1 keeps it zero and avoids 0/0 -> NaN.
    a_norm[a_norm == 0] = 1
    b_norm[b_norm == 0] = 1
    similarity_score = np.dot(a / a_norm, (b / b_norm).T)
    return similarity_score

In [7]:
def prepare_df(df, name_pool, query_pool_similarity, model_args):
    """Derive the qrels table and the scored run table for one experiment.

    Args:
        df: frame with the columns ``system``, ``query``, ``q0``, ``docid``
            (and optionally ``rel``); it is not modified.
        name_pool: document ids; position ``i`` corresponds to column ``i`` of
            ``query_pool_similarity``.
        query_pool_similarity: 2-D array indexed as ``[query, pool position]``.
        model_args: mapping with a ``"model_name"`` entry, appended to the
            ``system`` label with "/" replaced by "_".

    Returns:
        ``(qrels, run)``: ``qrels`` is ``df`` without ``system`` and with
        ``q0`` as strings; ``run`` drops ``rel``, adds ``score`` and a
        zero-based per-query ``rank`` (highest score first, ties in order of
        appearance), is sorted by query and rank, and has ``system`` last.

    Raises:
        ValueError: if a ``docid`` does not occur in ``name_pool``.
    """
    # Work on a copy so the caller's frame is left untouched.
    base = df.copy()
    base["q0"] = base["q0"].astype(str)
    qrels = base.drop("system", axis=1)
    newdf = base.copy()

    newdf["system"] = newdf["system"].astype(str) + ("_" + model_args["model_name"].replace("/", "_"))

    # One dictionary lookup per row instead of a linear list.index() scan;
    # setdefault keeps the first position, matching list.index().
    position_of = {}
    for position, name in enumerate(name_pool):
        position_of.setdefault(name, position)
    doc_positions = newdf["docid"].map(position_of)
    unknown = doc_positions.isna()
    if unknown.any():
        missing = newdf.loc[unknown, "docid"].unique().tolist()
        raise ValueError(f"docid(s) not present in name_pool: {missing[:5]}")

    similarity = np.asarray(query_pool_similarity)
    scores = similarity[
        newdf["query"].to_numpy(dtype=int),
        doc_positions.to_numpy(dtype=int),
    ]

    # Rank on the float32 values, then expose them as nullable Float32.
    newdf["score"] = scores.astype("float32")
    # Ranks restart at 0 for every query; they used to be global positions
    # across all queries.
    newdf["rank"] = (
        newdf.groupby("query")["score"].rank(method="first", ascending=False).astype(int) - 1
    )
    newdf["score"] = newdf["score"].astype("Float32")
    if "rel" in newdf.columns:
        newdf = newdf.drop("rel", axis=1)

    newdf = newdf.sort_values(by=["query", "rank"]).reset_index(drop=True)
    newdf = newdf.loc[:, [i for i in newdf.columns if i != "system"] + ["system"]]
    return qrels, newdf

In [8]:
def run_experiments(experiments, device=None):
    """Run every retrieval experiment and collect runs and qrels.

    For each ``(lang, split, setting, model_args)`` tuple the data is loaded,
    queries and pool are embedded, cosine similarities are computed, and the
    qrels / run tables are written to ``qrels/`` and ``runs/`` as TSV files.

    Args:
        experiments: iterable of ``(lang, split, setting, model_args)`` where
            ``model_args`` holds ``"model_type"`` and ``"model_name"``.
        device: device passed to ``get_model``. When ``None``, "cuda:2" is
            used if at least three CUDA devices are visible (the historical
            default); otherwise ``get_model`` picks the device itself.

    Returns:
        ``(predictions, references)``: two lists with, per experiment, the run
        table and the qrels table as column-oriented dictionaries.
    """
    if device is None and torch.cuda.is_available() and torch.cuda.device_count() > 2:
        # Keep the GPU index that used to be hard-coded, but only where it exists.
        device = "cuda:2"

    # Expected (number of queries, pool size) per (split, setting).
    expected_sizes = {
        ("silver", "target+random"): (100, 1500),
        ("silver", "target"): (100, 500),
        ("gold", "target+random"): (20, 1100),
        ("gold", "target"): (20, 100),
    }

    os.makedirs("qrels", exist_ok=True)
    os.makedirs("runs", exist_ok=True)

    predictions = []
    references = []
    for lang, split, setting, model_args in tqdm(experiments):
        print(f"Running experiment for {lang} {split} {setting} with model {model_args['model_name']}")
        queries, data_pool, df, name_pool = load_data(lang, split, setting=setting)

        # check data format
        expected = expected_sizes.get((split, setting))
        if expected is not None:
            n_queries, pool_size = expected
            assert len(queries) == n_queries
            assert len(data_pool) == pool_size, f"{len(data_pool)}"

        model, tokenizer = get_model(device=device, **model_args)

        # The two monolingual checkpoints use CLS pooling; everything else mean pooling.
        pooling_type = "cls" if model_args["model_name"] in ["bowphs/GreBerta", "bowphs/LaBerta"] else "mean"
        encoded_queries = encode(model, tokenizer, queries, model_type=model_args["model_type"], pooling_type=pooling_type)
        encoded_pool = encode(model, tokenizer, data_pool, model_type=model_args["model_type"], pooling_type=pooling_type)
        query_pool_similarity = compute_cosine_similarity(encoded_queries, encoded_pool)
        assert query_pool_similarity.shape == (len(queries), len(data_pool)), f"{query_pool_similarity.shape} != {(len(queries), len(data_pool))}"

        df, newdf = prepare_df(df, name_pool, query_pool_similarity, model_args)

        file_stem = f"{lang}_{split}_{setting}_{model_args['model_name'].replace('/', '_')}"
        df.to_csv(f"qrels/{file_stem}.tsv", sep="\t", index=False)
        newdf.to_csv(f"runs/{file_stem}_run.tsv", sep="\t", index=False)

        predictions.append(newdf.to_dict(orient="list"))
        references.append(df.to_dict(orient="list"))
    return predictions, references



In [9]:
# Checkpoints to evaluate; all of them are loaded through the "roberta" path.
pretrained_model_args = [
    {"model_type": "roberta", "model_name": "bowphs/GreBerta"},
    {"model_type": "roberta", "model_name": "bowphs/LaBerta"},
    {"model_type": "roberta", "model_name": "bowphs/PhilBERTa"},
    {"model_type": "roberta", "model_name": "bowphs/SPhilBERTa"},
]
langs = ["latin", "greek", "latin+greek", "greek+latin"]
splits = ["silver", "gold"]
settings = ["target", "target+random"]

# Full grid: every language x split x setting x model combination.
pretrained_experiments = list(
    itertools.product(langs, splits, settings, pretrained_model_args)
)

predictions, references = run_experiments(pretrained_experiments)

  0%|          | 0/64 [00:00<?, ?it/s]

Running experiment for latin silver target with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.37it/s]
Encoding texts: 100%|██████████| 16/16 [00:07<00:00,  2.21it/s]
  2%|▏         | 1/64 [00:17<17:51, 17.01s/it]

Running experiment for latin silver target with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  5.54it/s]
Encoding texts: 100%|██████████| 16/16 [00:04<00:00,  3.63it/s]
  3%|▎         | 2/64 [00:28<14:21, 13.89s/it]

Running experiment for latin silver target with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  5.13it/s]
Encoding texts: 100%|██████████| 16/16 [00:04<00:00,  3.64it/s]
  5%|▍         | 3/64 [00:41<13:25, 13.21s/it]

Running experiment for latin silver target with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  5.12it/s]
Encoding texts: 100%|██████████| 16/16 [00:04<00:00,  3.72it/s]
  6%|▋         | 4/64 [00:53<12:44, 12.74s/it]

Running experiment for latin silver target+random with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.32it/s]
Encoding texts: 100%|██████████| 47/47 [01:07<00:00,  1.44s/it]
  8%|▊         | 5/64 [02:12<36:08, 36.75s/it]

Running experiment for latin silver target+random with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  5.13it/s]
Encoding texts: 100%|██████████| 47/47 [01:03<00:00,  1.35s/it]
  9%|▉         | 6/64 [03:25<47:27, 49.10s/it]

Running experiment for latin silver target+random with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  5.55it/s]
Encoding texts: 100%|██████████| 47/47 [01:03<00:00,  1.35s/it]
 11%|█         | 7/64 [04:41<55:01, 57.93s/it]

Running experiment for latin silver target+random with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  5.28it/s]
Encoding texts: 100%|██████████| 47/47 [01:03<00:00,  1.35s/it]
 12%|█▎        | 8/64 [05:56<59:08, 63.36s/it]

Running experiment for latin gold target with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  2.48it/s]
Encoding texts: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]
 14%|█▍        | 9/64 [06:03<41:45, 45.56s/it]

Running experiment for latin gold target with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  4.04it/s]
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.25it/s]
 16%|█▌        | 10/64 [06:07<29:42, 33.01s/it]

Running experiment for latin gold target with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  4.41it/s]
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.14it/s]
 17%|█▋        | 11/64 [06:13<21:38, 24.51s/it]

Running experiment for latin gold target with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.69it/s]
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.17it/s]
 19%|█▉        | 12/64 [06:18<16:12, 18.70s/it]

Running experiment for latin gold target+random with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
Encoding texts: 100%|██████████| 35/35 [01:02<00:00,  1.78s/it]
 20%|██        | 13/64 [07:25<28:17, 33.28s/it]

Running experiment for latin gold target+random with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  4.10it/s]
Encoding texts: 100%|██████████| 35/35 [01:01<00:00,  1.76s/it]
 22%|██▏       | 14/64 [08:32<36:07, 43.34s/it]

Running experiment for latin gold target+random with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  4.83it/s]
Encoding texts: 100%|██████████| 35/35 [01:02<00:00,  1.77s/it]
 23%|██▎       | 15/64 [09:39<41:21, 50.64s/it]

Running experiment for latin gold target+random with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.82it/s]
Encoding texts: 100%|██████████| 35/35 [01:01<00:00,  1.77s/it]
 25%|██▌       | 16/64 [10:47<44:37, 55.78s/it]

Running experiment for greek silver target with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.92it/s]
Encoding texts: 100%|██████████| 16/16 [00:05<00:00,  2.86it/s]
 27%|██▋       | 17/64 [11:00<33:41, 43.01s/it]

Running experiment for greek silver target with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.43it/s]
Encoding texts: 100%|██████████| 16/16 [00:09<00:00,  1.70it/s]
 28%|██▊       | 18/64 [11:18<27:11, 35.47s/it]

Running experiment for greek silver target with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.68it/s]
Encoding texts: 100%|██████████| 16/16 [00:05<00:00,  2.82it/s]
 30%|██▉       | 19/64 [11:32<21:48, 29.07s/it]

Running experiment for greek silver target with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.88it/s]
Encoding texts: 100%|██████████| 16/16 [00:05<00:00,  2.82it/s]
 31%|███▏      | 20/64 [11:46<17:56, 24.45s/it]

Running experiment for greek silver target+random with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  4.05it/s]
Encoding texts: 100%|██████████| 47/47 [00:44<00:00,  1.06it/s]
 33%|███▎      | 21/64 [12:42<24:21, 34.00s/it]

Running experiment for greek silver target+random with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.55it/s]
Encoding texts: 100%|██████████| 47/47 [00:58<00:00,  1.25s/it]
 34%|███▍      | 22/64 [13:53<31:28, 44.97s/it]

Running experiment for greek silver target+random with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  4.45it/s]
Encoding texts: 100%|██████████| 47/47 [00:45<00:00,  1.03it/s]
 36%|███▌      | 23/64 [14:49<32:59, 48.27s/it]

Running experiment for greek silver target+random with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.86it/s]
Encoding texts: 100%|██████████| 47/47 [00:45<00:00,  1.04it/s]
 38%|███▊      | 24/64 [15:46<33:54, 50.87s/it]

Running experiment for greek gold target with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  5.24it/s]
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.73it/s]
 39%|███▉      | 25/64 [15:50<23:59, 36.92s/it]

Running experiment for greek gold target with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]
Encoding texts: 100%|██████████| 4/4 [00:02<00:00,  1.61it/s]
 41%|████      | 26/64 [15:55<17:24, 27.49s/it]

Running experiment for greek gold target with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  5.21it/s]
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.55it/s]
 42%|████▏     | 27/64 [16:00<12:47, 20.73s/it]

Running experiment for greek gold target with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  4.74it/s]
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.65it/s]
 44%|████▍     | 28/64 [16:05<09:34, 15.96s/it]

Running experiment for greek gold target+random with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  6.20it/s]
Encoding texts: 100%|██████████| 35/35 [00:39<00:00,  1.13s/it]
 45%|████▌     | 29/64 [16:49<14:13, 24.40s/it]

Running experiment for greek gold target+random with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  2.86it/s]
Encoding texts: 100%|██████████| 35/35 [00:52<00:00,  1.51s/it]
 47%|████▋     | 30/64 [17:47<19:25, 34.27s/it]

Running experiment for greek gold target+random with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  5.23it/s]
Encoding texts: 100%|██████████| 35/35 [00:47<00:00,  1.36s/it]
 48%|████▊     | 31/64 [18:39<21:48, 39.66s/it]

Running experiment for greek gold target+random with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
Encoding texts: 100%|██████████| 35/35 [00:51<00:00,  1.47s/it]
 50%|█████     | 32/64 [19:36<23:54, 44.84s/it]

Running experiment for latin+greek silver target with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.55it/s]
Encoding texts: 100%|██████████| 16/16 [00:08<00:00,  1.90it/s]
 52%|█████▏    | 33/64 [19:52<18:45, 36.31s/it]

Running experiment for latin+greek silver target with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.11it/s]
Encoding texts: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]
 53%|█████▎    | 34/64 [20:06<14:47, 29.57s/it]

Running experiment for latin+greek silver target with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.26it/s]
Encoding texts: 100%|██████████| 16/16 [00:05<00:00,  3.15it/s]
 55%|█████▍    | 35/64 [20:20<11:58, 24.78s/it]

Running experiment for latin+greek silver target with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.20it/s]
Encoding texts: 100%|██████████| 16/16 [00:05<00:00,  3.13it/s]
 56%|█████▋    | 36/64 [20:34<10:08, 21.73s/it]

Running experiment for latin+greek silver target+random with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.41it/s]
Encoding texts: 100%|██████████| 47/47 [01:17<00:00,  1.65s/it]
 58%|█████▊    | 37/64 [22:04<18:58, 42.17s/it]

Running experiment for latin+greek silver target+random with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.16it/s]
Encoding texts: 100%|██████████| 47/47 [01:13<00:00,  1.56s/it]
 59%|█████▉    | 38/64 [23:31<24:04, 55.56s/it]

Running experiment for latin+greek silver target+random with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.30it/s]
Encoding texts: 100%|██████████| 47/47 [01:13<00:00,  1.57s/it]
 61%|██████    | 39/64 [24:57<26:59, 64.77s/it]

Running experiment for latin+greek silver target+random with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  3.36it/s]
Encoding texts: 100%|██████████| 47/47 [01:13<00:00,  1.57s/it]
 62%|██████▎   | 40/64 [26:23<28:24, 71.02s/it]

Running experiment for latin+greek gold target with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  5.11it/s]
Encoding texts: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]
 64%|██████▍   | 41/64 [26:30<19:52, 51.83s/it]

Running experiment for latin+greek gold target with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  2.22it/s]
Encoding texts: 100%|██████████| 4/4 [00:02<00:00,  1.96it/s]
 66%|██████▌   | 42/64 [26:35<13:55, 37.96s/it]

Running experiment for latin+greek gold target with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  4.04it/s]
Encoding texts: 100%|██████████| 4/4 [00:02<00:00,  1.86it/s]
 67%|██████▋   | 43/64 [26:41<09:55, 28.35s/it]

Running experiment for latin+greek gold target with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.76it/s]
Encoding texts: 100%|██████████| 4/4 [00:02<00:00,  1.86it/s]
 69%|██████▉   | 44/64 [26:47<07:12, 21.64s/it]

Running experiment for latin+greek gold target+random with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.95it/s]
Encoding texts: 100%|██████████| 35/35 [01:12<00:00,  2.07s/it]
 70%|███████   | 45/64 [28:05<12:09, 38.41s/it]

Running experiment for latin+greek gold target+random with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  2.20it/s]
Encoding texts: 100%|██████████| 35/35 [01:12<00:00,  2.08s/it]
 72%|███████▏  | 46/64 [29:23<15:07, 50.42s/it]

Running experiment for latin+greek gold target+random with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
Encoding texts: 100%|██████████| 35/35 [01:13<00:00,  2.09s/it]
 73%|███████▎  | 47/64 [30:42<16:42, 58.94s/it]

Running experiment for latin+greek gold target+random with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  4.16it/s]
Encoding texts: 100%|██████████| 35/35 [01:12<00:00,  2.08s/it]
 75%|███████▌  | 48/64 [32:01<17:18, 64.88s/it]

Running experiment for greek+latin silver target with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.70it/s]
Encoding texts: 100%|██████████| 16/16 [00:06<00:00,  2.45it/s]
 77%|███████▋  | 49/64 [32:16<12:29, 49.94s/it]

Running experiment for greek+latin silver target with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  4.51it/s]
Encoding texts: 100%|██████████| 16/16 [00:10<00:00,  1.46it/s]
 78%|███████▊  | 50/64 [32:35<09:29, 40.68s/it]

Running experiment for greek+latin silver target with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  4.45it/s]
Encoding texts: 100%|██████████| 16/16 [00:06<00:00,  2.35it/s]
 80%|███████▉  | 51/64 [32:50<07:09, 33.05s/it]

Running experiment for greek+latin silver target with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  4.32it/s]
Encoding texts: 100%|██████████| 16/16 [00:06<00:00,  2.37it/s]
 81%|████████▏ | 52/64 [33:06<05:33, 27.75s/it]

Running experiment for greek+latin silver target+random with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.69it/s]
Encoding texts: 100%|██████████| 47/47 [00:56<00:00,  1.21s/it]
 83%|████████▎ | 53/64 [34:15<07:24, 40.36s/it]

Running experiment for greek+latin silver target+random with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  4.24it/s]
Encoding texts: 100%|██████████| 47/47 [01:17<00:00,  1.64s/it]
 84%|████████▍ | 54/64 [35:45<09:10, 55.04s/it]

Running experiment for greek+latin silver target+random with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  4.25it/s]
Encoding texts: 100%|██████████| 47/47 [00:58<00:00,  1.24s/it]
 86%|████████▌ | 55/64 [36:56<08:59, 59.95s/it]

Running experiment for greek+latin silver target+random with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 4/4 [00:00<00:00,  4.26it/s]
Encoding texts: 100%|██████████| 47/47 [00:58<00:00,  1.24s/it]
 88%|████████▊ | 56/64 [38:07<08:25, 63.20s/it]

Running experiment for greek+latin gold target with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  2.03it/s]
Encoding texts: 100%|██████████| 4/4 [00:01<00:00,  2.12it/s]
 89%|████████▉ | 57/64 [38:13<05:22, 46.06s/it]

Running experiment for greek+latin gold target with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.05it/s]
Encoding texts: 100%|██████████| 4/4 [00:03<00:00,  1.26it/s]
 91%|█████████ | 58/64 [38:20<03:25, 34.27s/it]

Running experiment for greek+latin gold target with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Encoding texts: 100%|██████████| 4/4 [00:02<00:00,  1.98it/s]
 92%|█████████▏| 59/64 [38:26<02:08, 25.78s/it]

Running experiment for greek+latin gold target with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Encoding texts: 100%|██████████| 4/4 [00:02<00:00,  1.96it/s]
 94%|█████████▍| 60/64 [38:32<01:19, 19.95s/it]

Running experiment for greek+latin gold target+random with model bowphs/GreBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/GreBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  2.01it/s]
Encoding texts: 100%|██████████| 35/35 [00:47<00:00,  1.37s/it]
 95%|█████████▌| 61/64 [39:26<01:29, 29.99s/it]

Running experiment for greek+latin gold target+random with model bowphs/LaBerta


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/LaBerta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.81it/s]
Encoding texts: 100%|██████████| 35/35 [01:02<00:00,  1.77s/it]
 97%|█████████▋| 62/64 [40:33<01:22, 41.11s/it]

Running experiment for greek+latin gold target+random with model bowphs/PhilBERTa


Some weights of RobertaModel were not initialized from the model checkpoint at bowphs/PhilBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  3.86it/s]
Encoding texts: 100%|██████████| 35/35 [00:43<00:00,  1.25s/it]
 98%|█████████▊| 63/64 [41:22<00:43, 43.51s/it]

Running experiment for greek+latin gold target+random with model bowphs/SPhilBERTa


Encoding texts: 100%|██████████| 1/1 [00:00<00:00,  4.30it/s]
Encoding texts: 100%|██████████| 35/35 [00:39<00:00,  1.13s/it]
100%|██████████| 64/64 [42:06<00:00, 39.48s/it]


In [10]:
# Score every retrieval run with trec_eval and gather the metrics per run.
# `predictions` and `references` are the parallel lists produced by the
# experiment cell above; each pair is scored independently.
results = {}
for run, qrels in zip(predictions, references):
    # A fresh metric object per run, so nothing accumulates between runs.
    trec_eval = evaluate.load("trec_eval")
    trec_eval.add(predictions=run, references=qrels)
    run_scores = trec_eval.compute()
    # The run id becomes the key; the remaining entries are the metric values.
    run_id = run_scores.pop("runid")
    results[run_id] = run_scores

# One column per run, one row per metric.
results = pd.DataFrame.from_dict(results, orient="columns")

# Export the table twice: tab-separated text and an Excel workbook.
results.to_csv("results.csv", sep="\t", index=True)
results.to_excel("results.xlsx", index=True)

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first()

In [None]:
# Locally stored fine-tuned checkpoints, described with the same
# model_type/model_name dictionaries used for the other model configurations.
finetuned_model_args = [
    {"model_type": "roberta", "model_name": checkpoint}
    for checkpoint in (
        "models/SimCSE25/simcse_sphilberta_model",
        "models/SimCSE21/simcse_sphilberta_model",
    )
]

# Full grid: every language x split x setting combination for each checkpoint.
finetuned_experiments = [
    *itertools.product(langs, splits, settings, finetuned_model_args)
]
finetuned_predictions, finetuned_references = run_experiments(
    finetuned_experiments
)

In [None]:
# Score the fine-tuned runs with trec_eval and gather the metrics per run.
# `finetuned_predictions` and `finetuned_references` are the parallel lists
# returned by run_experiments in the previous cell.
finetuned_results = {}
for run, qrels in zip(finetuned_predictions, finetuned_references):
    # A fresh metric object per run, so nothing accumulates between runs.
    trec_eval = evaluate.load("trec_eval")
    trec_eval.add(predictions=run, references=qrels)
    run_scores = trec_eval.compute()
    # The run id becomes the key; the remaining entries are the metric values.
    run_id = run_scores.pop("runid")
    finetuned_results[run_id] = run_scores

# One column per run, one row per metric.
finetuned_results = pd.DataFrame.from_dict(finetuned_results, orient="columns")

# Export the table twice: tab-separated text and an Excel workbook.
finetuned_results.to_csv("finetuned_results.csv", sep="\t", index=True)
finetuned_results.to_excel("finetuned_results.xlsx", index=True)