In [1]:
import re
import os
import unicodedata
import evaluate
import pandas as pd
import itertools

import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    BertTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForMaskedLM,
    BertForMaskedLM
)
from sentence_transformers import SentenceTransformer

from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def batched(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

In [3]:
def normalize_text(text):
    # Rimuovere caratteri invisibili come spazi non separabili
    text = text.replace('\xa0', ' ')  # Sostituire \xa0 con uno spazio
    # Normalizzazione Unicode
    text = unicodedata.normalize('NFKC', text)
    # Rimuovere eventuali spazi extra all'inizio e alla fine
    text = text.strip()
    # Rimuovere caratteri invisibili come nuove righe e tabulazioni
    text = re.sub(r'\s+', ' ', text)  # Sostituire sequenze di spazi con un singolo spazio
    return text

def load_data(lang, split, setting):
    assert setting in ["target", "target+random"]
    possbile_langs = ["latin", "greek", "latin+greek", "greek+latin"]
    assert lang in possbile_langs, f"Invalid language setting. Choose from {possbile_langs}."
    if lang == "latin":
        with open("data/latin_random_sentences.txt", "r") as f:
            random_1k = f.readlines()
        if split == "silver":
            qnr_df = pd.read_csv('data/Latin_benchmark.txt', sep="\t")
        elif split == "gold":
            qnr_df = pd.read_csv('data/Latin_benchmark_parallel.txt', sep="\t")
    elif lang == "greek":
        with open("data/greek_random_sentences.txt", "r") as f:
            random_1k = f.readlines()
        if split == "silver":
            qnr_df = pd.read_csv('data/Greek_benchmark.txt', sep="\t")
        elif split == "gold":
            qnr_df = pd.read_csv('data/Greek_benchmark_parallel.txt', sep="\t")
    elif lang in ["latin+greek", "greek+latin"]:
        if split == "silver":
            qnr_df_greek = pd.read_csv('data/Greek_benchmark.txt', sep="\t")
            qnr_df_latin = pd.read_csv('data/Latin_benchmark.txt', sep="\t")
        elif split == "gold":
            qnr_df_greek = pd.read_csv('data/Greek_benchmark_parallel.txt', sep="\t")
            qnr_df_latin = pd.read_csv('data/Latin_benchmark_parallel.txt', sep="\t")
        if lang.startswith("latin"):
            with open("data/latin_random_sentences.txt", "r") as f:
                random_1k = f.readlines()
            qnr_df = pd.concat(
                [qnr_df_greek.loc[:, ["Query"]], qnr_df_latin.loc[:, [f"Target #{i}" for i in range(1,6)]]], axis=1)
        elif lang.startswith("greek"):
            with open("data/greek_random_sentences.txt", "r") as f:
                random_1k = f.readlines()
            qnr_df = pd.concat(
                [qnr_df_latin.loc[:, ["Query"]], qnr_df_greek.loc[:, [f"Target #{i}" for i in range(1,6)]]], axis=1)
    else:
        raise ValueError(f"Invalid language setting. Choose from {possible_langs}.")


    future_df = []
    data_pool = []
    name_pool = []
    queries = qnr_df["Query"].tolist()
    # targets = qnr_df["target"].tolist()
    for idx, query in enumerate(queries):
        for jdx, targets in enumerate(qnr_df[[f"Target #{i}" for i in range(1, 6)]].values.tolist()):
            if setting == "target+random" and jdx != idx:
                continue
            for tdx, target in enumerate(targets):
                newrow = {}
                item_name = f"{jdx}_{tdx}_target"
                newrow["query"] = idx
                newrow["q0"] = 0
                newrow["docid"] = item_name
                newrow["rel"] = 1 if jdx == idx else 0
                future_df.append(newrow)
                if (setting == "target" and idx == 0) or setting == "target+random":
                    data_pool.append(normalize_text(target))
                    name_pool.append(item_name)

        if setting == "target+random":
            for jdx, text in enumerate(random_1k):
                item_name = f"{jdx}_random"
                newrow = {}
                newrow["query"] = idx
                newrow["q0"] = 0
                newrow["docid"] = item_name
                newrow["rel"] = 0
                future_df.append(newrow)
                if idx == 0:
                    # Aggiungi le frasi casuali solo per il primo query
                    data_pool.append(normalize_text(text.strip()))
                    name_pool.append(item_name)

    future_df = [{"system": f"{lang}_{split}_{setting}", **row} for row in future_df]
    df = pd.DataFrame(future_df)

    return queries, data_pool, df, name_pool


In [4]:
def get_model(model_type, model_name, device=None):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    #load tokenizer and model
    if model_type == "roberta":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)

    elif model_type == "sentence-transformers":
        model = SentenceTransformer(model_name).to(device)
        tokenizer = None

    elif model_type == "bert":
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForMaskedLM.from_pretrained(model_name).to(device)

    elif model_type == "t5":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).get_encoder().to(device)
    return model, tokenizer


In [5]:
def encode(model, tokenizer, texts, model_type):
    out = []
    for text_batch in batched(texts, n=32):
        with torch.no_grad():
            if model_type == "sentence-transformers":
                embeddings = model.encode(text_batch)

            elif model_type == "roberta":
                batch = tokenizer(
                    text_batch,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=512,
                ).to(model.device)
                # For models like BERT, RoBERTa, T5, etc.
                batch = {k: v.to(model.device) for k, v in batch.items()}
                outputs = model(**batch, output_hidden_states=True)
                embeddings = outputs.hidden_states[-1][:, 0, :].cpu().numpy()  # Use the last hidden state of the CLS token
                # Use the first token (CLS token) for classification tasks
        out.append(embeddings)
    embeddings = np.concatenate(out, axis=0)
    return embeddings

In [6]:
def compute_cosine_similarity(a, b):
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    similarity_score = np.dot(a, b.T)
    return similarity_score

In [7]:
def prepare_df(df, name_pool, query_pool_similarity, model_args):
    df["q0"] = df["q0"].astype(str)
    newdf = df.copy()
    df = df.drop("system", axis=1)

    newdf["system"] = newdf["system"].astype(str) + ("_" + model_args["model_name"].replace("/", "_"))
    newdf["score"] = [query_pool_similarity[row["query"], name_pool.index(row["docid"])] for _, row in newdf.iterrows()]
    newdf["score"] = newdf["score"].astype("Float32")
    newdf["rank"] = newdf["score"].rank(method="first", ascending=False).astype(int) - 1
    if "rel" in newdf.columns:
        newdf = newdf.drop("rel", axis=1)

    newdf = newdf.sort_values(by=["query", "rank"]).reset_index(drop=True)
    newdf = newdf.loc[:, [i for i in newdf.columns if i != "system"] + ["system"]]
    return df, newdf

In [8]:
def run_experiments(experiments):
    predictions = []
    references = []
    for lang, split, setting, model_args in tqdm(experiments):
        queries, data_pool, df, name_pool = load_data(lang, split, setting=setting)

        # check data format
        if split == "silver":
            if setting == "target+random":
                assert len(queries) == 100
                assert len(data_pool) == 1500, f"{len(data_pool)}"
            elif setting == "target":
                assert len(queries) == 100
                assert len(data_pool) == 500, f"{len(data_pool)}"
        elif split == "gold":
            if setting == "target+random":
                assert len(queries) == 20
                assert len(data_pool) == 1100, f"{len(data_pool)}"
            elif setting == "target":
                assert len(queries) == 20
                assert len(data_pool) == 100, f"{len(data_pool)}"

        model, tokenizer = get_model(device="cuda:2", **model_args)

        encoded_queries = encode(model, tokenizer, queries, model_type=model_args["model_type"])
        encoded_pool = encode(model, tokenizer, data_pool, model_type=model_args["model_type"])
        query_pool_similarity = compute_cosine_similarity(encoded_queries, encoded_pool)
        assert query_pool_similarity.shape == (len(queries), len(data_pool)), f"{query_pool_similarity.shape} != {(len(queries), len(data_pool))}"

        df, newdf = prepare_df(df, name_pool, query_pool_similarity, model_args)

        os.makedirs("qrels", exist_ok=True)
        os.makedirs("runs", exist_ok=True)
        df.to_csv(f"qrels/{lang}_{split}_{setting}_{model_args['model_name'].replace('/', '_')}.tsv", sep="\t", index=False)
        newdf.to_csv(f"runs/{lang}_{split}_{setting}_{model_args['model_name'].replace('/', '_')}_run.tsv", sep="\t", index=False)
        qrel = df.to_dict(orient="list")
        run = newdf.to_dict(orient="list")

        # metric.add(predictions=run, references=qrel)
        predictions.append(run)
        references.append(qrel)
    return predictions, references



In [9]:
pretrained_model_args = [
    {"model_type": "roberta", "model_name":"bowphs/GreBerta"},
    {"model_type": "roberta", "model_name":"bowphs/LaBerta"},
    {"model_type": "roberta", "model_name":"bowphs/PhilBERTa"},
    {"model_type": "sentence-transformers", "model_name":"bowphs/SPhilBERTa"}
]
langs = ["latin", "greek", "latin+greek", "greek+latin"]
splits = ["silver", "gold"]
settings = ["target", "target+random"]
pretrained_experiments = itertools.product(langs, splits, settings, pretrained_model_args)
pretrained_experiments = list(pretrained_experiments)

predictions, references = run_experiments(pretrained_experiments)

100%|██████████| 64/64 [13:08<00:00, 12.32s/it]


In [10]:
results = {}
for pred, qrel in zip(predictions, references):
    metric = evaluate.load("trec_eval")
    metric.add(predictions=pred, references=qrel)
    scores = metric.compute()
    run_name = scores.pop("runid")
    results[run_name] = scores
results = pd.DataFrame.from_dict(results)
results.to_csv("results.csv", index=True, sep="\t")
results.to_excel("results.xlsx", index=True)

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first()

ModuleNotFoundError: No module named 'openpyxl'

In [12]:
finetuned_model_args = [
    {"model_type": "roberta", "model_name": "models/SimCSE25/simcse_sphilberta_model"},
    {"model_type": "roberta", "model_name": "models/SimCSE21/simcse_sphilberta_model"},
]

finetuned_experiments = list(itertools.product(langs, splits, settings, finetuned_model_args))
finetuned_predictions, finetuned_references = run_experiments(finetuned_experiments)

  0%|          | 0/32 [00:00<?, ?it/s]Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at models/SimCSE25/simcse_sphilberta_model and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 1/32 [00:05<02:57,  5.71s/it]Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at models/SimCSE21/simcse_sphilberta_model and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  6%|▋         | 2/32 [00:11<02:53,  5.78s/it]Some weights of RobertaForMaskedLM were not initialized fr

In [13]:
finetuned_results = {}
for pred, qrel in zip(finetuned_predictions, finetuned_references):
    metric = evaluate.load("trec_eval")
    metric.add(predictions=pred, references=qrel)
    scores = metric.compute()
    run_name = scores.pop("runid")
    finetuned_results[run_name] = scores
finetuned_results = pd.DataFrame.from_dict(finetuned_results)
finetuned_results.to_csv("finetuned_results.csv", index=True, sep="\t")
finetuned_results.to_excel("finetuned_results.xlsx", index=True)

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first()