In [None]:
pip install python-terrier
pip install translate
pip install transquest
pip install deep-translator
pip install translation-quality-estimator
pip install tqdm

Load the IR datasets: mMARCO dev/small topics for German, Spanish, French, Indonesian and Portuguese

In [1]:
import pyterrier as pt
from typing import Callable
import pandas as pd
from pathlib import Path
from deep_translator import GoogleTranslator
from tqe import TQE
from tqdm import tqdm
import string

# Start PyTerrier only if it is not already running in this kernel.
if not pt.started():
    pt.init()

# One mMARCO "dev/small" dataset handle per language.
de_database = pt.datasets.get_dataset("irds:mmarco/de/dev/small")
es_database = pt.datasets.get_dataset("irds:mmarco/es/dev/small")
fr_database = pt.datasets.get_dataset("irds:mmarco/fr/dev/small")
id_database = pt.datasets.get_dataset("irds:mmarco/id/dev/small")
pt_database = pt.datasets.get_dataset("irds:mmarco/pt/dev/small")

# Fetch each topics frame once and reuse it for both the query list and the preview.
de_topics = de_database.get_topics()
es_topics = es_database.get_topics()
de_queries = de_topics["query"].tolist()

# Show a bounded preview instead of dumping every topic into the cell output.
PREVIEW_ROWS = 15
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(de_topics.head(PREVIEW_ROWS))
    print(es_topics.head(PREVIEW_ROWS))

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



          qid                                              query
0     1048585                         was ist paula deens bruder
1           2                        androgenrezeptor definieren
2      524332                    behandlung von ohne medikamente
3     1048642                                was ist paranoid sc
4      524447               behandlung von krampfadern in beinen
5      786674                     was ist die primrate in kanada
6     1048876                 wer spielt junge dr stock auf ncis
7     1048917            was ist betriebssystem miskonfiguration
8      786786                         das was priorit tspass ist
9      524699                        nummer des tricare services
10    1048995            der die geozentrische theorie vorschlug
11     786857                                            was ist
12     524722           begriffsbestimmung f r trikuspidatresien
13     873886        aus welchem level entwickelt sich der zubat
14     524733            

In [3]:
# Get translated queries and score
TRANSLATION_CACHE = "translated_de_to_es_queries.txt"


def _sanitize_query(query) -> str:
    """Return ``query`` with every non-alphanumeric character replaced by a space.

    Characters are replaced by a space rather than dropped so that word
    boundaries (including the ordinary spaces between words) are preserved.
    A ``None`` query is treated as an empty string as a defensive measure.
    """
    return "".join(ch if ch.isalnum() else " " for ch in (query or ""))


translated_de_to_es_queries = []

# Try the on-disk cache first; only a missing file is an expected condition,
# any other I/O error is allowed to surface.
try:
    with open(TRANSLATION_CACHE, "r") as f:
        translated_de_to_es_queries = [_sanitize_query(line.strip()) for line in f]
except FileNotFoundError:
    pass

# A missing cache, or one with a different number of lines than there are
# German queries, is treated as a miss: translate everything again and
# rewrite the cache so the list stays aligned with `de_queries`.
if len(translated_de_to_es_queries) != len(de_queries):
    translator = GoogleTranslator(source="de", target="es")
    translated_de_to_es_queries = [
        _sanitize_query(translator.translate(de_query))
        for de_query in tqdm(de_queries)
    ]
    with open(TRANSLATION_CACHE, "w") as f:
        f.writelines(q + "\n" for q in translated_de_to_es_queries)

# Optional translation-quality scoring with TQE (left disabled):
# model = TQE()
# translate_score = model.fit(de_queries, translated_de_to_es_queries)
# print(sum(translate_score) / len(translate_score))

In [4]:
IDX_PATH = Path("es_database_index").absolute()


def evaluate(
    df: pd.DataFrame,
    dataset,
    rewrite_func: Callable[[str], str] = None,
    index_path: Path = IDX_PATH,
) -> pd.Series:
    """Run TF-IDF and BM25 retrieval over ``dataset`` and return their MAP scores.

    Args:
        df: Topics frame passed to ``pt.Experiment``; its ``query`` column is
            read when ``rewrite_func`` is given.
        dataset: Object providing ``get_corpus_iter()`` (used to build the
            index when it does not exist yet) and ``get_qrels()``.
        rewrite_func: Optional function applied to every query string before
            retrieval. ``None`` (the default) leaves the queries untouched.
        index_path: Directory of the Terrier index. Defaults to ``IDX_PATH``;
            pass a different directory when evaluating a different corpus so
            that indexes are not mixed up.

    Returns:
        The ``map`` column of the experiment result, one value per system in
        the order ``[TF_IDF, BM25]``.
    """
    index_dir = Path(index_path)

    # Build the index only once; "data.properties" is used as the marker that
    # an index already exists in this directory.
    if not (index_dir / "data.properties").is_file():
        pt.index.IterDictIndexer(str(index_dir)).index(dataset.get_corpus_iter())

    # Apply the optional rewrite on a copy so the caller's frame is not mutated.
    topics = df
    if rewrite_func is not None:
        topics = df.assign(query=df["query"].map(rewrite_func))

    bm25 = pt.BatchRetrieve(str(index_dir), wmodel="BM25")
    tfidf = pt.BatchRetrieve(str(index_dir), wmodel="TF_IDF")
    results = pt.Experiment(
        [tfidf, bm25],
        topics,
        dataset.get_qrels(),
        eval_metrics=["map"],
    )
    return results["map"]

In [5]:
# Evaluate querying score
N_EVAL_QUERIES = 100  # number of leading topics scored in each run

# Baseline: the native Spanish queries against the Spanish corpus.
print("Native Spanish queries (MAP for TF_IDF, BM25):")
print(evaluate(es_database.get_topics().head(N_EVAL_QUERIES), es_database))

# Translated run: keep the German topics' qids and swap in the translated text.
# Assumes mMARCO topics share qids and row order across languages — TODO confirm.
german_topics = de_database.get_topics()
assert len(translated_de_to_es_queries) == len(german_topics), (
    "translated query list is out of sync with the German topics"
)
# assign() builds a new frame, so the frame returned by get_topics() is not mutated.
test_topics = german_topics.assign(query=translated_de_to_es_queries)
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(test_topics)
print("German queries translated to Spanish (MAP for TF_IDF, BM25):")
print(evaluate(test_topics.head(N_EVAL_QUERIES), es_database))

18:50:28.599 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2,2 GiB of memory would be required.
18:50:29.907 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2,2 GiB of memory would be required.
0    0.117866
1    0.091066
Name: map, dtype: float64
          qid                                              query
0     1048585                   Cuál es el hermano de Paula Deen
1           2                     Definir receptor de andrógenos
2      524332                         tratamiento sin medicación
3     1048642                                Qué es SC paranoico
4      524447          Tratamiento de las varices en las piernas
5      786674             Cuál es la tasa preferencial en Canadá
6     104