# Base example

In [41]:
from src import init_data
from typing import List, Union, Tuple
from src.pyterrier.pyterrier_math_formula_coding import translate_latex
import pyterrier as pt
import os
from src.pyterrier.config import ROOT_DIRECTORY
from arqmath_code.Entities.Post import Question, Answer
from arqmath_code.topic_file_reader import Topic
from src.base.model import Model
from src.base.pipeline import Pipeline
from src.base.post_processor import PostProcessor
from src.base.pre_processing import PreProcessor
from src.runner import Runner
from pandas import DataFrame

In [None]:
def get_pyterrier_answer_dict(documents: List[Answer]):
    """Lazily turn answers into the record dicts that are fed to the PyTerrier indexer.

    Answers whose ``body``, ``post_id``, ``parent_id`` or ``votes`` is ``None``
    are skipped silently.

    :param documents: the answers to convert.
    :return: a generator of dicts with the keys ``docno`` (post id), ``text``
        (body after ``translate_latex``), ``origtext`` (untouched body),
        ``parentno`` (parent id) and ``votes``.
    """
    for document in documents:
        # Guard clause: a record needs all four fields, so skip incomplete answers.
        if (document.body is None or document.post_id is None
                or document.parent_id is None or document.votes is None):
            continue

        translated_body = translate_latex(document.body)
        record = {
            'docno': document.post_id,
            'text': translated_body,
            'origtext': document.body,
            'parentno': document.parent_id,
            'votes': document.votes,
        }
        yield record

def create_pyterrier_index(documents: List[Answer], index_name: str = "arqmath_indexV1"):
    """Open the PyTerrier index called ``index_name``, building it first if needed.

    The index lives in ``{ROOT_DIRECTORY}/index/{index_name}``. When that folder
    already contains a ``data.properties`` file the index is loaded from it;
    otherwise the records produced by ``get_pyterrier_answer_dict(documents)``
    are indexed into the folder. PyTerrier is initialised if it has not been
    started yet, and the index path is printed.

    :param documents: answers to index; only consumed when the index is built.
    :param index_name: name of the index folder below ``{ROOT_DIRECTORY}/index``.
    :return: the object returned by ``pt.IndexFactory.of`` for the index.
    """
    index_path = f"{ROOT_DIRECTORY}/index/{index_name}"
    print(index_path)
    if not pt.started():
        pt.init()

    properties_file = index_path + "/data.properties"
    if os.path.exists(properties_file):
        # The properties file is used as the marker for an already built index.
        print("Loading existing index")
        index_reference = pt.IndexRef.of(properties_file)
    else:
        answer_records = get_pyterrier_answer_dict(documents)
        index_reference = pt.index.IterDictIndexer(index_path).index(answer_records)

    return pt.IndexFactory.of(index_reference)

## Create a model

In [None]:
import pandas as pd


class BasicPyTerrierModel(Model):
    """Scores answers against topics with a PyTerrier weighting model.

    The answers are indexed through ``create_pyterrier_index`` (which reuses an
    index already on disk) and queried with ``pt.BatchRetrieve``.
    """

    def __init__(self, pyterrier_model: str = "TF_IDF"):
        """
        :param pyterrier_model: weighting model name handed to
            ``pt.BatchRetrieve`` as ``wmodel``.
        """
        # Filled in by forward(); None until the model has been run once.
        self.pyterrier_index = None
        self.pyterrier_model = pyterrier_model

    def forward(self, queries: List[Topic], documents: List[Union[Question, Answer]]) -> List[
        Tuple[Topic, Union[Question, Answer], float]]:
        """Retrieve answers for every topic.

        :param queries: topics to search for.
        :param documents: answers to index and to map the retrieved ids back to.
        :return: ``(topic, answer, score)`` tuples in the row order of the
            retrieval result; rows whose ``docno`` is not the ``post_id`` of one
            of ``documents`` are dropped.
        :raises TypeError: if ``documents`` contains a ``Question``.
        """
        # A list's runtime type is plain ``list`` and never equals the typing
        # alias ``List[Question]``, so the elements themselves are inspected.
        # NOTE(review): assumes Answer is not a subclass of Question - confirm.
        if any(isinstance(document, Question) for document in documents):
            raise TypeError("Question can't be indexed using the basic PyTerrier model")

        if not pt.started():
            pt.init()

        self.pyterrier_index = create_pyterrier_index(documents=documents, index_name="pyterrier_answer_index_task1_v1")

        batch_retrieve = pt.BatchRetrieve(self.pyterrier_index, wmodel=self.pyterrier_model)
        queries_data_frame = self.get_pyterrier_query_dict(queries)
        ranking: DataFrame = batch_retrieve.transform(queries_data_frame)

        documents_dict = {answer.post_id: answer for answer in documents}
        queries_dict = {topic.topic_id: topic for topic in queries}

        result = []
        # Walking the three needed columns avoids building a Series per row,
        # which is what DataFrame.iterrows does.
        for qid, docno, score in zip(ranking["qid"], ranking["docno"], ranking["score"]):
            post_id = int(docno)
            # The index on disk may hold answers that are not part of `documents`.
            if post_id in documents_dict:
                result.append((queries_dict[qid], documents_dict[post_id], score))

        return result

    def get_pyterrier_query_dict(self, queries: List[Topic]):
        """Build the query frame for ``pt.BatchRetrieve``.

        :param queries: topics to convert.
        :return: DataFrame with one row per topic and the columns ``qid``
            (topic id) and ``query`` (topic question after ``translate_latex``).
        """
        query_records = [
            {'qid': topic.topic_id, 'query': translate_latex(topic.question)}
            for topic in queries
        ]
        # Naming the columns keeps them present even when `queries` is empty.
        return pd.DataFrame(query_records, columns=['qid', 'query'])

## Post-Processor example

In [None]:
from itertools import groupby
from typing import Tuple, List, Union

from arqmath_code.Entities.Post import Answer, Question
from arqmath_code.topic_file_reader import Topic
from src.base.post_processor import PostProcessor


class TopKFilter(PostProcessor):
    """Keeps only the ``k`` highest scoring results of every topic."""

    def __init__(self, k=1000):
        """
        :param k: maximum number of results kept per topic.
        """
        self.k = k

    def forward(self, queries: List[Topic], ranking: List[Tuple[Topic, Union[Question, Answer], float]]) -> List[
        Tuple[Topic, Union[Question, Answer], float]]:
        """Trim the ranking to the top ``k`` entries per topic.

        :param queries: not used by this filter.
        :param ranking: ``(topic, post, score)`` tuples in any order.
        :return: for every topic, in order of first appearance in ``ranking``,
            its at most ``k`` best entries sorted by descending score; ties keep
            their input order.
        """
        # Entries are collected per topic id in a dict instead of with
        # itertools.groupby: groupby only merges *consecutive* items, so a
        # ranking that is not contiguous per topic would produce several groups
        # for one topic and let more than k of its entries through.
        entries_by_topic = {}
        for entry in ranking:
            entries_by_topic.setdefault(entry[0].topic_id, []).append(entry)

        result = []
        for topic_entries in entries_by_topic.values():
            best_first = sorted(topic_entries, key=lambda entry: entry[2], reverse=True)
            result.extend(best_first[:self.k])
        return result

## Pipeline example

In [None]:
from arqmath_code.post_reader_record import DataReaderRecord


class PyTerrierBasicPipeline(Pipeline):
    """Pipeline that scores all answers with TF-IDF and then trims the ranking per topic."""

    def __init__(self, data_reader: DataReaderRecord):
        """
        :param data_reader: reader handed on to the ``Pipeline`` base class;
            ``run`` takes the answers from it.
        """
        super().__init__(data_reader)
        # Start PyTerrier once if it is not running yet.
        if not pt.started():
            pt.init()
        self.pyterrier_model = BasicPyTerrierModel(pyterrier_model="TF_IDF")
        # Default TopKFilter, i.e. k=1000.
        self.top1000 = TopKFilter()

    def run(self, queries: List[Topic]) -> List[Tuple[Topic, Answer, float]]:
        """Rank the answers for ``queries``.

        :param queries: topics to retrieve answers for.
        :return: the ``(topic, answer, score)`` tuples left after the top-k filter.
        """
        answers_by_id = self.data_reader.post_parser.map_just_answers
        candidate_answers = list(answers_by_id.values())
        scored = self.pyterrier_model(queries=queries, documents=candidate_answers)
        # TopKFilter.forward does not read `queries`, so None is passed.
        return self.top1000(queries=None, ranking=scored)


## Runner Usage

In [34]:
# Run the pipeline through the Runner and print a timestamp before and after
# so the wall-clock duration can be read from the cell output.
from datetime import datetime
print(datetime.now())
# NOTE(review): `data_reader` and `topic_reader` are not defined in any cell
# visible here - presumably they come from an earlier session or from
# `src.init_data`; confirm the notebook survives Restart Kernel -> Run All.
runner = Runner(PyTerrierBasicPipeline, n=1, data_reader=data_reader, topic_reader=topic_reader)
# Presumably `run` also writes the ranking to the given TSV path - verify in src.runner.
ranking = runner.run("../results/model_results/pyterrier-tf-idf.tsv")
print(datetime.now())
# Last expression of the cell: displays the returned ranking.
ranking

2022-11-14 17:19:05.974111
/Users/I518152/Documents/GitHub/information-retrieval/index/pyterrier_answer_index_task1_v1
Loading existing index
Finished pyterrier retrieval
Finished result creation
2022-11-14 17:25:47.534815


Unnamed: 0,Topic_Id,Post_Id,Score,Run_Number,Rank
0,A.301,1523374,14.938251,0,0
1,A.301,770629,14.884188,0,1
2,A.301,598878,14.765276,0,2
3,A.301,2790948,14.747543,0,3
4,A.301,360271,14.692966,0,4
...,...,...,...,...,...
99995,A.400,2656508,11.750688,0,995
99996,A.400,174687,11.750613,0,996
99997,A.400,1087112,11.750436,0,997
99998,A.400,728117,11.750263,0,998


In [30]:
from arqmath_code.evaluation.task1 import arqmath_to_prime_task1
from arqmath_code.evaluation.task1 import task1_get_results

In [47]:
# Read the official 2022 Task 1 qrel file.
qrel_dictionary = arqmath_to_prime_task1.read_qrel_to_dictionary(
    "../arqmath_dataset/evaluation/Task 1/Qrel Files/qrel_task1_2022_official.tsv"
)
# Convert the result files in the submission directory, passing the qrels and
# the prim / trec output directories.
arqmath_to_prime_task1.convert_result_files_to_trec(
    submission_dir="../results/model_results/",
    qrel_result_dic=qrel_dictionary,
    prim_dir="../results/ARQmath_prim/",
    trec_dir="../results/ARQmath_trec/",
)

In [48]:
# Topic count handed to the evaluation helper.
# NOTE(review): presumably the number of judged topics in the 2022 qrels - confirm.
number_topics = 78
task1_get_results.get_result(
    trec_eval_tool="trec_eval",
    qre_file_path="../arqmath_dataset/evaluation/Task 1/Qrel Files/qrel_task1_2022_official.tsv",
    prim_result_dir="../results/ARQmath_prim/",
    evaluation_result_file="../results/complete_results_v1.tsv",
    number_topics=number_topics,
)

-----------
['ndcg                  ', 'A.301', '0.1866']
-----------
['ndcg                  ', 'all', '0.1866']
-----------
['ndcg                  ', 'A.301', '0.1593']
-----------
['ndcg                  ', 'A.302', '0.0000']
-----------
['ndcg                  ', 'A.303', '0.0000']
-----------
['ndcg                  ', 'A.304', '0.0000']
-----------
['ndcg                  ', 'A.305', '0.0000']
-----------
['ndcg                  ', 'A.306', '0.0406']
-----------
['ndcg                  ', 'A.307', '0.0327']
-----------
['ndcg                  ', 'A.308', '0.0115']
-----------
['ndcg                  ', 'A.309', '0.0482']
-----------
['ndcg                  ', 'A.310', '0.0914']
-----------
['ndcg                  ', 'A.312', '0.1442']
-----------
['ndcg                  ', 'A.313', '0.0277']
-----------
['ndcg                  ', 'A.314', '0.0183']
-----------
['ndcg                  ', 'A.315', '0.0486']
-----------
['ndcg                  ', 'A.316', '0.0000']
-----------
['nd