# LDA retrieval

In [1]:
from src.base.pipeline import Pipeline
from src.runner import Runner
from typing import Tuple, Union, List
from arqmath_code.Entities.Post import Answer
from arqmath_code.topic_file_reader import Topic

In [2]:
from src import init_data
# Build the topic reader and data reader for task 1; both are handed to the
# Runner in the "Run pipeline" cell further down.
topic_reader, data_reader = init_data(task=1)

reading users
reading comments
reading votes
reading post links
reading posts


In [None]:
import multiprocessing
from src.pre_processors.default_pre_processing_steps import remove_xml_tags
from typing import Union
from arqmath_code.Entities.Post import Question
from src.base.pre_processing import PreProcessor

class RemoveXMLTagsFromDocumentBody(PreProcessor):
    """Pre-processor that passes every document body through ``remove_xml_tags``."""

    def task(self, document: Union[Question, Answer]):
        """Replace ``document.body`` in place with ``remove_xml_tags(document.body)``.

        Only useful in the calling process: when run inside a worker process
        the mutation happens on a pickled copy and is lost.
        """
        document.body = remove_xml_tags(document.body)

    def forward(self, queries: List[Topic], documents: List[Union[Question, Answer]]) -> List[Union[Question, Answer]]:
        """Clean the body of every document using a pool of worker processes.

        Args:
            queries: Unused; accepted to match the ``PreProcessor`` call signature.
            documents: Posts whose ``body`` attribute is rewritten in place.

        Returns:
            The same ``documents`` list, with each body replaced by its cleaned text.
        """
        if not documents:
            # Nothing to clean; avoid spinning up worker processes for no work.
            return documents

        bodies = [document.body for document in documents]
        # Ship only the body strings to the workers and map the module-level
        # function directly: attribute changes made on objects inside a worker
        # never reach the parent process, so results must come back as return
        # values. The context manager releases the pool once map() has returned.
        with multiprocessing.Pool() as pool:
            cleaned_bodies = pool.map(remove_xml_tags, bodies)

        # pool.map preserves input order, so the results line up with documents.
        for document, cleaned_body in zip(documents, cleaned_bodies):
            document.body = cleaned_body
        return documents

In [5]:
from src.pre_processors.default_pre_processing_steps import tokenize_text

class NLTKTokenizationAndStopwordRemoval(PreProcessor):
    """Document pre-processor that rewrites each body with ``tokenize_text``.

    NOTE(review): per the class name, ``tokenize_text`` presumably performs
    NLTK tokenization and stop-word removal -- confirm in
    ``default_pre_processing_steps``.
    """

    def forward(self, queries: List[Topic], documents: List[Union[Question, Answer]]) -> List[Union[Question, Answer]]:
        """Overwrite ``body`` of every document in place and return the same list.

        ``queries`` is not used by this step.
        """
        for post in documents:
            processed_body = tokenize_text(post.body)
            post.body = processed_body
        return documents

In [6]:
from src.base.query_pre_processor import QueryPreProcessor

class NLTKTokenizationAndStopwordRemovalForQueries(QueryPreProcessor):
    """Query pre-processor that rewrites each topic's question with ``tokenize_text``."""

    def forward(self, queries: List[Topic]) -> List[Topic]:
        """Overwrite ``question`` of every topic in place and return the same list."""
        for query in queries:
            processed_question = tokenize_text(query.question)
            query.question = processed_question
        return queries

In [7]:
class RemoveXMLTagsFromQueries(QueryPreProcessor):
    """Query pre-processor that passes each topic's question through ``remove_xml_tags``."""

    def forward(self, queries: List[Topic]) -> List[Topic]:
        """Overwrite ``question`` of every topic in place and return the same list."""
        for query in queries:
            cleaned_question = remove_xml_tags(query.question)
            query.question = cleaned_question
        return queries

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from typing import Tuple, Union, List

from pandas import DataFrame

from arqmath_code.Entities.Post import Answer, Question
from arqmath_code.topic_file_reader import Topic
from src.base.model import Model


class LatentDirichletAllocationModel(Model):
    """Scores every (query, document) pair by cosine similarity of LDA topic vectors.

    A bag-of-words vocabulary and a 50-component LDA model are fitted on the
    document bodies on each call; the queries are then projected into the same
    topic space and compared against every document.
    """

    def __init__(self):
        pass

    def forward(self, queries: List[Topic], documents: List[Union[Question, Answer]]) -> List[
        Tuple[Topic, Union[Question, Answer], float]]:
        """Score all documents for all queries.

        Args:
            queries: Topics whose ``question`` text is used as the query text.
            documents: Posts whose ``body`` text is used as the document text.

        Returns:
            One ``(query, document, score)`` tuple per query/document pair,
            grouped by query in input order and, within a query, in document
            input order. The list is not sorted by score.

        Raises:
            TypeError: If any document is a ``Question``.
        """
        # The previous guard, `type(documents) is List[Question]`, could never
        # be true: type() yields the runtime class `list`, never a typing
        # alias. Inspect the elements instead.
        # NOTE(review): assumes Answer is not a subclass of Question -- confirm
        # in arqmath_code.Entities.Post.
        if any(isinstance(document, Question) for document in documents):
            raise TypeError("Question are not allowed for this model")

        if not queries or not documents:
            # Nothing to rank; the vectorizer/LDA calls below reject empty input.
            return []

        document_dataframe: DataFrame = DataFrame([document.body for document in documents], columns=["text"])
        count_vectorizer = CountVectorizer(max_df=0.90, min_df=2, lowercase=True)
        document_term_matrix = count_vectorizer.fit_transform(document_dataframe["text"])
        print("Finished count vectorizer")

        # Fixed random_state keeps the fitted topics reproducible between runs.
        lda = LatentDirichletAllocation(n_components=50, random_state=2, n_jobs=-1)
        document_topics = lda.fit_transform(document_term_matrix)
        print("Finished lda embedding")

        # Queries reuse the vocabulary and topic model fitted on the documents.
        query_dataframe: DataFrame = DataFrame([topic.question for topic in queries], columns=["text"])
        query_term_matrix = count_vectorizer.transform(query_dataframe["text"])

        query_topics = lda.transform(query_term_matrix)
        # Row i holds the similarities of query i to every document.
        cos_sims: np.ndarray = cosine_similarity(query_topics, document_topics)
        print("Finished cosine similarity calculation")

        result = []
        for query, similarities in zip(queries, cos_sims):
            result.extend(
                (query, document, score)
                for document, score in zip(documents, similarities)
            )

        return result

## LDA pipeline

In [9]:
from src.post_processors.top_k_filter import TopKFilter
from arqmath_code.post_reader_record import DataReaderRecord


class LDAPipeline(Pipeline):
    """Pipeline: clean and tokenize answers and queries, rank with LDA, then apply the top-k filter."""

    def __init__(self, data_reader: DataReaderRecord):
        """Create the model, the top-k filter (k=1000) and the four pre-processing steps."""
        super().__init__(data_reader)
        self.lda_model = LatentDirichletAllocationModel()
        self.top1000 = TopKFilter(k=1000)
        self.tag_remover = RemoveXMLTagsFromDocumentBody()
        self.tokenizer = NLTKTokenizationAndStopwordRemoval()
        self.query_tag_remover = RemoveXMLTagsFromQueries()
        self.query_tokenizer = NLTKTokenizationAndStopwordRemovalForQueries()

    def run(self, queries: List[Topic]) -> List[Tuple[Topic, Answer, float]]:
        """Rank all answer posts for ``queries`` and return the filtered ranking.

        Progress messages are printed before each stage.
        """
        answer_posts = self.data_reader.get_all_answer_posts()

        print("Start document preprocessing")
        # Tag removal first, tokenization on its output.
        prepared_documents = self.tokenizer(
            queries,
            self.tag_remover(queries, answer_posts),
        )

        print("Start query preprocessing")
        prepared_queries = self.query_tokenizer(self.query_tag_remover(queries))

        print("Start ranking")
        scored_pairs = self.lda_model(queries=prepared_queries, documents=prepared_documents)

        print("Start top 1000 filtering")
        return self.top1000(queries=None, ranking=scored_pairs)


## Run pipeline

In [10]:
from datetime import datetime
# Timestamps printed before and after the run bracket its wall-clock duration.
print(datetime.now())
# The pipeline class itself (not an instance) is passed to the Runner, together
# with the readers created by init_data.
runner = Runner(LDAPipeline, n=1, data_reader=data_reader, topic_reader=topic_reader)
# NOTE(review): the path is presumably where Runner.run writes the ranking as
# TSV -- confirm in src.runner.
ranking = runner.run("../results/model_results/lda.tsv")
print(datetime.now())
# Bare expression so the notebook renders the returned ranking.
ranking

2022-11-16 16:29:20.744078
Start document preprocessing




  return BeautifulSoup(text).text


Start query preprocessing
Start ranking
Finish count vectorizer
Finished lda embedding
Finished cosine similarity calculation
Start top 100 filtering
2022-11-16 17:22:48.959602


Unnamed: 0,Topic_Id,Post_Id,Score,Run_Number,Rank
0,A.301,1797704,0.906205,0,0
1,A.301,620214,0.894116,0,1
2,A.301,428935,0.893573,0,2
3,A.301,1816040,0.889605,0,3
4,A.301,576230,0.879466,0,4
...,...,...,...,...,...
99995,A.400,468822,0.892802,0,995
99996,A.400,872753,0.892794,0,996
99997,A.400,1755768,0.892754,0,997
99998,A.400,1883749,0.892702,0,998


## Evaluation

In [11]:
from arqmath_code.evaluation.task1 import arqmath_to_prime_task1
from arqmath_code.evaluation.task1 import task1_get_results

In [12]:
# Read the official 2022 Task 1 qrel file into a dictionary.
qrel_dictionary = arqmath_to_prime_task1.read_qrel_to_dictionary("../arqmath_dataset/evaluation/Task 1/Qrel Files/qrel_task1_2022_official.tsv")
# Convert the result files under ../results/model_results/ using those qrels.
# NOTE(review): presumably writes prime-format files to prim_dir and trec-format
# files to trec_dir -- confirm in arqmath_to_prime_task1.
arqmath_to_prime_task1.convert_result_files_to_trec(submission_dir="../results/model_results/", qrel_result_dic=qrel_dictionary, prim_dir="../results/ARQmath_prim/", trec_dir="../results/ARQmath_trec/")

In [13]:
# NOTE(review): 78 is presumably the number of assessed topics in the 2022
# Task 1 qrels -- confirm against the qrel file.
number_topics = 78
# Evaluate the files in ../results/ARQmath_prim/ against the official qrels with
# the trec_eval tool; evaluation_result_file names the combined results TSV.
task1_get_results.get_result(trec_eval_tool="trec_eval", qre_file_path="../arqmath_dataset/evaluation/Task 1/Qrel Files/qrel_task1_2022_official.tsv", prim_result_dir="../results/ARQmath_prim/", evaluation_result_file="../results/complete_results_v1.tsv", number_topics=number_topics)

-----------
['ndcg                  ', 'A.301', '0.0613']
-----------
['ndcg                  ', 'A.303', '0.0000']
-----------
['ndcg                  ', 'A.304', '0.0000']
-----------
['ndcg                  ', 'A.305', '0.1917']
-----------
['ndcg                  ', 'A.306', '0.0000']
-----------
['ndcg                  ', 'A.307', '0.0306']
-----------
['ndcg                  ', 'A.308', '0.1428']
-----------
['ndcg                  ', 'A.309', '0.0510']
-----------
['ndcg                  ', 'A.310', '0.0507']
-----------
['ndcg                  ', 'A.312', '0.0000']
-----------
['ndcg                  ', 'A.313', '0.1026']
-----------
['ndcg                  ', 'A.314', '0.0000']
-----------
['ndcg                  ', 'A.315', '0.0000']
-----------
['ndcg                  ', 'A.316', '0.0348']
-----------
['ndcg                  ', 'A.317', '0.0671']
-----------
['ndcg                  ', 'A.318', '0.0488']
-----------
['ndcg                  ', 'A.319', '0.0760']
-----------
['