# Cross-Encoders

In [1]:
from typing import List, Union, Tuple
from arqmath_code.Entities.Post import Question, Answer
from arqmath_code.topic_file_reader import Topic
from src import init_data
# Load the topics and the post collection once; both readers are handed to every
# Runner created further down in this notebook.
# NOTE(review): task=1 presumably selects ARQMath Task 1 — confirm in src.init_data.
topic_reader, data_reader = init_data(task=1)

reading users
reading comments
reading votes
reading post links
reading posts


In [2]:
from arqmath_code.post_reader_record import DataReaderRecord
from src.post_processors.top_k_filter import TopKFilter
from src.post_processors.answer_score_retriever_for_questions import AnswerScoreRetrieverForQuestions
from src.sbert.question_s_bert import QuestionSBERT
from src.base.pipeline import Pipeline
from src.sbert.cross_encoder import SBertCrossEncoder


class SBertPipelineWithCrossEncoder(Pipeline):
    """Retrieve-and-re-rank pipeline: SBERT bi-encoder first, cross-encoder second.

    Stages, in the order ``run`` applies them:

    1. ``QuestionSBERT`` ranks the collection's questions for the given topics.
    2. ``AnswerScoreRetrieverForQuestions`` maps that ranking onto answers
       (presumably the answers of the ranked questions — confirm in its module).
    3. ``TopKFilter(k=top_k)`` bounds the number of candidates handed to the
       cross-encoder.
    4. ``SBertCrossEncoder`` re-scores the surviving candidates.
    5. A ``TopKFilter()`` with its default cut-off produces the final ranking.
    """

    def __init__(self, data_reader: DataReaderRecord, top_k: int = 5000,
                 bi_encoder_model_id: str = 'all-MiniLM-L6-v2',
                 cross_encoder_model_id: str = 'cross-encoder/ms-marco-TinyBERT-L-2-v2'):
        """Build all pipeline stages.

        Args:
            data_reader: Source of the question collection, forwarded to ``Pipeline``.
            top_k: Cut-off of the intermediate filter that runs before the
                cross-encoder. Defaults to the previously hard-coded 5000.
            bi_encoder_model_id: Model id passed to ``QuestionSBERT``.
            cross_encoder_model_id: Model id passed to ``SBertCrossEncoder``.
        """
        super().__init__(data_reader)
        # Kept so the progress message in run() always reports the value actually used,
        # instead of repeating a literal that can drift from the filter's configuration.
        self._candidate_k = top_k
        self.sbert = QuestionSBERT(model_id=bi_encoder_model_id)
        self.answer_score_retriever = AnswerScoreRetrieverForQuestions()
        self.top_k_filter = TopKFilter(k=top_k)
        self.final_top_k_filter = TopKFilter()
        self.cross_encoder = SBertCrossEncoder(cross_encoder_model_id)

    def run(self, queries: List[Topic]) -> List[Tuple[Topic, Answer, float]]:
        """Run every stage for ``queries`` and return the final ranking.

        Args:
            queries: Topics to retrieve answers for.

        Returns:
            The ranking produced by the final top-k filter, annotated as
            ``(topic, answer, score)`` tuples.
        """
        questions: List[Question] = self.data_reader.get_questions()
        ranking: List[Tuple[Topic, Union[Question, Answer], float]] = self.sbert(
            queries=queries, documents=questions)
        print("Retrieving Answers")
        ranking = self.answer_score_retriever(queries=queries, ranking=ranking)
        print(f"Applying Top K={self._candidate_k} Filter")
        ranking = self.top_k_filter(queries=queries, ranking=ranking)
        print("Starting Cross Encoder")
        ranking = self.cross_encoder(queries=queries, ranking=ranking)
        print("Applying final Top K Filter")
        ranking = self.final_top_k_filter(queries=queries, ranking=ranking)
        return ranking

In [3]:
from src.runner import Runner
from datetime import datetime

# Timestamps before and after the run make the wall-clock duration visible in the output.
print(datetime.now())
runner = Runner(
    SBertPipelineWithCrossEncoder,
    n=1,
    data_reader=data_reader,
    topic_reader=topic_reader,
)
ranking = runner.run("../results/model_results/SBert-Cross-Encoder.tsv")
print(datetime.now())
# Bare last expression so the notebook renders the returned ranking.
ranking

2022-11-18 07:19:33.740349
read from cached embeddings at  ../arqmath_dataset/model_embeddings/document_embeddings_all-MiniLM-L6-v2.npy


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Similarities
Retrieving Answers
Applying Top K=5000 Filter
Starting Cross Encoder


Batches:   0%|          | 0/15625 [00:00<?, ?it/s]

Applying final Top K Filter
2022-11-18 08:43:00.906637


Unnamed: 0,Topic_Id,Post_Id,Score,Run_Number,Rank
0,A.301,1308036,6.762672,0,0
1,A.301,1428161,6.686323,0,1
2,A.301,454669,6.609347,0,2
3,A.301,1845609,6.592342,0,3
4,A.301,2501014,6.449536,0,4
...,...,...,...,...,...
99995,A.400,2267915,1.394436,0,995
99996,A.400,1608708,1.392781,0,996
99997,A.400,1326825,1.391632,0,997
99998,A.400,1601393,1.391448,0,998


In [4]:
from src.pre_processors.remove_xml_tags import RemoveXMLTagsFromDocumentBody, RemoveXMLTagsFromQueries
from arqmath_code.post_reader_record import DataReaderRecord
from src.post_processors.top_k_filter import TopKFilter
from src.post_processors.answer_score_retriever_for_questions import AnswerScoreRetrieverForQuestions
from src.sbert.question_s_bert import QuestionSBERT
from src.base.pipeline import Pipeline
from src.sbert.cross_encoder import SBertCrossEncoder


class SBertPipelineWithCrossEncoderWithoutTags(Pipeline):
    """Retrieve-and-re-rank pipeline that strips XML tags from its inputs first.

    Stages, in the order ``run`` applies them:

    1. ``RemoveXMLTagsFromDocumentBody`` / ``RemoveXMLTagsFromQueries`` pre-process
       the questions and the topics.
    2. ``QuestionSBERT`` ranks the pre-processed questions for the topics.
    3. ``AnswerScoreRetrieverForQuestions`` maps that ranking onto answers
       (presumably the answers of the ranked questions — confirm in its module).
    4. ``TopKFilter(k=top_k)`` bounds the number of candidates handed to the
       cross-encoder.
    5. ``SBertCrossEncoder`` re-scores the surviving candidates.
    6. A ``TopKFilter()`` with its default cut-off produces the final ranking.
    """

    def __init__(self, data_reader: DataReaderRecord, top_k: int = 5000,
                 bi_encoder_model_id: str = 'all-MiniLM-L6-v2',
                 cross_encoder_model_id: str = 'cross-encoder/ms-marco-TinyBERT-L-2-v2'):
        """Build all pipeline stages.

        Args:
            data_reader: Source of the question collection, forwarded to ``Pipeline``.
            top_k: Cut-off of the intermediate filter that runs before the
                cross-encoder. Defaults to the previously hard-coded 5000.
            bi_encoder_model_id: Model id passed to ``QuestionSBERT``.
            cross_encoder_model_id: Model id passed to ``SBertCrossEncoder``.
        """
        super().__init__(data_reader)
        # Kept so the progress message in run() always reports the value actually used,
        # instead of repeating a literal that can drift from the filter's configuration.
        self._candidate_k = top_k
        self.sbert = QuestionSBERT(model_id=bi_encoder_model_id)
        self.answer_score_retriever = AnswerScoreRetrieverForQuestions()
        self.top_k_filter = TopKFilter(k=top_k)
        self.final_top_k_filter = TopKFilter()
        self.cross_encoder = SBertCrossEncoder(cross_encoder_model_id)
        self.xml_question_tag_remover = RemoveXMLTagsFromDocumentBody()
        self.xml_topic_tag_remover = RemoveXMLTagsFromQueries()

    def run(self, queries: List[Topic]) -> List[Tuple[Topic, Answer, float]]:
        """Run every stage for ``queries`` and return the final ranking.

        Args:
            queries: Topics to retrieve answers for.

        Returns:
            The ranking produced by the final top-k filter, annotated as
            ``(topic, answer, score)`` tuples.
        """
        questions: List[Question] = self.data_reader.get_questions()
        print("Removing Tags")
        questions = self.xml_question_tag_remover(queries=queries, documents=questions)
        # From here on every stage receives the tag-stripped topics.
        queries = self.xml_topic_tag_remover(queries=queries)
        print("Bi-Encoding")
        # NOTE(review): the recorded run of this notebook logs "read from cached
        # embeddings ... document_embeddings_all-MiniLM-L6-v2.npy" right after tag
        # removal. If that cache is keyed by model id only, the tag-stripped documents
        # are not re-embedded here — verify in QuestionSBERT.
        ranking: List[Tuple[Topic, Union[Question, Answer], float]] = self.sbert(
            queries=queries, documents=questions)
        print("Retrieving Answers")
        ranking = self.answer_score_retriever(queries=queries, ranking=ranking)
        print(f"Applying Top K={self._candidate_k} Filter")
        ranking = self.top_k_filter(queries=queries, ranking=ranking)
        print("Cross Encoding")
        ranking = self.cross_encoder(queries=queries, ranking=ranking)
        print("Applying final top k filter")
        ranking = self.final_top_k_filter(queries=queries, ranking=ranking)
        return ranking

In [5]:
from src.runner import Runner
from datetime import datetime

# Timestamps before and after the run make the wall-clock duration visible in the output.
print(datetime.now())
runner = Runner(
    SBertPipelineWithCrossEncoderWithoutTags,
    n=1,
    data_reader=data_reader,
    topic_reader=topic_reader,
)
ranking = runner.run("../results/model_results/SBert-Cross-Encoder-Without-Tags.tsv")
print(datetime.now())
# Bare last expression so the notebook renders the returned ranking.
ranking

2022-11-18 08:45:28.831178
Removing Tags




Bi-Encoding
read from cached embeddings at  ../arqmath_dataset/model_embeddings/document_embeddings_all-MiniLM-L6-v2.npy


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Similarities
Retrieving Answers
Applying Top K=5000 Filter
Cross Encoding


Batches:   0%|          | 0/15625 [00:00<?, ?it/s]

Applying final top k filter
2022-11-18 10:07:10.797290


Unnamed: 0,Topic_Id,Post_Id,Score,Run_Number,Rank
0,A.301,1777715,1.985368,0,0
1,A.301,534977,1.251464,0,1
2,A.301,1448576,0.414236,0,2
3,A.301,2616416,0.160473,0,3
4,A.301,1237404,0.004756,0,4
...,...,...,...,...,...
99995,A.400,571968,-9.499552,0,995
99996,A.400,2554379,-9.500296,0,996
99997,A.400,2123419,-9.501735,0,997
99998,A.400,2677352,-9.502047,0,998
