# Base example

In [1]:
from typing import List, Union, Tuple

from arqmath_code.Entities.Post import Question, Answer
from arqmath_code.topic_file_reader import Topic
from src.base.model import Model
from src.base.pipeline import Pipeline
from src.base.post_processor import PostProcessor
from src.base.pre_processing import PreProcessor
from src.runner import Runner

## Create a model

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List, Union, Tuple

class TfIdfModel(Model):
    """Score documents against queries using TF-IDF cosine similarity.

    A new ``TfidfVectorizer`` is fitted on every call, using the document
    bodies together with the query texts as the fitting corpus.
    """

    def forward(self, queries: List[Topic], documents: List[Union[Question, Answer]]) -> List[
        Tuple[Topic, Union[Question, Answer], float]]:
        """Compute a similarity score for every (query, document) pair.

        :param queries: topics to score; ``query.question`` is used as the query text.
        :param documents: candidate posts; ``document.body`` is used as the document text.
        :return: one ``(query, document, score)`` tuple per pair, grouped by query
            in input order and, within a query, in document input order (unsorted).
        """
        # Nothing to score for an empty side; return early instead of handing
        # empty input to scikit-learn.
        if not queries or not documents:
            return []

        answer_bodys: List[str] = [answer.body for answer in documents]
        query_bodys: List[str] = [query.question for query in queries]

        # Fit on documents and queries together so both contribute to the vocabulary.
        vectorizer: TfidfVectorizer = TfidfVectorizer()
        vectorizer.fit(answer_bodys + query_bodys)
        query_vector: scipy.sparse.csr_matrix = vectorizer.transform(query_bodys)
        word_term_matrix: scipy.sparse.csr_matrix = vectorizer.transform(answer_bodys)

        # cos_sims[i, j] is the similarity between queries[i] and documents[j].
        cos_sims: np.ndarray = cosine_similarity(query_vector, word_term_matrix)

        res = []
        for i, query in enumerate(queries):
            # Use this query's own row; the previous code always read row 0,
            # which gave every query the scores of the first one.
            for document, score in zip(documents, cos_sims[i]):
                res.append((query, document, score))
        return res

## Pre-Processor example

In [3]:
import re
class BinaryTagRetrieval(PreProcessor): # only works with a single topic
    """Pre-filter that keeps the answers of questions sharing a tag with the first topic."""

    def forward(self, queries: List[Topic], documents: List[Union[Question, Answer]]) -> List[Union[Question, Answer]]:
        """Collect the answers of every question that shares at least one tag with ``queries[0]``.

        Only the first topic is consulted; any further topics are ignored.

        :param queries: topics; ``queries[0].lst_tags`` supplies the tags to match.
        :param documents: question posts exposing ``tags`` and ``answers``.
        :return: flat list of the answers of all matching questions whose
            ``answers`` is not ``None``, in document order. Empty when
            ``queries`` is empty.
        """
        if not queries:
            # No topic means there are no tags to match against.
            return []

        # Build the tag set once instead of once per document.
        topic_tags = set(queries[0].lst_tags)
        return [
            answer
            for question in documents
            if not topic_tags.isdisjoint(question.tags) and question.answers is not None
            for answer in question.answers
        ]

## Post-Processor example

In [4]:
class Top1000Filter(PostProcessor): # only works with a single topic
    """Post-processor that keeps the 1000 highest-scoring ranking entries."""

    def forward(self, queries: List[Topic], ranking: List[Tuple[Topic, Union[Question, Answer], float]]) -> List[
        Tuple[Topic, Union[Question, Answer], float]]:
        """Return the best 1000 entries of ``ranking``, highest score first.

        :param queries: not used by this filter.
        :param ranking: ``(topic, post, score)`` tuples; the score is the third element.
        :return: a new list, sorted by descending score and cut to at most
            1000 entries. The input list is left untouched.
        """
        def score_of(entry):
            return entry[2]

        # Sort a copy so the caller's list keeps its original order.
        by_score_desc = list(ranking)
        by_score_desc.sort(key=score_of, reverse=True)
        return by_score_desc[:1000]

## Pipeline example

In [5]:
def clean_post(query: Topic, answers: List[Answer]) -> Tuple[Topic, List[Answer]]:
    """Strip ``<p>`` and ``<span>`` tags from a topic and its candidate answers.

    Opening and closing tags (including any attributes) are removed; the text
    between them is kept. The objects are modified in place.

    :param query: topic whose ``title`` and ``question`` are cleaned.
    :param answers: posts whose ``body`` is cleaned.
    :return: the same ``query`` and ``answers`` objects that were passed in.
    """
    # The word boundary restricts the match to exactly p/span; without it,
    # tags that merely start with "p" (e.g. <pre>) were stripped as well.
    tag_pattern = re.compile(r"</?(?:p|span)\b[^>]*>")

    query.title = tag_pattern.sub("", query.title)
    query.question = tag_pattern.sub("", query.question)
    for answer in answers:
        answer.body = tag_pattern.sub("", answer.body)
    return query, answers

In [6]:
from arqmath_code.post_reader_record import DataReaderRecord


class TestPipeline(Pipeline):
    """Example pipeline: tag-based retrieval, tag stripping, TF-IDF scoring, top-1000 cut."""

    def __init__(self, data_reader: DataReaderRecord):
        """Create the three stages and hand ``data_reader`` to the base pipeline."""
        super().__init__(data_reader)
        self.tf_idf = TfIdfModel()
        self.bin_tag = BinaryTagRetrieval()
        self.top1000 = Top1000Filter()

    def run(self, queries: List[Topic]) -> List[Tuple[Topic, Answer, float]]:
        """Rank candidate answers for the first topic in ``queries``.

        Only ``queries[0]`` is processed; any further topics are ignored.

        :param queries: topics to run; must contain at least one element.
        :return: the ranking produced by the top-1000 post-processor.
        """
        topic = queries[0]

        # Stage 1: narrow all questions down to answers of tag-matching questions.
        candidates = self.bin_tag(queries=[topic], documents=self.data_reader.get_questions())

        # Stage 2: remove markup from the topic and from the candidates.
        topic, candidates = clean_post(query=topic, answers=candidates)

        # Stages 3 and 4: score the candidates, then keep the best-scoring ones.
        single_topic = [topic]
        scored = self.tf_idf(queries=single_topic, documents=candidates)
        return self.top1000(queries=single_topic, ranking=scored)


## Runner Usage

In [None]:
# Build a Runner around the TestPipeline class (the class itself is passed, not an instance).
# NOTE(review): n=1 presumably limits the run to a single topic — confirm against Runner.
runner = Runner(TestPipeline, n=1)
# NOTE(review): the path looks like the TSV file the results are written to — confirm against Runner.run.
ranking = runner.run("../results/test2.tsv")
# Bare last expression so the notebook displays the value returned by the run.
ranking

reading users
reading comments
reading votes
reading post links
reading posts
