# Singular Value Decomposition (SVD)


In [1]:
from src.base.pipeline import Pipeline
from src.runner import Runner
from typing import Tuple, List
from arqmath_code.Entities.Post import Answer
from arqmath_code.topic_file_reader import Topic

## SVD pipeline

In [2]:
from src.pre_processors.nltk_tokenization_and_stopword_removal import NLTKTokenizationAndStopwordRemoval, \
    NLTKTokenizationAndStopwordRemovalForQueries
from src.pre_processors.remove_xml_tags import RemoveXMLTagsFromDocumentBody, RemoveXMLTagsFromQueries
from src.latent.single_value_decomposition import SingleValueDecompositionModel
from src.post_processors.top_k_filter import TopKFilter
from arqmath_code.post_reader_record import DataReaderRecord


class SVDPipeline(Pipeline):
    """Pipeline that scores answer posts for each topic with the SVD model.

    Stages, in the order ``run`` applies them: XML-tag removal and NLTK
    tokenisation/stop-word removal for the answer posts, the same two steps
    for the queries, scoring by ``SingleValueDecompositionModel``, and a
    final ``TopKFilter`` constructed with ``k=1000``.
    """

    def __init__(self, data_reader: DataReaderRecord):
        """Build every stage of the pipeline.

        :param data_reader: reader handed to ``Pipeline.__init__``; ``run``
            later fetches the answer posts through ``self.data_reader``.
        """
        super().__init__(data_reader)
        # Scoring model and the post-processing filter applied to its output.
        self.svd_model = SingleValueDecompositionModel()
        self.top1000 = TopKFilter(k=1000)
        # Pre-processors for the answer posts (the "documents").
        self.document_tag_remover = RemoveXMLTagsFromDocumentBody()
        self.document_tokenizer = NLTKTokenizationAndStopwordRemoval()
        # Pre-processors for the topics (the "queries").
        self.query_tag_remover = RemoveXMLTagsFromQueries()
        self.query_tokenizer = NLTKTokenizationAndStopwordRemovalForQueries()

    def run(self, queries: List[Topic]) -> List[Tuple[Topic, Answer, float]]:
        """Pre-process answers and queries, score them, and filter the result.

        Progress is reported on stdout before each stage.

        :param queries: topics to retrieve answers for.
        :return: the output of the top-k filter; per the annotation, a list
            of ``(topic, answer, score)`` tuples.
        """
        answer_posts = self.data_reader.get_all_answer_posts()

        print("Start document preprocessing")
        # The document pre-processors receive the raw, not yet pre-processed,
        # queries as their first argument.
        untagged_answers = self.document_tag_remover(queries, answer_posts)
        prepared_documents = self.document_tokenizer(queries, untagged_answers)

        print("Start query preprocessing")
        untagged_queries = self.query_tag_remover(queries)
        prepared_queries = self.query_tokenizer(untagged_queries)

        print("Start ranking")
        scored = self.svd_model(queries=prepared_queries, documents=prepared_documents)

        print("Start top 1000 filtering")
        # The filter is deliberately called with queries=None, as before.
        return self.top1000(queries=None, ranking=scored)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Run pipeline

In [7]:
from datetime import datetime
# Print a wall-clock timestamp before and after the run so the total
# duration can be read off the cell output.
run_started_at = datetime.now()
print(run_started_at)

# NOTE(review): the path is presumably where Runner stores the ranking, and
# the meaning of n=1 is defined by Runner -- confirm against src.runner.
runner = Runner(SVDPipeline, n=1)
ranking = runner.run("../results/model_results/svd.tsv")

run_finished_at = datetime.now()
print(run_finished_at)

# Bare expression: the notebook displays the returned ranking.
ranking

2022-11-24 04:18:32.260540
reading users
reading comments
reading votes
reading post links
reading posts
Start document preprocessing
Start query preprocessing
Start ranking
Finished count vectorizer
Finished SVD embedding
Start top 1000 filtering
2022-11-24 09:34:57.746370


Unnamed: 0,Topic_Id,Post_Id,Score,Run_Number,Rank
0,A.301,191021,0.850654,0,0
1,A.301,1308036,0.811295,0,1
2,A.301,250821,0.796495,0,2
3,A.301,733611,0.784887,0,3
4,A.301,323193,0.780250,0,4
...,...,...,...,...,...
99995,A.400,2450517,0.848617,0,995
99996,A.400,1789450,0.848611,0,996
99997,A.400,2872877,0.848602,0,997
99998,A.400,1890120,0.848561,0,998


## Evaluation

In [2]:
from arqmath_code.evaluation.task1 import arqmath_to_prime_task1
from arqmath_code.evaluation.task1 import task1_get_results

In [3]:
# Read the official 2022 Task 1 qrel file into a dictionary.
qrel_dictionary = arqmath_to_prime_task1.read_qrel_to_dictionary(
    "../arqmath_dataset/evaluation/Task 1/Qrel Files/qrel_task1_2022_official.tsv"
)

# Convert the result files found in the submission directory, using the qrel
# dictionary; output goes to the "prim" and "trec" directories given below.
# NOTE(review): exact output formats are defined by arqmath_to_prime_task1 -- confirm there.
arqmath_to_prime_task1.convert_result_files_to_trec(
    submission_dir="../results/model_results/",
    qrel_result_dic=qrel_dictionary,
    prim_dir="../results/ARQmath_prim/",
    trec_dir="../results/ARQmath_trec/",
)

In [None]:
# NOTE(review): 78 is presumably the number of assessed topics in the 2022
# Task 1 qrels -- confirm against the qrel file.
number_topics = 78

# Evaluate the converted ("prim") result files against the official qrels
# with the trec_eval tool and write the summary to complete_results_v1.tsv.
task1_get_results.get_result(
    trec_eval_tool="trec_eval",
    qre_file_path="../arqmath_dataset/evaluation/Task 1/Qrel Files/qrel_task1_2022_official.tsv",
    prim_result_dir="../results/ARQmath_prim/",
    evaluation_result_file="../results/complete_results_v1.tsv",
    number_topics=number_topics,
)