# Evaluation Example
This notebook shows how to evaluate a model, using a simple TF-IDF model built with scikit-learn as the example.

## TF-IDF Test model

In [1]:
import scipy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from src import init_data
from arqmath_code.topic_file_reader import Topic
from arqmath_code.Entities.Post import Post
from typing import List
import re

# Build the two readers used by the rest of the notebook for task 1:
# `topic_reader` is queried for topics, `data_reader` for questions by tag.
# NOTE(review): the "reading ..." lines in this cell's output presumably come from this call — confirm.
topic_reader, data_reader = init_data(task=1)

reading users
reading comments
reading votes
reading post links
reading posts


In [2]:
def title_tf_idf_model(query: Topic, answers: List[Post], top_k: int = 1000) -> List[tuple[int, float]]:
    """Rank candidate posts by TF-IDF cosine similarity of their titles to the query title.

    The vectorizer is fitted on the candidate titles plus the query title, so
    every term of the query is part of the vocabulary.

    Args:
        query: Topic whose ``title`` is used as the query text.
        answers: Candidate posts; only their ``title`` attribute is read.
        top_k: Maximum number of ranked entries to return. Defaults to 1000,
            the cut-off that was previously hard-coded.

    Returns:
        At most ``top_k`` ``(index, similarity)`` pairs sorted by descending
        similarity, where ``index`` is the position of the post in ``answers``.
        Posts with equal similarity keep their original relative order.
        An empty list is returned when ``answers`` is empty.
    """
    # With no candidates there is nothing to rank; returning early also avoids
    # handing scikit-learn an empty candidate matrix.
    if not answers:
        return []

    candidate_titles: List[str] = [post.title for post in answers]

    vectorizer: TfidfVectorizer = TfidfVectorizer()
    vectorizer.fit(candidate_titles + [query.title])
    query_vector: scipy.sparse.csr_matrix = vectorizer.transform([query.title])
    title_matrix: scipy.sparse.csr_matrix = vectorizer.transform(candidate_titles)

    # Result has shape (1, len(answers)): one row for the single query.
    cos_sims: np.ndarray = cosine_similarity(query_vector, title_matrix)

    # sorted() is stable, so ties stay in candidate order even with reverse=True.
    ranking: List[tuple[int, float]] = sorted(
        enumerate(cos_sims[0]), key=lambda pair: pair[1], reverse=True
    )
    return ranking[:top_k]

def binary_tag_retrieval(query: Topic) -> List[Post]:
    """Collect the questions that ``data_reader`` returns for each tag of the query.

    Tags are visited in the order of ``query.lst_tags`` and the per-tag results
    are concatenated as they come. Nothing here removes repeats, so a question
    returned for more than one of the tags appears once per matching tag.
    """
    matches: List[Post] = []
    for tag in query.lst_tags:
        matches.extend(data_reader.get_question_of_tag(tag=tag))
    return matches

# Matches opening and closing <p> / <span> tags, including any attributes.
_MARKUP_TAG = re.compile(r"</?(p|span)[^>]*>")


def _strip_markup(text: str) -> str:
    """Return ``text`` with every <p> / <span> tag removed."""
    return _MARKUP_TAG.sub("", text)


def clean_post(query: Topic, answers: List[Post]) -> tuple[Topic, List[Post]]:
    """Remove <p> / <span> markup from the query and from every candidate post.

    The objects are modified in place: ``query.title`` and ``query.question``
    as well as each post's ``body`` and ``title`` are overwritten with the
    stripped text.

    Args:
        query: Topic to clean.
        answers: Posts to clean.

    Returns:
        The same ``query`` and ``answers`` objects that were passed in, so the
        call can be used in an assignment.
    """
    query.title = _strip_markup(query.title)
    query.question = _strip_markup(query.question)
    for answer in answers:
        answer.body = _strip_markup(answer.body)
        answer.title = _strip_markup(answer.title)
    return query, answers

In [3]:
# Retrieve a ranking for one example topic.
# 1. Look up topic 'A.301' in the topic reader.
test_topic: Topic = topic_reader.get_topic('A.301')
# 2. Candidate set: every question returned for any of the topic's tags.
answers: List[Post] = binary_tag_retrieval(test_topic)
# 3. Strip <p>/<span> markup; clean_post edits the objects in place and returns them.
test_topic, answers = clean_post(test_topic, answers=answers)
# 4. Rank the candidates by title similarity. As the last expression of the cell
#    the result is displayed: (position in `answers`, cosine similarity), best first.
title_tf_idf_model(query=test_topic, answers=answers)

[(94646, 0.87244833072917),
 (132337, 0.87244833072917),
 (133667, 0.8164768795119595),
 (26495, 0.8160505307722377),
 (90114, 0.8160505307722377),
 (40583, 0.8158120909193725),
 (132573, 0.8158120909193725),
 (120432, 0.8095856950394776),
 (114101, 0.7583560456863047),
 (115099, 0.7583560456863047),
 (122487, 0.7583560456863047),
 (133481, 0.7564631834572546),
 (55650, 0.7563694662953386),
 (125547, 0.7563694662953386),
 (133373, 0.7563694662953386),
 (96893, 0.7366658335422877),
 (41978, 0.7161609601976903),
 (97286, 0.7161609601976903),
 (132666, 0.7161609601976903),
 (132329, 0.7106661383047207),
 (43876, 0.7102325648634575),
 (79829, 0.7102325648634575),
 (130823, 0.7102325648634575),
 (4124, 0.704674332748856),
 (44639, 0.704674332748856),
 (80395, 0.704674332748856),
 (85013, 0.704674332748856),
 (98365, 0.704674332748856),
 (105956, 0.704674332748856),
 (130871, 0.704674332748856),
 (133298, 0.704674332748856),
 (133698, 0.704674332748856),
 (9197, 0.7039835324486547),
 (16213,