# Notebook for tf-idf information retrieval

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from src import init_data
from arqmath_code.topic_file_reader import Topic
from arqmath_code.Entities.Post import Post
import re

In [2]:
topic_reader, data_reader = init_data(task=1)

reading users
reading comments
reading votes
reading post links
reading posts


In [3]:
test_topic: Topic = topic_reader.get_topic('A.301')

## Remove html tags
As we are not interested in their word counts in the actual tf-idf vectors

In [4]:
test_topic.title = re.sub(r"</?(p|span)[^>]*>", "", test_topic.title)
test_topic.question = re.sub(r"</?(p|span)[^>]*>", "", test_topic.question)
test_topic.question

"Suppose $A$ is a $m\\times n$ matrix. Then Prove that, $\\begin{equation*} \\|A\\|_2\\leq \\sqrt{\\|A\\|_1 \\|A\\|_{\\infty}} \\end{equation*}$ I have proved the following relations: $\\begin{align*} \\frac{1}{\\sqrt{n}}\\|A\\|_{\\infty}\\leq \\|A\\|_2\\leq\\sqrt{m}\\|A\\|_{\\infty}\\\\ \\frac{1}{\\sqrt{m}}\\|A\\|_{1}\\leq \\|A\\|_2\\leq\\sqrt{n}\\|A\\|_{1} \\end{align*}$ Also I feel that somehow Holder's inequality for the special case when $p=1$ and $q=\\infty$ might be useful.But I couldn't prove that. Edit: I would like to have a prove that do not use the information that $\\|A\\|_2=\\sqrt{\\rho(A^TA)}$ Usage of inequalities like Cauchy Schwartz or Holder is fine. "

In [5]:
def remove_html_tags_from_post(post: Post) -> Post:
    post.body = re.sub(r"</?(p|span)[^>]*>", "", post.body)
    return post

## Retrieve Subset of Document Collection by tags
Equivalent to a binary retrieval of documents by only considering tags

In [6]:
potential_questions = [question for tag in test_topic.lst_tags for question in data_reader.get_question_of_tag(tag=tag)]
len(potential_questions)

134703

In [7]:
for question in potential_questions:
    question.title = re.sub(r"</?(p|span)[^>]*>", "", question.title)
    question = remove_html_tags_from_post(post=question)

potential_questions[0].body

"I was reading up on the Fibonacci Sequence,  \\text {{1,1,2,3,5,8,13,....}} when I've noticed some were able to calculate specific numbers. So far I've only figured out creating an array and counting to the value, which is incredibly simple, but I reckon I can't find any formula for calculating a Fibonacci number based on it's position.  Is there a way to do this? If so, how are we able to apply these formulas to arrays? "

## Create TF-IDF Vectors for Question Titles

In [8]:
# Basic example from sklearn
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)
print(x)
vectorizer.get_feature_names_out()

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [9]:
potential_question_titles = [question.title for question in potential_questions]
potential_question_titles

['How are we able to calculate specific numbers in the Fibonacci Sequence?',
 'Why is the matrix-defined Cross Product of two 3D vectors always orthogonal?',
 'What Is An Inner Product Space?',
 'Calculating an Angle from 2 points in space',
 'Real world uses of Quaternions?',
 'Understanding Dot and Cross Product',
 'Recasting points from one vector space to another',
 'Why are 3D transformation matrices 4 \\times 4 instead of 3 \\times 3?',
 'Intuitive reasoning behind the Chain Rule in multiple variables?',
 'Can non-linear transformations be represented as Transformation Matrices?',
 'How can I tell which matrix decomposition to use for OLS?',
 'What is the most efficient way to determine if a matrix is invertible?',
 'Balance chemical equations without trial and error?',
 "What's an intuitive way to think about the determinant?",
 'Are there variations on least-squares approximations?',
 'How to visualize a rank-2 tensor?',
 'What is linear programming?',
 'Determinants and volume

In [10]:
training_set = potential_question_titles.copy()
training_set.append(test_topic.title)

vectorizer = TfidfVectorizer()
vectorizer.fit(training_set)
question_vector = vectorizer.transform([test_topic.title])
print(type(question_vector))
print(question_vector)
print(len(vectorizer.get_feature_names_out()))

<class 'scipy.sparse._csr.csr_matrix'>
  (0, 10166)	0.11446821960392002
  (0, 9964)	0.8019740456643389
  (0, 9104)	0.20969552675584605
  (0, 7561)	0.36768842996473255
  (0, 7487)	0.22592426720406075
  (0, 2426)	0.2899564294028315
  (0, 1717)	0.17161339063840242
16014


In [12]:
potential_question_titles_tfidf = vectorizer.transform(potential_question_titles)
print(potential_question_titles_tfidf)

  (0, 15433)	0.2822514614177647
  (0, 14369)	0.14425964956099005
  (0, 14202)	0.11993950964863516
  (0, 13282)	0.3348006060575183
  (0, 12808)	0.32622630520529095
  (0, 10054)	0.28524509611453674
  (0, 7347)	0.16014506611683296
  (0, 7070)	0.1707947768204248
  (0, 6000)	0.4238500495505215
  (0, 2908)	0.29827571435293265
  (0, 1911)	0.2049798837242443
  (0, 1301)	0.46883541907636717
  (1, 15518)	0.27010149302520337
  (1, 15230)	0.25821128465534077
  (1, 14755)	0.25221064999935205
  (1, 14202)	0.1379094671268448
  (1, 11353)	0.2509348190524657
  (1, 10333)	0.2768951254927041
  (1, 10166)	0.1094207975578857
  (1, 9112)	0.13401143708727692
  (1, 7867)	0.1548331438696508
  (1, 4370)	0.36890365845180645
  (1, 4098)	0.4065935625885897
  (1, 1663)	0.3630441010345705
  (1, 654)	0.3885732319766841
  :	:
  (134700, 15566)	0.2915569467883864
  (134700, 15060)	0.23250539039904547
  (134700, 14259)	0.19451237933867363
  (134700, 12908)	0.19870039043675372
  (134700, 10166)	0.08943996238066884
  (134

## Compute Cosine Sim between query and tfidf-word-term-matrix

In [15]:
cos_sim = cosine_similarity(question_vector, potential_question_titles_tfidf)
print(type(cos_sim))
print(cos_sim)
cos_sim

<class 'numpy.ndarray'>
[[0.         0.0125252  0.         ... 0.01023803 0.06063566 0.16423073]]


array([[0.        , 0.0125252 , 0.        , ..., 0.01023803, 0.06063566,
        0.16423073]])

## Rank results
By sorting the cosine sims desc

In [19]:

ranking = sorted(zip(range(cos_sim.shape[1]), cos_sim[0,]), key=lambda tuple: tuple[1], reverse=True)
ranking

[(94646, 0.87244833072917),
 (132337, 0.87244833072917),
 (133667, 0.8164768795119595),
 (26495, 0.8160505307722377),
 (90114, 0.8160505307722377),
 (40583, 0.8158120909193725),
 (132573, 0.8158120909193725),
 (120432, 0.8095856950394776),
 (114101, 0.7583560456863047),
 (115099, 0.7583560456863047),
 (122487, 0.7583560456863047),
 (133481, 0.7564631834572546),
 (55650, 0.7563694662953386),
 (125547, 0.7563694662953386),
 (133373, 0.7563694662953386),
 (96893, 0.7366658335422877),
 (41978, 0.7161609601976903),
 (97286, 0.7161609601976903),
 (132666, 0.7161609601976903),
 (132329, 0.7106661383047207),
 (43876, 0.7102325648634575),
 (79829, 0.7102325648634575),
 (130823, 0.7102325648634575),
 (4124, 0.704674332748856),
 (44639, 0.704674332748856),
 (80395, 0.704674332748856),
 (85013, 0.704674332748856),
 (98365, 0.704674332748856),
 (105956, 0.704674332748856),
 (130871, 0.704674332748856),
 (133298, 0.704674332748856),
 (133698, 0.704674332748856),
 (9197, 0.7039835324486547),
 (16213,