# PyTerrier retrieval

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from pandas import DataFrame

In [2]:
from src import init_data
topic_reader, data_reader = init_data(task=1)

reading users
reading comments
reading votes
reading post links
reading posts


In [3]:
queries = list(topic_reader.map_topics.values())[0:5]
documents = data_reader.get_all_answer_posts()

In [11]:
from bs4 import BeautifulSoup

for document in documents:
    document.body = BeautifulSoup(document.body).text

In [13]:
documents[1].body

"You use a proof by contradiction. Basically, you suppose that \\sqrt{2} can be written as p/q. Then you know that 2q^2 = p^2. However, both q^2 and p^2 have an even number of factors of two, so 2q^2 has an odd number of factors of 2, which means it can't be equal to p^2. "

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

In [22]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_word_set = set(stopwords.words('english'))

In [25]:
for document in documents:
    text_tokens = word_tokenize(document.body)
    tokens_without_sw = [word for word in text_tokens if not word in stop_word_set]
    filtered_sentence = (" ").join(tokens_without_sw)
    document.body = filtered_sentence

In [31]:
documents[1].body

"You use proof contradiction . Basically , suppose \\sqrt { 2 } written p/q . Then know 2q^2 = p^2 . However , q^2 p^2 even number factors two , 2q^2 odd number factors 2 , means ca n't equal p^2 ."

In [32]:
collection: DataFrame = DataFrame([document.body for document in documents], columns=["text"])
count_vectorizer = CountVectorizer(max_df=0.90, min_df=2, stop_words="english", lowercase=True)
document_term_matrix = count_vectorizer.fit_transform(collection["text"])

In [33]:
document_term_matrix

<1445495x170326 sparse matrix of type '<class 'numpy.int64'>'
	with 43009338 stored elements in Compressed Sparse Row format>

In [34]:
count_vectorizer.get_feature_names_out()

array(['00', '000', '0000', ..., 'ﬂoer', 'ﬂow', 'ﬂuid'], dtype=object)

In [37]:
lda = LatentDirichletAllocation(n_components=10, random_state=2, n_jobs=-1)
document_topics = lda.fit_transform(document_term_matrix)

In [38]:
print(document_topics.shape)

(1445495, 10)


In [50]:
for topic in queries:
    topic.question = BeautifulSoup(topic.question).text

In [51]:
for topic in queries:
    text_tokens = word_tokenize(topic.question)
    tokens_without_sw = [word for word in text_tokens if not word in stop_word_set]
    filtered_sentence = (" ").join(tokens_without_sw)
    topic.question = filtered_sentence

In [52]:
queries[2].question

"Theorem- Up isomorphism , noncommutative Lie algebra dimension 2 basis $ x , $ bracket determined $ [ x , ] = x $ . I understand vector spaces dimension 2 field $ K $ isomorphic . So number lie algebras dimension 2 field $ K $ determined number possible bilinear operations [ ] $ : \\ V \\ X \\ V \\rightarrow V $ satisfying conditions $ ) $ $ [ x , x ] =0 $ $ x\\in V $ $ b ) $ $ [ x , [ , z ] ] + [ , [ z , x ] ] + [ z , [ x , ] ] =0 $ $ x , , z \\in V $ The bilinear operations hand determined elements pair base elements mapped bilinear operation . And since lie algebra $ [ x , x ] = [ , ] =0 $ $ [ x , ] =- [ x , ] $ ony need determine $ [ x , ] $ . Now prove $ [ x , ] =x $ $ [ , x ] =-x $ always ca n't [ , x ] =y vector ?"

In [57]:
query_frame: DataFrame = DataFrame([topic.question for topic in queries], columns=["text"])
query_term_matrix = count_vectorizer.transform(query_frame["text"])
query_term_matrix

<5x170326 sparse matrix of type '<class 'numpy.int64'>'
	with 167 stored elements in Compressed Sparse Row format>

In [58]:
query_topics = lda.transform(query_term_matrix)
print(query_topics.shape)

(5, 10)


In [59]:
cos_sims: np.ndarray = cosine_similarity(query_topics, document_topics)
print(cos_sims.shape)

(5, 1445495)


In [60]:
cos_sims

array([[0.3493219 , 0.37338611, 0.29517672, ..., 0.48512244, 0.0967937 ,
        0.04478709],
       [0.49077973, 0.84159973, 0.41985897, ..., 0.12774375, 0.025076  ,
        0.13921919],
       [0.45547321, 0.01332057, 0.53542274, ..., 0.65988109, 0.01232642,
        0.08817407],
       [0.96342296, 0.00967005, 0.85189752, ..., 0.14006342, 0.1231762 ,
        0.00942064],
       [0.75154976, 0.01389667, 0.64722963, ..., 0.01248457, 0.12543602,
        0.62064551]])

In [None]:
result = []
for i, query in enumerate(queries):
   per_query = list(zip(range(cos_sims.shape[1]), cos_sims[i,]))
   print(per_query)
   for j in per_query:
       result.append((query, documents[j[0]], j[1]))

In [72]:
for i, query in enumerate(queries):
   per_query = list(zip(range(cos_sims.shape[1]), cos_sims[i,]))
   print(per_query[0:15])
   print("--------")

[(0, 0.3493218997081761), (1, 0.3733861107209012), (2, 0.2951767221656128), (3, 0.24473222932434932), (4, 0.26441212813020604), (5, 0.3512625705970236), (6, 0.47372265494681376), (7, 0.5298696610068077), (8, 0.03258188170305733), (9, 0.41138828528560184), (10, 0.3714948318224958), (11, 0.07534578092771996), (12, 0.3811565129037752), (13, 0.3569050843826427), (14, 0.42483115285959505)]
--------
[(0, 0.49077973134598635), (1, 0.8415997346755468), (2, 0.41985897406443146), (3, 0.34784983713436457), (4, 0.37323239439289263), (5, 0.4954953954155038), (6, 0.8381238205433654), (7, 0.5639275304248651), (8, 0.031204747769877864), (9, 0.5031950094290649), (10, 0.11652081823308097), (11, 0.06001703438253045), (12, 0.5189988519317928), (13, 0.5002850446385844), (14, 0.7745021338629381)]
--------
[(0, 0.45547321255973733), (1, 0.013320572661420515), (2, 0.5354227436716302), (3, 0.5586757361144622), (4, 0.5037997559042424), (5, 0.38190163991568904), (6, 0.00977179137583973), (7, 0.36911674804866434)

In [65]:
print(result[3])

(<arqmath_code.topic_file_reader.Topic object at 0x107b031f0>, <arqmath_code.Entities.Post.Answer object at 0x518fe5910>, 0.24473222932434932)
