## ***Testing Phase: Cosine Similarity***

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from pylatexenc.latex2text import LatexNodes2Text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
# Create LaTeX code
math_latex_text = r"""\text{According to the theorem for} \ \Theta \ \text{- Notation}, \begin{equation}f(n) + g(n) = \in \Theta(max\{f(n),g(n)\}) \ \text{such that}, 0 
\leq c_{1}max\{f(n),g(n)\} \leq f(n) + g(n) \leq c_{2}max\{f(n),g(n)\} \ \text{for all} \ n \geq n_{0}\end{equation}. \ \text{Which means for this statement to 
universally hold}, \begin{equation}f(n) + g(n) \in \Theta(max\{f(n),g(n)\}), \ \text{if and only if} \ f(n) + g(n) \in O(max\{f(n),g(n)\})\end{equation} \ \text{and} 
\ \begin{equation}f(n) + g(n) \in \Omega(max\{f(n),g(n)\})\end{equation}."""
print(math_latex_text)

\text{According to the theorem for} \ \Theta \ \text{- Notation}, \begin{equation}f(n) + g(n) = \in \Theta(max\{f(n),g(n)\}) \ \text{such that}, 0 
\leq c_{1}max\{f(n),g(n)\} \leq f(n) + g(n) \leq c_{2}max\{f(n),g(n)\} \ \text{for all} \ n \geq n_{0}\end{equation}. \ \text{Which means for this statement to 
universally hold}, \begin{equation}f(n) + g(n) \in \Theta(max\{f(n),g(n)\}), \ \text{if and only if} \ f(n) + g(n) \in O(max\{f(n),g(n)\})\end{equation} \ \text{and} 
\ \begin{equation}f(n) + g(n) \in \Omega(max\{f(n),g(n)\})\end{equation}.


In [4]:
# Convert LaTeX code to regular python unicode text
math_python_text = LatexNodes2Text().latex_to_text(math_latex_text)
print(math_python_text)

According to the theorem for  Θ - Notation, 
    f(n) + g(n) = ∈Θ(max{f(n),g(n)})  such that, 0 
    ≤ c_1max{f(n),g(n)}≤ f(n) + g(n) ≤ c_2max{f(n),g(n)} for all n ≥ n_0
.  Which means for this statement to 
universally hold, 
    f(n) + g(n) ∈Θ(max{f(n),g(n)}),  if and only if f(n) + g(n) ∈ O(max{f(n),g(n)})
  and 
 
    f(n) + g(n) ∈Ω(max{f(n),g(n)})
.


In [5]:
# Create regular Python unicode text
omega_wolf_text = """Omega wolves are the lowest-ranking, most-submissive, and timid members of the group — at least according 
to older interpretations of wolf pack ranks. They were often thought to be the scapegoats or 'punching bags' of the pack, absorbing 
aggression from others and living on the outskirts."""
print(omega_wolf_text)

Omega wolves are the lowest-ranking, most-submissive, and timid members of the group — at least according 
to older interpretations of wolf pack ranks. They were often thought to be the scapegoats or 'punching bags' of the pack, absorbing 
aggression from others and living on the outskirts.


In [6]:
# Initialize stopwords
stop_words = set(stopwords.words('english'))

In [7]:
# Remove stopwords in math text
words_in_math_sentence = word_tokenize(math_python_text)
math_filtered_words = " ".join([word.lower() for word in words_in_math_sentence 
                  if word.lower() not in stop_words])
print(math_filtered_words)

according theorem θ - notation , f ( n ) + g ( n ) = ∈θ ( max { f ( n ) , g ( n ) } ) , 0 ≤ c_1max { f ( n ) , g ( n ) } ≤ f ( n ) + g ( n ) ≤ c_2max { f ( n ) , g ( n ) } n ≥ n_0 . means statement universally hold , f ( n ) + g ( n ) ∈θ ( max { f ( n ) , g ( n ) } ) , f ( n ) + g ( n ) ∈ ( max { f ( n ) , g ( n ) } ) f ( n ) + g ( n ) ∈ω ( max { f ( n ) , g ( n ) } ) .


In [8]:
# Remove stopwords in omega wolf text
words_in_omega_sentence = word_tokenize(omega_wolf_text)
omega_filtered_words = " ".join([word for word in words_in_omega_sentence 
                  if word.lower() not in stop_words])
print(omega_filtered_words)

Omega wolves lowest-ranking , most-submissive , timid members group — least according older interpretations wolf pack ranks . often thought scapegoats 'punching bags ' pack , absorbing aggression others living outskirts .


In [9]:
# Create a Document-Term Matrix For The Dataset
textual_contents = [math_filtered_words, omega_filtered_words]
count_vectorizer = CountVectorizer()
input_matrix = count_vectorizer.fit_transform(textual_contents)
document_term_matrix = input_matrix.todense()
document_term_matrix

matrix([[0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 4, 1, 0, 0, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0],
        [1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
         1, 1, 2, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1]])

In [10]:
# Create a DataFrame For The Dataset
test_df = pd.DataFrame(document_term_matrix, 
                       columns=count_vectorizer.get_feature_names_out(),
                       index=["Math_Text","Omega_Text"])
test_df

Unnamed: 0,absorbing,according,aggression,bags,c_1max,c_2max,group,hold,interpretations,least,...,ranks,scapegoats,statement,submissive,theorem,thought,timid,universally,wolf,wolves
Math_Text,0,1,0,0,1,1,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
Omega_Text,1,1,1,1,0,0,1,0,1,1,...,1,1,0,1,0,1,1,0,1,1


In [11]:
print(test_df)
print(cosine_similarity(test_df,test_df))

            absorbing  according  aggression  bags  c_1max  c_2max  group  \
Math_Text           0          1           0     0       1       1      0   
Omega_Text          1          1           1     1       0       0      1   

            hold  interpretations  least  ...  ranks  scapegoats  statement  \
Math_Text      1                0      0  ...      0           0          1   
Omega_Text     0                1      1  ...      1           1          0   

            submissive  theorem  thought  timid  universally  wolf  wolves  
Math_Text            0        1        0      0            1     0       0  
Omega_Text           1        0        1      1            0     1       1  

[2 rows x 36 columns]
[[1.         0.03641785]
 [0.03641785 1.        ]]
