## ***Topic Modelling Technique: Non-Negative Matrix Factorization***

In [2]:
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF


In [3]:
# Create LaTeX code
math_latex_text = r"""\text{According to the theorem for} \ \Theta \ \text{- Notation}, \begin{equation}f(n) + g(n) = \in \Theta(max\{f(n),g(n)\}) \ \text{such that}, 0 
\leq c_{1}max\{f(n),g(n)\} \leq f(n) + g(n) \leq c_{2}max\{f(n),g(n)\} \ \text{for all} \ n \geq n_{0}\end{equation}. \ \text{Which means for this statement to 
universally hold}, \begin{equation}f(n) + g(n) \in \Theta(max\{f(n),g(n)\}), \ \text{if and only if} \ f(n) + g(n) \in O(max\{f(n),g(n)\})\end{equation} \ \text{and} 
\ \begin{equation}f(n) + g(n) \in \Omega(max\{f(n),g(n)\})\end{equation}."""
print(math_latex_text)

\text{According to the theorem for} \ \Theta \ \text{- Notation}, \begin{equation}f(n) + g(n) = \in \Theta(max\{f(n),g(n)\}) \ \text{such that}, 0 
\leq c_{1}max\{f(n),g(n)\} \leq f(n) + g(n) \leq c_{2}max\{f(n),g(n)\} \ \text{for all} \ n \geq n_{0}\end{equation}. \ \text{Which means for this statement to 
universally hold}, \begin{equation}f(n) + g(n) \in \Theta(max\{f(n),g(n)\}), \ \text{if and only if} \ f(n) + g(n) \in O(max\{f(n),g(n)\})\end{equation} \ \text{and} 
\ \begin{equation}f(n) + g(n) \in \Omega(max\{f(n),g(n)\})\end{equation}.


In [4]:
# Convert LaTeX code to regular python unicode text
math_python_text = LatexNodes2Text().latex_to_text(math_latex_text)
print(math_python_text)

According to the theorem for  Θ - Notation, 
    f(n) + g(n) = ∈Θ(max{f(n),g(n)})  such that, 0 
    ≤ c_1max{f(n),g(n)}≤ f(n) + g(n) ≤ c_2max{f(n),g(n)} for all n ≥ n_0
.  Which means for this statement to 
universally hold, 
    f(n) + g(n) ∈Θ(max{f(n),g(n)}),  if and only if f(n) + g(n) ∈ O(max{f(n),g(n)})
  and 
 
    f(n) + g(n) ∈Ω(max{f(n),g(n)})
.


In [5]:
# Initialize stopwords
stop_words = set(stopwords.words('english'))

In [6]:
# Remove stopwords in math text
words_in_math_sentence = word_tokenize(math_python_text)
math_filtered_words = " ".join([word for word in words_in_math_sentence 
                  if word.lower() not in stop_words])
print(math_filtered_words)

According theorem Θ - Notation , f ( n ) + g ( n ) = ∈Θ ( max { f ( n ) , g ( n ) } ) , 0 ≤ c_1max { f ( n ) , g ( n ) } ≤ f ( n ) + g ( n ) ≤ c_2max { f ( n ) , g ( n ) } n ≥ n_0 . means statement universally hold , f ( n ) + g ( n ) ∈Θ ( max { f ( n ) , g ( n ) } ) , f ( n ) + g ( n ) ∈ ( max { f ( n ) , g ( n ) } ) f ( n ) + g ( n ) ∈Ω ( max { f ( n ) , g ( n ) } ) .


In [7]:
# Create regular python unicode text
omega_wolf_text = """Omega wolves are the lowest-ranking, most-submissive, and timid members of the group — at least according 
to older interpretations of wolf pack ranks. They were often thought to be the scapegoats or 'punching bags' of the pack, absorbing 
aggression from others and living on the outskirts."""
print(omega_wolf_text)

Omega wolves are the lowest-ranking, most-submissive, and timid members of the group — at least according 
to older interpretations of wolf pack ranks. They were often thought to be the scapegoats or 'punching bags' of the pack, absorbing 
aggression from others and living on the outskirts.


In [8]:
# Remove stopwords in omega text
words_in_omega_sentence = word_tokenize(omega_wolf_text)
omega_filtered_words = " ".join([word for word in words_in_omega_sentence 
                  if word.lower() not in stop_words])
print(omega_filtered_words)

Omega wolves lowest-ranking , most-submissive , timid members group — least according older interpretations wolf pack ranks . often thought scapegoats 'punching bags ' pack , absorbing aggression others living outskirts .


In [9]:
# Create The Word Dataset
cleaned_paragraph_array = [math_filtered_words, omega_filtered_words]
testing_dataset = {'cleaned_paragraphs': cleaned_paragraph_array}
text_df = pd.DataFrame(testing_dataset)
text_df

Unnamed: 0,cleaned_paragraphs
0,"According theorem Θ - Notation , f ( n ) + g (..."
1,"Omega wolves lowest-ranking , most-submissive ..."


In [10]:
# Create a Document-Term Matrix For The Dataset
count_vectorizer = CountVectorizer(max_df=0.95, min_df=1)
document_term_words_matrix = count_vectorizer.fit_transform(text_df["cleaned_paragraphs"])
document_term_words_matrix.todense()

matrix([[0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 4, 1, 0, 0, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0],
        [1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
         1, 2, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1]])

In [11]:
# Apply The Non-Negative Matrix Factorization
NMF_Model = NMF(n_components=2,random_state=25)
NMF_Model.fit(document_term_words_matrix)


0,1,2
,n_components,2
,init,
,solver,'cd'
,beta_loss,'frobenius'
,tol,0.0001
,max_iter,200
,random_state,25
,alpha_W,0.0
,alpha_H,'same'
,l1_ratio,0.0


In [12]:
W = NMF_Model.fit_transform(document_term_words_matrix)
H = NMF_Model.components_
print(f"The W Matrix:\n")
print(f"{W}\n")

print(f"The H Matrix:\n")
print(f"{H}\n")

The W Matrix:

[[0.         1.15525307]
 [1.03958516 0.        ]]

The H Matrix:

[[0.96192216 0.96192216 0.96192216 0.         0.         0.96192216
  0.         0.96192216 0.96192216 0.96192216 0.96192216 0.
  0.         0.96192216 0.96192216 0.         0.         0.96192216
  0.96192216 0.96192216 0.96192216 0.96192216 1.92384432 0.96192216
  0.96192216 0.96192216 0.96192216 0.         0.96192216 0.
  0.96192216 0.96192216 0.         0.96192216 0.96192216]
 [0.         0.         0.         0.8656112  0.8656112  0.
  0.8656112  0.         0.         0.         0.         3.46244482
  0.8656112  0.         0.         0.8656112  0.8656112  0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.8656112  0.         0.8656112
  0.         0.         0.8656112  0.         0.        ]]



In [13]:
# Create function that examines the top words in each topic
# to computationally understand what each topic represent.
def display_the_subject_matter(H, feature_names, num_top_words):
    for topic_id, topic in enumerate(H):
        print(f"Topic #{topic_id+1}:")
        print("    " + " ".join([feature_names[i] 
                        for i in topic.argsort()[:num_top_words - 1:-1]]))
    print("")

In [14]:
# Utilize the function to computationally interpret the topics
number_of_top_words = 25
feature_names_list = count_vectorizer.get_feature_names_out()
display_the_subject_matter(H,feature_names_list,number_of_top_words)

Topic #1:
    pack wolf wolves absorbing timid submissive thought ranking ranks punching
Topic #2:
    max theorem universally c_1max means notation statement hold c_2max n_0

