# LDA study & extension

## Preprocessing

In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt

### Download Data


In [None]:
from sklearn.datasets import fetch_20newsgroups

# Optional: restrict the download to a few newsgroups by passing a list such as
# the one below via `categories=` (currently unused, all groups are loaded).
#categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space' , ]  # Choose desired categories
# Load the training split without irrelevant/biasing metadata (footers and
# quoted replies). Headers are NOT removed here; they are parsed for
# Subject/Keywords/Organization in the metadata-extraction cell.
data = fetch_20newsgroups(subset='train',  shuffle=True, random_state=42, remove=('footers', 'quotes'))
print("Number of documents:", len(data.data))

Number of documents: 11314


In [None]:
# Get the text data and target labels.
# `documents` holds the raw post texts; `labels` holds one integer per post
# that indexes into `data.target_names`.
documents = data.data
labels = data.target

# Maximum number of entries printed per list by the sample helpers below.
num_samples = 5


def printSampleDocuments(docs):
    """Print the first `num_samples` entries of `docs` with their category.

    Args:
        docs: Sequence of documents (raw strings or token lists) aligned with
            the module-level `labels`, so entry i belongs to category
            `data.target_names[labels[i]]`.
    """
    for i, doc in zip(range(num_samples), docs):
        print(f"Document {i+1}:")
        print(f"Category: {data.target_names[labels[i]]}")
        print(doc)
        print("---------------------------------------------------\n")


def printSampleMetadata(metadata):
    """Print the first `num_samples` entries of each metadata list.

    Each list is introduced by a heading. The heading is the name of the first
    module-level variable bound to that exact list object; when no such
    variable exists (for example the list was built inline) the heading falls
    back to ``<unnamed>`` instead of raising IndexError.

    Args:
        metadata: Iterable of lists (e.g. subjects, keywords, organizations).
    """
    for metaList in metadata:
        # Identity lookup (not equality) so two equal lists keep their own names.
        # Snapshot the namespace so the scan is not affected by concurrent changes.
        heading = next(
            (name for name, value in list(globals().items()) if value is metaList),
            "<unnamed>",
        )
        print(heading + ':')
        for _, entry in zip(range(num_samples), metaList):
            print(entry)
        print("\n")

# Show the first raw posts (header block still included) before any cleaning.
printSampleDocuments(documents)

### Metadata Extraction - cleaned_documents



In [None]:
# Separate every post into its header block (everything before the first blank
# line) and its body. Subject/Summary, Keywords and Organization values are
# collected from the header; the body blocks are re-joined as the cleaned text.
import re

cleaned_documents = []
subjects = []
keywords = []
organizations = []

for raw_post in documents:
    # re.split always yields at least one element, so the header always exists.
    header, *body_blocks = re.split(r'\n\s*\n', raw_post)

    subject_values = re.findall('Subject: (.*)\n', header)
    summary_values = re.findall('Summary: (.*)\n', header)
    keyword_values = re.findall('Keywords: (.*)\n', header)
    organization_values = re.findall('Organization: (.*)\n', header)

    subjects.append(" ".join(subject_values + summary_values))
    keywords.append(" ".join(keyword_values))
    organizations.append(" ".join(organization_values))

    # Dropping the header: only the blocks after the first one are kept.
    cleaned_documents.append('\n'.join(body_blocks))

printSampleDocuments(cleaned_documents)
printSampleMetadata([subjects, keywords, organizations])

### Removing Stopwords
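The stopword list and the filter function are defined up front so that the same filter can be re-applied after tokenization, lemmatization, and stemming, catching stopwords in every form they may take.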

In [None]:
# Fetch the NLTK stopword corpus that `stopwords.words('english')` reads below.
nltk.download('stopwords')

In [None]:
# Extra filler words added on top of NLTK's English stopwords.
custom = ["article", "writes", "entry", "date", "udel", "said", "tell", "think", "know", "just", "newsgroup",
          "line", "like", "does", "going", "make", "thanks", "could", "would", "re", "use", "good", "get", "also", "hey"]

# Kept as a set so membership tests in removeStopwords are O(1).
stopwords_list = set(stopwords.words('english'))
stopwords_list.update(custom)

In [None]:
def removeStopwords(documentList, stopword_set=None):
    """Return a copy of each token list with stopwords filtered out.

    Args:
        documentList: Iterable of token lists (one list per document).
        stopword_set: Optional container of tokens to drop. When omitted, the
            module-level `stopwords_list` is used, looked up at call time.

    Returns:
        A new list of new token lists; the inputs are not modified.
    """
    if stopword_set is None:
        stopword_set = stopwords_list
    return [
        [token for token in tokens if token not in stopword_set]
        for tokens in documentList
    ]

### Tokenization - tokenized_documents


In [None]:
# Settings read by tokenizeTextList:
#   REMOVE_DIGITS - when true, runs of digits are replaced by a space before tokenizing
#   MIN_TOKEN_LEN - default minimum length a token must have to be kept
REMOVE_DIGITS = True
MIN_TOKEN_LEN = 3

import nltk
# Tokenizer data required by nltk.word_tokenize.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead - confirm for the installed version.
nltk.download('punkt')

In [None]:
def tokenizeTextList(textList, filterByLength = MIN_TOKEN_LEN):
    """Tokenize every text and return one list of lower-cased tokens per text.

    Digit runs are blanked out first when REMOVE_DIGITS is set, then the
    characters , . / ' | are replaced by spaces. Tokens shorter than
    `filterByLength` characters are discarded.

    Args:
        textList: Iterable of strings.
        filterByLength: Minimum token length to keep.

    Returns:
        List of token lists, in the same order as `textList`.
    """
    def _tokens_of(raw):
        if REMOVE_DIGITS:
            raw = re.sub('[0-9]+', ' ', raw)
        raw = re.sub('[,./\'|]', ' ', raw)
        # Length is checked on the original token; lower-casing does not change it here.
        return [word.lower() for word in nltk.word_tokenize(raw) if len(word) >= filterByLength]

    return [_tokens_of(text) for text in textList]

# Tokenize bodies and metadata, then drop stopwords from each result.
# Keywords and organizations use a minimum length of 2 to keep abbreviations.
tokenized_documents = removeStopwords(tokenizeTextList(cleaned_documents, MIN_TOKEN_LEN))
tokenized_subjects = removeStopwords(tokenizeTextList(subjects, MIN_TOKEN_LEN))
tokenized_keywords = removeStopwords(tokenizeTextList(keywords, 2))
tokenized_organizations = removeStopwords(tokenizeTextList(organizations, 2))

printSampleDocuments(tokenized_documents)
printSampleMetadata([tokenized_subjects, tokenized_keywords, tokenized_organizations])

### Lemmatize tokens - lemmatized_documents


In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
# WordNet data for WordNetLemmatizer and the tagger model for nltk.pos_tag,
# both used by lemmatizeLists below.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:

lemmatizer = WordNetLemmatizer()

def lemmatizeLists(list):
    """Lemmatize every token list using part-of-speech aware lookups.

    Each token list is POS-tagged with nltk.pos_tag; tags starting with
    V, J or R select the WordNet verb, adjective or adverb category, and every
    other tag is treated as a noun.

    Args:
        list: Iterable of token lists (the name shadows the builtin but is kept
            for compatibility with existing callers).

    Returns:
        List of lemmatized token lists, in input order.
    """
    # First letter of the tagger's tag -> name of the WordNet POS constant.
    pos_attribute_by_initial = {'V': 'VERB', 'J': 'ADJ', 'R': 'ADV'}

    def _wordnet_pos(tag):
        attribute = pos_attribute_by_initial.get(tag[:1], 'NOUN')
        return getattr(nltk.corpus.wordnet, attribute)

    def _lemmatize_tokens(tokens):
        return [
            lemmatizer.lemmatize(word, pos = _wordnet_pos(tag))
            for word, tag in nltk.pos_tag(tokens)
        ]

    return [_lemmatize_tokens(tokens) for tokens in list]

# Lemmatize the tokenized bodies and metadata; stopwords are filtered again
# because lemmatization can turn a token into a listed stopword.
lemmatized_documents = removeStopwords(lemmatizeLists(tokenized_documents))
lemmatized_subjects = removeStopwords(lemmatizeLists(tokenized_subjects))
lemmatized_keywords = removeStopwords(lemmatizeLists(tokenized_keywords))
lemmatized_organizations = removeStopwords(lemmatizeLists(tokenized_organizations))

In [None]:
# Show the first samples after lemmatization and the second stopword pass.
printSampleDocuments(lemmatized_documents)
printSampleMetadata([lemmatized_subjects, lemmatized_keywords, lemmatized_organizations])

### Stem tokens - stemmed_documents

In [None]:
from nltk.stem import PorterStemmer

In [None]:
stemmer = PorterStemmer()

def stemmTokenLists(listOfLists):
    """Apply the Porter stemmer to every token of every token list.

    Args:
        listOfLists: Iterable of token lists.

    Returns:
        A new list holding one list of stemmed tokens per input list.
    """
    return [[stemmer.stem(word) for word in words] for words in listOfLists]

# Stem the lemmatized bodies and metadata, then run the stopword filter once
# more on the stemmed forms.
stemmed_documents = removeStopwords(stemmTokenLists(lemmatized_documents))
stemmed_subjects = removeStopwords(stemmTokenLists(lemmatized_subjects))
stemmed_keywords = removeStopwords(stemmTokenLists(lemmatized_keywords))
stemmed_organizations = removeStopwords(stemmTokenLists(lemmatized_organizations))

printSampleDocuments(stemmed_documents)
printSampleMetadata([stemmed_subjects, stemmed_keywords, stemmed_organizations])

In [None]:
# Number of documents remaining after the full preprocessing pipeline.
n_documents = len(stemmed_documents)
print("Number of documents:", n_documents)

Number of documents: 11314


## Model Train and hLDA

In [None]:
import logging
from pprint import pprint
from time import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Gridsearches

In [None]:

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# Create the LDA model
n_topics = 15  # Specify the number of topics
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)

# Pipeline: raw counts -> tf-idf weighting -> LDA topic model.
# LDA is an unsupervised topic model, not a classifier; no `scoring` is passed
# to GridSearchCV, so candidates are ranked by the pipeline's own `score`,
# which delegates to the final LDA step.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lda', lda_model),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 1.0), #, 0.75),
    'vect__min_df': (1, 2, 0.01), #, 3, 5, 0.05),
    #'vect__max_features': (None, 5000, 10000, 50000),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2')
}

# find the best parameters for the feature extraction feeding the topic model
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# The vectorizer expects strings, so each token list is re-joined with spaces.
token_documents = [' '.join(tokens) for tokens in lemmatized_documents]
pprint(token_documents[:5])

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
# Previously the heading above was printed with nothing after it.
pprint(parameters)
t0 = time()
# The labels are passed as y but every step here is unsupervised.
grid_search.fit(token_documents, data.target)
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:

# Pipeline: tf-idf features -> LDA topic model (reuses `lda_model` from the
# previous grid-search cell). LDA is an unsupervised topic model, not a
# classifier; candidates are ranked by the pipeline's own `score`.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('lda', lda_model),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')
}

# find the best tf-idf settings for the features feeding the topic model
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
# Previously the heading above was printed with nothing after it.
pprint(parameters)
t0 = time()
# `token_documents` comes from the previous grid-search cell; the labels are
# passed as y but every step here is unsupervised.
grid_search.fit(token_documents, data.target)
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# Show the per-candidate results of the grid search that was fitted last.
grid_search.cv_results_

### LDA & hLDA

In [None]:
HIERARCHICAL_LDA = True
INCLUDE_ORGANIZATIONS = False
HIERARCHICAL_WEIGHT = 5

# "Hierarchical" weighting: the subject and keyword tokens (optionally the
# organization tokens too) of each post are appended HIERARCHICAL_WEIGHT times
# to its body tokens, for every preprocessing stage. The token lists are
# extended in place, so running this cell again appends the metadata again.
if HIERARCHICAL_LDA:
    stage_groups = (
        (tokenized_documents, tokenized_subjects, tokenized_keywords, tokenized_organizations),
        (stemmed_documents, stemmed_subjects, stemmed_keywords, stemmed_organizations),
        (lemmatized_documents, lemmatized_subjects, lemmatized_keywords, lemmatized_organizations),
    )
    for doc_index in range(len(tokenized_documents)):
        for bodies, subject_lists, keyword_lists, organization_lists in stage_groups:
            boost = subject_lists[doc_index] + keyword_lists[doc_index]
            if INCLUDE_ORGANIZATIONS:
                boost = boost + organization_lists[doc_index]
            bodies[doc_index].extend(HIERARCHICAL_WEIGHT * boost)


# Parameters obtained from previous grid searches sampled on
# lemmatized_documents, since the search is time and power intensive.
count_vectorizer = CountVectorizer(min_df = 0.01, max_df = 0.5)
tfidf_vectorizer = TfidfVectorizer(min_df = 0.01, max_df = 0.5, norm = 'l2', use_idf = True)

# Only the lemmatized stage is vectorized; to try another stage, build the
# joined texts from tokenized_documents or stemmed_documents instead.
lemmatized_texts = [' '.join(tokens) for tokens in lemmatized_documents]
count_vector_lemm_docs = count_vectorizer.fit_transform(lemmatized_texts)
tfidf_vector_lemm_docs = tfidf_vectorizer.fit_transform(lemmatized_texts)

In [None]:
# Print the number of tokenized documents.
print(len(tokenized_documents))

11314


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Create the LDA model (replaces the `lda_model` used in the grid-search cells;
# this one uses the online learning method).
n_topics = 15
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42, learning_method="online")
# Fit the LDA model to the count-vectorized lemmatized documents; `output`
# receives the value returned by fit_transform.
output = lda_model.fit_transform(count_vector_lemm_docs)
# Alternative input: the tf-idf matrix instead of raw counts.
#output = lda_model.fit_transform(tfidf_vector_lemm_docs)

In [None]:
# Shape of the fit_transform result, then shape of the vectorized input.
print(output.shape)
print(count_vector_lemm_docs.shape)
#print(tfidf_vector_lemm_docs.shape)
# First 15 rows of the fitted components_ matrix (all rows when n_topics is 15).
print(lda_model.components_[:15])

## Show Topics

### topic-word distribution

In [None]:
n_top_words = 10
feature_names = count_vectorizer.get_feature_names_out()
#feature_names = tfidf_vectorizer.get_feature_names_out()

# For every topic row, sort the word indices by descending weight and keep the
# names of the first n_top_words.
topic_top_words = [
    [feature_names[word_idx] for word_idx in np.argsort(weights)[::-1][:n_top_words]]
    for weights in lda_model.components_
]

for topic_idx, top_words in enumerate(topic_top_words):
    print(f"Topic #{topic_idx}: {', '.join(top_words)}")

In [None]:
# Infer topic distribution for a document
document_index = 0
document_topic_distribution = lda_model.transform(count_vector_lemm_docs)
#document_topic_distribution = lda_model.transform(tfidf_vector_lemm_docs)
print(f"Topic distribution for Document #{document_index}:")
print(document_topic_distribution.shape)

### category topic matching

In [None]:
# Index of the highest-weighted topic for every document (one argmax per row).
dominant_topics = list(np.argmax(document_topic_distribution, axis=1))
print(dominant_topics)

[14, 14, 14, 13, 0, 6, 3, 12, 1, 1, 14, 6, 0, 4, 9, 9, 5, 14, 6, 1, 5, 12, 4, 1, 12, 4, 0, 0, 5, 14, 14, 14, 0, 3, 1, 5, 5, 5, 6, 2, 5, 4, 13, 14, 5, 1, 4, 14, 5, 3, 7, 8, 7, 5, 3, 13, 14, 9, 1, 0, 4, 10, 7, 9, 4, 5, 4, 3, 5, 9, 3, 2, 14, 14, 7, 7, 13, 14, 1, 7, 11, 10, 4, 4, 14, 11, 6, 6, 6, 0, 1, 3, 7, 1, 0, 14, 4, 2, 8, 4, 1, 5, 14, 7, 3, 6, 14, 14, 14, 9, 5, 7, 7, 5, 12, 5, 5, 13, 1, 0, 8, 14, 7, 8, 8, 5, 10, 5, 14, 5, 9, 3, 9, 3, 0, 8, 5, 7, 4, 5, 5, 12, 7, 14, 0, 5, 1, 3, 8, 3, 6, 6, 5, 0, 7, 4, 14, 8, 5, 5, 3, 2, 11, 14, 0, 7, 9, 1, 4, 14, 5, 0, 12, 1, 12, 7, 0, 6, 5, 14, 5, 14, 5, 5, 7, 12, 14, 4, 13, 4, 7, 4, 13, 5, 7, 7, 14, 5, 6, 7, 2, 14, 5, 5, 13, 14, 7, 0, 14, 5, 5, 0, 2, 13, 7, 5, 7, 7, 5, 5, 5, 5, 0, 6, 3, 11, 2, 5, 2, 6, 5, 1, 14, 14, 1, 1, 14, 5, 0, 11, 5, 8, 7, 14, 13, 13, 6, 5, 8, 4, 4, 2, 5, 3, 5, 1, 12, 14, 12, 7, 5, 3, 14, 14, 6, 4, 12, 3, 0, 1, 7, 1, 5, 1, 14, 4, 7, 1, 0, 13, 7, 7, 5, 9, 1, 11, 5, 5, 5, 7, 1, 2, 0, 6, 14, 8, 14, 14, 0, 3, 14, 13, 7, 5, 14, 6, 11

In [None]:
labels = data.target
print(labels[:15])
print(data.target_names)
#print(document_topic_distribution[:15])

# Sum the topic distributions of all documents per newsgroup category.
# The accumulators must be numpy arrays: with the previous list initialiser
# (`[15]`), `+=` extended the list instead of adding element-wise, leaving 15
# at position 0 so that argmax always reported topic 0.
n_model_topics = document_topic_distribution.shape[1]
topic_coherence = {name: np.zeros(n_model_topics) for name in data.target_names}
for label, single_distribution in zip(labels, document_topic_distribution):
    topic_coherence[data.target_names[label]] += single_distribution
#print(topic_coherence)

# Report, for each category, the topic that received the largest total weight.
for category, topic_mass in topic_coherence.items():
    topic_id = np.argmax(topic_mass)
    print(f"{category}: Topic #{topic_id} ({', '.join(topic_top_words[topic_id])})")

# Correlated Model


In [None]:
import numpy as np
import scipy.special as sps

In [None]:
pip install pyvis


In [None]:
pip install tomotopy


In [None]:
import tomotopy as tp
import nltk
from nltk.corpus import stopwords
import re
from sklearn.datasets import fetch_20newsgroups
from pyvis.network import Network

In [None]:
#porter_stemmer = nltk.PorterStemmer().stem
# Matches tokens made up only of lowercase a-z letters, at least 4 of them.
pat = re.compile('^[a-z]{4,}$')
#corpus = tp.utils.Corpus(
 #      tokenizer=tp.utils.SimpleTokenizer(),
  #    stopwords=lambda x: x in stopwords_list or not pat.match(x))
# The stopwords callable returns True for every token that does NOT match `pat`
# (presumably those tokens are excluded by tomotopy - confirm against its docs).
corpus = tp.utils.Corpus( stopwords=lambda x:  not pat.match(x))
# NOTE: lemmatized_documents was extended in place by the hLDA cell when
# HIERARCHICAL_LDA was enabled, so the repeated subject/keyword tokens are
# part of the documents added here.
for document in lemmatized_documents:
    corpus.add_doc(document)

#model = tp.LDAModel(k=5, corpus=corpus)
#model.train(100)
# Save the preprocessed corpus to disk for reuse.
corpus.save('preprocessed_20news.cps')

In [None]:
# Correlated topic model with 20 topics built on the corpus above.
mdl = tp.CTModel(min_df=5, k=20, corpus=corpus)
# Zero training iterations - presumably only initialises the model so the
# statistics printed in the next cell are available; confirm against tomotopy docs.
mdl.train(0)

In [None]:
#mdl.num_beta_sample = 5
# Basic size statistics of the model: document count, used vocabulary size and
# total word count, followed by the words reported in removed_top_words.
print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
   len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)


In [None]:
# Train in fixed-size steps and log the per-word log-likelihood before each
# step, labelled with the number of iterations completed so far. Previously the
# loop index (0..9) was printed as the iteration and the final line reported a
# hard-coded 320, although 10 steps of 10 iterations (100 in total) were run.
TRAIN_ITERATIONS = 100
TRAIN_STEP = 10
for i in range(0, TRAIN_ITERATIONS, TRAIN_STEP):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(TRAIN_STEP)
print('Iteration: {:04}, LL per word: {:.4}'.format(TRAIN_ITERATIONS, mdl.ll_per_word))

In [None]:
# Print tomotopy's summary report for the trained model.
mdl.summary()

In [None]:
# Extract label candidates from the model, then score them per topic.
extractor = tp.label.PMIExtractor( max_len=10, max_cand=100)
cands = extractor.extract(mdl)
labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2)
# For every topic: its 5 best labels, then its 10 top words with their probabilities.
for k in range(mdl.k):
    print("== Topic #{} ==".format(k))
    print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
    for word, prob in mdl.get_topic_words(k, top_n=10):
        print(word, prob, sep='\t')
    print()

In [None]:
# NOTE(review): bare attribute access with no call, and `extract` is not used
# anywhere else in this notebook - confirm that CTModel exposes it; otherwise
# this cell raises AttributeError and can be dropped.
mdl.extract

In [None]:
# Correlation values for topic 8 (presumably against every other topic - confirm with tomotopy docs).
mdl.get_correlations(topic_id=8)

In [None]:
import seaborn as sns

# Heatmap of the correlations between all topics of the correlated topic model.
corr = mdl.get_correlations()
no_mask = np.zeros_like(corr, dtype=bool)  # all False: no cell is hidden
palette = sns.diverging_palette(220, 10, as_cmap=True)

f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, mask=no_mask, cmap=palette, square=True, ax=ax)