In [None]:
# Topic Modeling Demo for CHAOSS Augur Message Data

#  1. Environment and Dependencies
import os
import warnings

import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import spacy
from gensim.models import HdpModel
from nltk.corpus import stopwords
warnings.filterwarnings("ignore")

In [None]:
#  2. Database connection configuration
# Connection settings are read from environment variables so that real
# credentials never have to be written into the notebook. Each fallback is the
# value this cell previously hard-coded (localhost, user=augur, default port),
# so an unconfigured environment connects exactly as before.
conn = psycopg2.connect(
    dbname=os.environ.get("AUGUR_DB_NAME", "augur"),
    user=os.environ.get("AUGUR_DB_USER", "augur"),
    password=os.environ.get("AUGUR_DB_PASSWORD", "augur"),
    host=os.environ.get("AUGUR_DB_HOST", "localhost"),
    port=os.environ.get("AUGUR_DB_PORT", "5432"),
)

In [None]:
#  3. Data extraction: get messages for one repository from the message table
# The query inputs are named constants passed as bound parameters, so the
# notebook can be pointed at another repository or period without editing the
# SQL text itself.
REPO_ID = 24441
MIN_MSG_LENGTH = 30           # keep only messages strictly longer than this
PERIOD_START = "2021-08-03"   # inclusive lower bound
PERIOD_END = "2024-01-01"     # exclusive upper bound: covers all of 2023-12-31

# A half-open range replaces BETWEEN: msg_timestamp carries a time of day, and
# BETWEEN ... AND '2023-12-31' stops at midnight, silently dropping every
# message posted later on the final day.
sql = """
SELECT msg_id, msg_text, msg_timestamp
FROM augur_data.message
WHERE repo_id = %(repo_id)s
  AND LENGTH(msg_text) > %(min_length)s
  AND msg_timestamp >= %(period_start)s
  AND msg_timestamp < %(period_end)s
"""

messages_df = pd.read_sql_query(
    sql,
    conn,
    params={
        "repo_id": REPO_ID,
        "min_length": MIN_MSG_LENGTH,
        "period_start": PERIOD_START,
        "period_end": PERIOD_END,
    },
)
print(f"Total messages retrieved: {len(messages_df)}")

Total messages retrieved: 43


In [None]:
#  4. Text preprocessing: spaCy tokenization + stopword removal
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))
txts = list(messages_df['msg_text'].astype(str))


def preprocess(text):
    """Turn one message into a list of lemmas.

    The text is lower-cased and run through the spaCy pipeline. A token's
    lemma is kept only if the token is purely alphabetic, the lemma is not an
    English stopword, and the token is longer than two characters.
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if not tok.is_alpha:
            continue
        lemma = tok.lemma_
        if lemma in stop_words:
            continue
        if len(tok) <= 2:
            continue
        kept_lemmas.append(lemma)
    return kept_lemmas


# One token list per message, in the same order as txts.
texts_tokenized = list(map(preprocess, txts))

In [None]:
#  5. Build Gensim dictionary and corpus
# id2word maps each distinct lemma to an integer id; corpus holds one
# bag-of-words (list of (token_id, count) pairs) per message.
id2word = corpora.Dictionary(texts_tokenized)
corpus = [id2word.doc2bow(text) for text in texts_tokenized]

#  6. Train HDP model to automatically infer the number of topics
# The model is initialised randomly, so a fixed seed is passed to make the
# learned topics repeatable across "Restart Kernel -> Run All".
HDP_RANDOM_STATE = 42
print("\nTraining HDP model...")
hdp_model = HdpModel(corpus=corpus, id2word=id2word, random_state=HDP_RANDOM_STATE)


Training HDP model...


In [None]:
#  7. Print the top topics and keywords
# formatted=False yields (topic_id, [(word, weight), ...]) pairs rather than
# pre-rendered strings, so the keywords can be pulled out on their own.
hdp_topics = hdp_model.show_topics(
    num_topics=10,
    num_words=10,
    formatted=False,
)

print("\nTop 10 topics and keywords:")
for topic_id, weighted_terms in hdp_topics:
    keywords = [term for term, _weight in weighted_terms]
    print(f"\nTopic {topic_id}")
    print(keywords)


Top 10 topics and keywords:

Topic 0
['first', 'operate', 'pull', 'blob', 'review', 'locally', 'write', 'full', 'code', 'approval']

Topic 1
['retweet', 'event', 'detail', 'please', 'guide', 'new', 'check', 'complete', 'octo', 'solution']

Topic 2
['approve', 'trigger', 'open', 'btw', 'link', 'share', 'add', 'approval', 'issue', 'tweet']

Topic 3
['michaelclifford', 'assign', 'inactivity', 'seem', 'approve', 'full', 'wonder', 'since', 'documentation', 'requirement']

Topic 4
['great', 'additional', 'well', 'add', 'review', 'ready', 'thank', 'open', 'requirement', 'lifecycle']

Topic 5
['qualify', 'review', 'inactivity', 'full', 'opinion', 'proof', 'hence', 'process', 'oindrillac', 'perfect']

Topic 6
['plan', 'bot', 'detail', 'complete', 'quote', 'another', 'lifecycle', 'add', 'blob', 'approvalnotifier']

Topic 7
['version', 'complete', 'bypass', 'well', 'safe', 'please', 'stale', 'operate', 'approval', 'remove']

Topic 8
['cancel', 'michaelclifford', 'solution', 'idea', 'prow', 'desc

In [None]:
#  8. Visualize using pyLDAvis
VIS_HTML_PATH = "notebooks/topic_modeling/output/hdp_topics_augur.html"

print("\nPreparing visualization...")
vis_data = gensimvis.prepare(hdp_model, corpus, id2word)

# Save as HTML for later viewing. The output folder is created first so the
# save does not fail when the directory does not exist yet.
os.makedirs(os.path.dirname(VIS_HTML_PATH), exist_ok=True)
pyLDAvis.save_html(vis_data, VIS_HTML_PATH)
print("✔️ Visualization saved as hdp_topics_augur.html")

# Keep this as the cell's last expression: pyLDAvis.display() only returns the
# renderable object, and a notebook renders just the value of the final
# expression. Called mid-cell, its result is discarded and nothing is shown.
pyLDAvis.display(vis_data)


Preparing visualization...
✔️ Visualization saved as hdp_topics_augur.html


In [None]:
# Assign dominant topic to each message
SAMPLES_PER_TOPIC = 2
SAMPLE_RANDOM_STATE = 42   # fixed so the same example messages are shown on every run


def _dominant_topic(topic_weights):
    """Return the id of the highest-weighted topic, or -1 if the list is empty.

    topic_weights is a list of (topic_id, weight) pairs as produced by
    hdp_model[doc].
    """
    if not topic_weights:
        return -1
    return max(topic_weights, key=lambda pair: pair[1])[0]


# hdp_model[doc] runs inference for the document, so it is evaluated only once
# per message instead of once for the emptiness check and again for max().
doc_topics = [_dominant_topic(hdp_model[doc]) for doc in corpus]
messages_df['dominant_topic'] = doc_topics

# Show up to SAMPLES_PER_TOPIC randomly chosen comments for each topic.
# Shuffling once with a seed and taking the head of every group gives a
# reproducible sample, copes with topics that have fewer messages than
# requested, and avoids relying on groupby.apply keeping the grouping column
# (which pandas has deprecated) for the column selection below.
representative_samples = (
    messages_df
    .sample(frac=1, random_state=SAMPLE_RANDOM_STATE)
    .groupby('dominant_topic')
    .head(SAMPLES_PER_TOPIC)
    .sort_values('dominant_topic', kind='stable')
    .reset_index(drop=True)
)
representative_samples[['dominant_topic', 'msg_timestamp', 'msg_text']]

Unnamed: 0,dominant_topic,msg_timestamp,msg_text
0,0,2021-11-04 13:33:03,[APPROVALNOTIFIER] This PR is **APPROVED**\n\n...
1,0,2022-04-22 20:18:18,[APPROVALNOTIFIER] This PR is **APPROVED**\n\n...
2,1,2022-03-11 20:12:43,/lgtm 👍 \r\n/hold\r\n@schwesig do you want to ...
3,2,2021-10-11 15:28:44,> What about we also link to the https://githu...
4,2,2022-01-28 14:13:27,for documentation purposes - https://github.co...
5,4,2022-03-11 22:43:14,"@oindrillac great tweet, thank you for making ..."
6,15,2021-10-16 15:01:08,Issues go stale after 90d of inactivity.\nMark...
7,15,2021-10-26 18:04:43,Issues go stale after 90d of inactivity.\nMark...
8,128,2022-03-11 22:07:33,@oindrillac Please change and do what is neede...
