In [17]:
import pandas as pd
# Load the file to inspect its structure
posts = pd.read_csv("posts_table.csv")
posts.head()

Unnamed: 0,post_id,user_id,full_text,created_At,lang,repost_count,like_count,source
0,1907578844636393679,1810361594586738692,"This is nasty \n\nNASDAQ futures down -4.4%, S...",Wed Apr 02 23:40:12 +0000 2025,en,0,0,Twitter Web App
1,1907578044492968060,2736639061,Investors Panic as U.S. Stock Market Plummets ...,Wed Apr 02 23:37:01 +0000 2025,en,1,0,Twitter Web App
2,1907575218501198232,1850383929829945344,For all the people who have absolutely no conc...,Wed Apr 02 23:25:47 +0000 2025,en,0,2,Twitter for Android
3,1907574739964510255,449290925,As someone who is feeling JVL levels of schade...,Wed Apr 02 23:23:53 +0000 2025,en,0,0,Twitter for Android
4,1907570533836701721,23709151,The fuzziest math is #TrumpTariff Math.,Wed Apr 02 23:14:59 +0000 2025,en,0,0,Twitter for Android


In [18]:
import re

def clean_text_preprocessing1(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)     # Remove URLs
    text = re.sub(r"@\w+|#\w+", "", text)  # Removes mentions and hashtags
    text = re.sub(r"[^\x00-\x7F]+", "", text)               # Remove emojis (non-ASCII characters)
    text = re.sub(r"\d+", "", text)                         # Remove numerical characters
    return text

# Apply this to DataFrame 
posts['text_preprocessing1'] = posts['full_text'].apply(clean_text_preprocessing1)
posts['text_preprocessing1'].head()

0    This is nasty \n\nNASDAQ futures down -.%, S&a...
1    Investors Panic as U.S. Stock Market Plummets ...
2    For all the people who have absolutely no conc...
3    As someone who is feeling JVL levels of schade...
4                          The fuzziest math is  Math.
Name: text_preprocessing1, dtype: object

In [19]:
import string

def clean_text_preprocessing2(text):
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    return text

# Apply this to the result of step 1
posts['text_preprocessing_no_punctuation'] = posts['text_preprocessing1'].apply(clean_text_preprocessing2)
posts['text_preprocessing_no_punctuation'].head()

0    This is nasty \n\nNASDAQ futures down  SampP f...
1    Investors Panic as US Stock Market Plummets Ov...
2    For all the people who have absolutely no conc...
3    As someone who is feeling JVL levels of schade...
4                           The fuzziest math is  Math
Name: text_preprocessing_no_punctuation, dtype: object

In [20]:
# Convert texts to lowercase
posts['text_preprocessing_no_punctuation'] = \
posts['text_preprocessing_no_punctuation'].map(lambda x: x.lower())
posts['text_preprocessing_no_punctuation'].head()

0    this is nasty \n\nnasdaq futures down  sampp f...
1    investors panic as us stock market plummets ov...
2    for all the people who have absolutely no conc...
3    as someone who is feeling jvl levels of schade...
4                           the fuzziest math is  math
Name: text_preprocessing_no_punctuation, dtype: object

In [21]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_stopwords(text):
    words = text.split()
    filtered = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return " ".join(filtered)

# Apply to cleaned column
posts['text_no_stopwords'] = posts['text_preprocessing_no_punctuation'].apply(remove_stopwords)
posts['text_no_stopwords'].head()

0    nasty nasdaq futures sampp futures tariff news...
1    investors panic stock market plummets trumps t...
2    people absolutely concept tariff amp differs g...
3    feeling jvl levels schadenfreude let just say ...
4                                   fuzziest math math
Name: text_no_stopwords, dtype: object

In [22]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

# Download required resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
        
def lemmatize_with_pos(text):
    tokens = nltk.word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos)) 
        for word, pos in tagged_tokens
    ]
    return " ".join(lemmatized)
    
posts['text_lemmatized'] = posts['text_no_stopwords'].apply(lemmatize_with_pos)
posts['text_lemmatized'].head()

[nltk_data] Downloading package punkt to /Users/patnbe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/patnbe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/patnbe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/patnbe/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


0    nasty nasdaq future sampp future tariff news h...
1    investor panic stock market plummet trump tari...
2    people absolutely concept tariff amp differs g...
3    feel jvl level schadenfreude let just say happ...
4                                      fuzzy math math
Name: text_lemmatized, dtype: object

In [31]:
posts.head()

Unnamed: 0,post_id,user_id,full_text,created_At,lang,repost_count,like_count,source,text_preprocessing1,text_preprocessing_no_punctuation,text_no_stopwords,text_lemmatized
0,1907578844636393679,1810361594586738692,"This is nasty \n\nNASDAQ futures down -4.4%, S...",Wed Apr 02 23:40:12 +0000 2025,en,0,0,Twitter Web App,"This is nasty \n\nNASDAQ futures down -.%, S&a...",this is nasty \n\nnasdaq futures down sampp f...,nasty nasdaq futures sampp futures tariff news...,nasty nasdaq future sampp future tariff news h...
1,1907578044492968060,2736639061,Investors Panic as U.S. Stock Market Plummets ...,Wed Apr 02 23:37:01 +0000 2025,en,1,0,Twitter Web App,Investors Panic as U.S. Stock Market Plummets ...,investors panic as us stock market plummets ov...,investors panic stock market plummets trumps t...,investor panic stock market plummet trump tari...
2,1907575218501198232,1850383929829945344,For all the people who have absolutely no conc...,Wed Apr 02 23:25:47 +0000 2025,en,0,2,Twitter for Android,For all the people who have absolutely no conc...,for all the people who have absolutely no conc...,people absolutely concept tariff amp differs g...,people absolutely concept tariff amp differs g...
3,1907574739964510255,449290925,As someone who is feeling JVL levels of schade...,Wed Apr 02 23:23:53 +0000 2025,en,0,0,Twitter for Android,As someone who is feeling JVL levels of schade...,as someone who is feeling jvl levels of schade...,feeling jvl levels schadenfreude let just say ...,feel jvl level schadenfreude let just say happ...
4,1907570533836701721,23709151,The fuzziest math is #TrumpTariff Math.,Wed Apr 02 23:14:59 +0000 2025,en,0,0,Twitter for Android,The fuzziest math is Math.,the fuzziest math is math,fuzziest math math,fuzzy math math


In [33]:
# Export lemmatized_data.csv file for using in BERTopic model that is performed in another environment due to the conflict in versions of some libraries used in both models
posts.to_csv('lemmatized_data.csv', index=False)

In [23]:
# Tokenize text
tokenized_text = posts['text_lemmatized'].apply(lambda x: x.split())

In [24]:
from gensim import corpora

# Create Dictionary
id2word = corpora.Dictionary(tokenized_text)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in tokenized_text]

In [25]:
from gensim.models.ldamodel import LdaModel

# Train LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10, # number of topics
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

In [26]:
# Display topics (word distribution in each topic)
topics = lda_model.print_topics()
for idx, topic in topics:
    print(f"Topic {idx + 1}: {topic}")

Topic 1: 0.034*"start" + 0.027*"economic" + 0.025*"dollar" + 0.023*"low" + 0.022*"threat" + 0.021*"global" + 0.018*"impact" + 0.017*"effect" + 0.017*"growth" + 0.016*"break"
Topic 2: 0.053*"trade" + 0.039*"deal" + 0.037*"say" + 0.037*"china" + 0.035*"tariff" + 0.029*"rate" + 0.027*"president" + 0.026*"good" + 0.023*"trump" + 0.019*"new"
Topic 3: 0.094*"cut" + 0.052*"japan" + 0.046*"thats" + 0.034*"im" + 0.028*"administration" + 0.023*"feed" + 0.020*"russia" + 0.019*"ask" + 0.017*"biden" + 0.017*"industry"
Topic 4: 0.080*"iran" + 0.051*"try" + 0.047*"news" + 0.030*"t" + 0.025*"ukraine" + 0.024*"grow" + 0.022*"peace" + 0.021*"late" + 0.020*"figure" + 0.019*"national"
Topic 5: 0.079*"high" + 0.044*"end" + 0.042*"big" + 0.040*"powell" + 0.031*"tell" + 0.026*"pause" + 0.022*"call" + 0.021*"rise" + 0.020*"vote" + 0.019*"lot"
Topic 6: 0.076*"he" + 0.053*"trillion" + 0.052*"lose" + 0.026*"loss" + 0.026*"reserve" + 0.023*"ally" + 0.021*"sampp" + 0.019*"pas" + 0.019*"rice" + 0.019*"ago"
Topic 7:

In [27]:
# Display topic distribution in each document
lda_model.get_document_topics(corpus)
for i, doc in enumerate(corpus):
    topics = lda_model.get_document_topics(doc)
    print(f"Document {i + 1} Topics: {topics}")

Document 1 Topics: [(0, 0.044554777), (1, 0.30959344), (2, 0.02788931), (3, 0.031808753), (4, 0.030124329), (5, 0.06892794), (6, 0.24740621), (7, 0.06260484), (8, 0.12750651), (9, 0.049583893)]
Document 2 Topics: [(0, 0.0543376), (1, 0.17998254), (2, 0.03401166), (3, 0.016807903), (4, 0.05870639), (5, 0.018110251), (6, 0.32184753), (7, 0.076353885), (8, 0.20133828), (9, 0.038503982)]
Document 3 Topics: [(0, 0.053178184), (1, 0.15226021), (2, 0.033287417), (3, 0.016451), (4, 0.03595502), (5, 0.017725855), (6, 0.31914508), (7, 0.09622982), (8, 0.23808326), (9, 0.037684117)]
Document 4 Topics: [(0, 0.07805542), (1, 0.17993605), (2, 0.03479047), (3, 0.017194966), (4, 0.08253545), (5, 0.018527657), (6, 0.2901973), (7, 0.078087546), (8, 0.18128935), (9, 0.039385736)]
Document 5 Topics: [(0, 0.062592894), (1, 0.17735578), (2, 0.03918095), (3, 0.07001231), (4, 0.04232083), (5, 0.020864485), (6, 0.30155823), (7, 0.0879493), (8, 0.15380912), (9, 0.04435609)]
Document 6 Topics: [(0, 0.058219004),

In [28]:
from gensim.models.coherencemodel import CoherenceModel

# Compute c_v coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_text, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print(f"C_v coherence score: {coherence_lda:.4f}") #5 topics =0.3082

C_v coherence score: 0.3179


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualisation
lda_vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_vis)
pyLDAvis.show(lda_vis, local=False, open_browser=True)

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [20/Aug/2025 02:08:14] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [20/Aug/2025 02:08:27] "GET / HTTP/1.1" 200 -


In [33]:
# pyLDAvis.show(lda_vis, local=False, open_browser=True)