In [39]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
from hdbscan import HDBSCAN
from umap import UMAP

In [40]:
# English stop words from NLTK. The corpus is a separate download, so on a fresh
# environment the first lookup raises LookupError; fetch it once and retry so the
# notebook survives "Restart Kernel -> Run All" without a manual setup step.
try:
    eng_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    eng_stopwords = stopwords.words('english')


In [41]:
# Hand-maintained Tagalog / Taglish stop words, one whitespace-separated token per entry.
# Splitting a text block keeps the list easy to extend: add a word anywhere below.
# The final row (sa, ng, ang, yan) was appended later because those common Tagalog
# words were still appearing (presumably in the topic keywords).
tagalog_stopwords = """
    ako ikaw siya sila kami tayo kayo nila namin natin
    ko mo niya kanila atin amin iyo inyo ito iyan iyon
    dito diyan doon ganito ganyan ganun ganon mga
    na pa din rin nga eh ba daw pala naman kasi
    sana talaga lang yata pati muna lamang lalo kung
    parang dapat naka kay para habang basta pero at o
    hindi oo opo syempre wala meron may
    ngayon kahapon kanina mamaya bukas
    sobrang grabe super medyo
    uy hala naku sus tsk ay oooh ah
    haha hahaha hehe lol amp omg lmao huhu
    sa ng ang yan
""".split()

In [42]:
# Merge the English and Tagalog lists into one de-duplicated stop-word list,
# lower-casing and trimming each entry first. Order of the result is arbitrary (set-based).
all_stopwords = list({
    entry.lower().strip()
    for entry in (*eng_stopwords, *tagalog_stopwords)
})

# Bag-of-words model handed to BERTopic for building topic keywords.
# NOTE(review): BERTopic presumably fits this vectorizer on one concatenated document
# per topic, in which case min_df / max_df count topics rather than comments - confirm
# these thresholds still make sense when only a handful of topics is found.
vectorizer_model = CountVectorizer(
    stop_words=all_stopwords,
    min_df=2,
    max_df=0.8,
)

In [43]:
# --- Load the corpus ---------------------------------------------------------
# pandas reads empty CSV fields back as NaN (a float), and the topic model needs
# every document to be a string. Rows without a comment are therefore dropped here,
# on the frame itself, so `document` and `comments` stay aligned row-for-row
# (and therefore also with `topics` / `probs` below).
document = (
    pd.read_csv('../../reddit-scraper/cleaned_reddit_comments.csv')
    .dropna(subset=['comment'])
    .reset_index(drop=True)
)
comments = document['comment'].astype(str).tolist()

# --- Model components --------------------------------------------------------
# UMAP is stochastic; fixing its seed makes the discovered topics repeatable
# across kernel restarts, so the printed results below can be reproduced.
RANDOM_STATE = 42

# min_cluster_size is the smallest group of comments that may form a topic;
# lowering it yields more (and smaller) topics.
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Dimensionality reduction applied to the sentence embeddings before clustering.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=RANDOM_STATE)

# Multilingual sentence encoder, chosen because the comments mix English and Tagalog.
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# NOTE(review): `language` is presumably ignored when an explicit embedding_model
# is supplied - confirm against the BERTopic docs; kept as-is to avoid changing behavior.
topic_model = BERTopic(
    embedding_model=sentence_model,
    language="english",
    calculate_probabilities=True,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
)

# --- Fit ---------------------------------------------------------------------
# topics: one topic id per comment (-1 marks outliers);
# probs: topic probabilities, requested via calculate_probabilities=True.
topics, probs = topic_model.fit_transform(comments)

print(topic_model.get_topic_info())
print(topic_model.get_topic(0))


   Topic  ...                                Representative_Docs
0     -1  ...  [guys trust me. dont park your money in Philip...
1      0  ...  [Gimmick nalng ng senate at congress yan. Sila...
2      1  ...  [Nasa The Hague yung isa., Yung nasa Hague, Na...

[3 rows x 5 columns]
[('kaya', np.float64(0.11211702947182937)), ('di', np.float64(0.10154881620880157)), ('government', np.float64(0.09049137618604863)), ('us', np.float64(0.08999822306882585)), ('people', np.float64(0.08378658835601732)), ('corrupt', np.float64(0.08378658835601732)), ('nag', np.float64(0.08378658835601732)), ('puro', np.float64(0.07723607896258103)), ('time', np.float64(0.07723607896258103)), ('salita', np.float64(0.07029967075967115))]




In [44]:
# Overview table: one row per topic with its auto-generated label
# (the 'Name' column is the topic id joined with its top words).
topic_info = topic_model.get_topic_info()
print(topic_info[['Topic', 'Name']])

# Detailed view: the (word, score) pairs behind every real topic.
# Topic -1 is skipped because it is the bucket usually holding outliers.
for topic_num in (tid for tid in topic_info['Topic'] if tid != -1):
    print(f"Topic {topic_num}: {topic_model.get_topic(topic_num)}")

   Topic                           Name
0     -1  -1_take_philippines_want_even
1      0        0_kaya_di_government_us
2      1       1_hague_nasa_duterte_isa
Topic 0: [('kaya', np.float64(0.11211702947182937)), ('di', np.float64(0.10154881620880157)), ('government', np.float64(0.09049137618604863)), ('us', np.float64(0.08999822306882585)), ('people', np.float64(0.08378658835601732)), ('corrupt', np.float64(0.08378658835601732)), ('nag', np.float64(0.08378658835601732)), ('puro', np.float64(0.07723607896258103)), ('time', np.float64(0.07723607896258103)), ('salita', np.float64(0.07029967075967115))]
Topic 1: [('hague', np.float64(0.4518012090523637)), ('nasa', np.float64(0.43329065098233)), ('duterte', np.float64(0.20011165913032355)), ('isa', np.float64(0.14472228897857536)), ('sure', np.float64(0.10867504669365012)), ('lead', np.float64(0.07644895671619938)), ('sina', np.float64(0.07644895671619938)), ('mean', np.float64(0.07644895671619938)), ('allies', np.float64(0.076448956716199

In [45]:
# Persist the fitted model to "bertopic_model" (relative to the notebook's working directory).
# NOTE(review): no serialization format is specified, so the library default is used -
# presumably pickle, which ties the file to this environment's package versions; confirm
# whether a portable format is preferable before sharing the artifact.
topic_model.save(path="bertopic_model")

