In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup
import spacy
from spacy_cleaner import processing, Cleaner
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel


def split_documents_by_words(documents, max_words=512):
    """
    Split documents if one document's word count is over than max_words.

    Args:
        documents (list): List of documents as strings.
        max_words (int): Maximum number of words for each document.

    Returns:
        list: List of split documents.
    """
    split_documents = []
    for doc in documents:
        words = doc.split()
        num_words = len(words)
        if num_words <= max_words:
            split_documents.append(doc)
        else:
            # Split document into segments of max_words
            num_segments = num_words // max_words
            for i in range(num_segments + 1):
                start_idx = i * max_words
                end_idx = (i + 1) * max_words
                if ' '.join(words[start_idx:end_idx]) != '' or ' '.join(words[start_idx:end_idx]) != ' ':
                    split_documents.append(' '.join(words[start_idx:end_idx]))
    return split_documents

df = pd.read_json('/home/yy2046/Workspace/DCEE2023/datasets/reddit/subreddit_posts_updated.json')
df.drop_duplicates(subset=['title', 'selftext'], inplace=True)
data = [row.title + ' ' + str(row.selftext) for index, row in df.iterrows()]

''' preprocess '''
cleaned_data = []
model = spacy.load("en_core_web_sm")
cleaner = Cleaner( 
    model,
    processing.remove_stopword_token,
    processing.remove_punctuation_token,
    processing.remove_email_token,
    processing.remove_url_token,
    processing.mutate_lemma_token,
    
)

for html_text in data:
    soup = BeautifulSoup(html_text, 'html.parser')
    soup_text = soup.get_text().lower()
    cleaned_data.append(soup_text)
# print(cleaned_data[0])
print('spaCy preprocess start!')
cleaned_data = cleaner.clean(cleaned_data)
# print(cleaned_data[0])
print('spaCy preprocess done!')

input_data = split_documents_by_words(cleaned_data, max_words=512)
input_tokenized_data = [[token.text for token in model(doc)] for doc in input_data]
input_dictionary = corpora.Dictionary(input_tokenized_data)
input_corpus = [input_dictionary.doc2bow(doc) for doc in input_tokenized_data]


co_tokenized_data = [[token.text for token in model(doc)] for doc in cleaned_data]
co_dictionary = corpora.Dictionary(co_tokenized_data)

n_topics_options = [2]
alpha_options = [0.5]
eta_options = [0.04]

# data structure used to store results
results = []

for n_topics in n_topics_options:
    for alpha in alpha_options:
        for eta in eta_options:
            print(f"Training LDA model with n_topics={n_topics}, alpha={alpha}, eta={eta}...")
            lda_model = LdaModel(corpus=input_corpus, id2word=input_dictionary, num_topics=n_topics, 
                                 alpha=alpha, eta=eta, random_state=42, per_word_topics=True)
            print(lda_model.show_topics())

            try:
                # calculate Coherence score using c_npmi
                coherence_model_lda = CoherenceModel(model=lda_model, texts=co_tokenized_data, 
                                                     dictionary=co_dictionary, coherence='c_npmi')
                coherence_lda = coherence_model_lda.get_coherence()
                print(f"Coherence (c_npmi) score for n_topics={n_topics}, alpha={alpha}, eta={eta}: {coherence_lda}")
            except Exception as e:
                print(f"Failed to calculate coherence for n_topics={n_topics}, alpha={alpha}, eta={eta}. Error: {e}")
                coherence_lda = -99

            # save the results of the current model
            results.append({
                'n_topics': n_topics,
                'alpha': alpha,
                'eta': eta,
                'coherence': coherence_lda
            })

# store as CSV file
results_df = pd.DataFrame(results)
results_df.to_csv('lda_gs_res_reddit.csv', index=False)

print("Optimization completed. Results are saved to lda_gs_res_reddit.csv")

# print best parameters
best_result = results_df.loc[results_df['coherence'].idxmax()]

print("\nBest Parameters:")
print(f"n_topics: {best_result['n_topics']}, alpha: {best_result['alpha']}, eta: {best_result['eta']}")
print(f"Best Coherence (c_npmi) score: {best_result['coherence']}")

  soup = BeautifulSoup(html_text, 'html.parser')


spaCy preprocess start!


Cleaning Progress: 100%|██████████| 708/708 [00:02<00:00, 281.27it/s]


spaCy preprocess done!
Training LDA model with n_topics=2, alpha=0.5, eta=0.04...
[(0, '0.037*"economy" + 0.036*"circular" + 0.008*"new" + 0.008*"waste" + 0.008*"product" + 0.006*"plastic" + 0.006*"help" + 0.006*"recycle" + 0.005*"sustainable" + 0.005*"need"'), (1, '0.037*"circular" + 0.025*"economy" + 0.011*"waste" + 0.010*"plastic" + 0.008*"business" + 0.006*"sustainable" + 0.006*"product" + 0.006*"new" + 0.006*"good" + 0.005*"help"')]
Coherence (c_npmi) score for n_topics=2, alpha=0.5, eta=0.04: -0.06706314036633146
Optimization completed. Results are saved to lda_gs_res_reddit.csv

Best Parameters:
n_topics: 2.0, alpha: 0.5, eta: 0.04
Best Coherence (c_npmi) score: -0.06706314036633146
