In [None]:
import json
import re
import html
import os
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk

# Ensure the NLTK data used by this notebook is available locally.
# Each entry pairs the lookup path given to nltk.data.find() with the package
# id given to nltk.download(). Both must name the same resource: if the check
# looks for one package while the download fetches another, the check never
# succeeds and the download is re-attempted on every run.
_REQUIRED_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    # NOTE(review): NLTK releases that load the "_eng" tagger below appear to
    # load "punkt_tab" for word_tokenize as well - confirm against the
    # installed NLTK version and drop this entry if it is not needed.
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('taggers/averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger_eng'),
)

for _resource_path, _package_id in _REQUIRED_NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:  # raised by nltk.data.find when the resource is missing
        print(f"Downloading NLTK resource: {_package_id}")
        nltk.download(_package_id)


# Single WordNet lemmatizer instance shared by the tokenization helpers.
lemmatizer = WordNetLemmatizer()

# Every data location is expressed relative to the project root, which is
# itself given relative to the directory the notebook runs from.
project_root = '../../'

# Input directory (filtered posts) and the two output directories.
filtered_posts_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'Filtered Posts')
lda_nmf_data_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'LDA_NMF_Data')
bertopic_data_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'BERTopic_Data')

# Create the output directories when they are not there yet.
for _output_dir in (lda_nmf_data_dir, bertopic_data_dir):
    if not os.path.exists(_output_dir):
        os.makedirs(_output_dir, exist_ok=True)

print(f"Filtered posts directory: {filtered_posts_dir}")
print(f"LDA/NMF Data directory: {lda_nmf_data_dir}")
print(f"BERTopic Data directory: {bertopic_data_dir}")


In [2]:
# NLTK's stock English stop words.
default_stopwords = set(stopwords.words('english'))

# Extra stop words for this corpus, grouped by the kind of noise they remove.
# Entries are written in lemma form.
additional_stopwords = {
    # Generic conversational noise
    'also', 'anyone', 'back', 'bad', 'check', 'come', 'content', 'could',
    'everyone', 'even', 'every', 'feel', 'find', 'get', 'give', 'go', 'good',
    'great', 'group', 'guy', 'important', 'keep', 'know', 'let', 'like',
    'look', 'lot', 'make', 'many', 'may', 'much', 'must', 'never', 'new',
    'number', 'one', 'part', 'people', 'really', 'result', 'right', 'say',
    'see', 'seem', 'something', 'still', 'thing', 'think', 'try', 'use',
    'video', 'want', 'way', 'well', 'would',

    # Reddit meta noise
    'click', 'comment', 'disclaimer', 'discussion', 'edit', 'general',
    'information', 'karma', 'link', 'list', 'meta', 'mod', 'op', 'poll',
    'post', 'prior', 'read', 'reddit', 'rule', 'share', 'sub', 'subreddit',
    'thread', 'topic', 'user', 'welcome',
    'please',  # listed on its own: less descriptive than the rest of the group

    # Generic time words
    'currently', 'daily', 'day', 'et', 'friday', 'january', 'last', 'month',
    'monthly', 'pm', 'recent', 'since', 'time', 'today', 'tomorrow', 'week',
    'year', 'yesterday',

    # Common actions, verbs, nouns
    'account', 'around', 'article', 'data', 'different', 'example', 'move',
    'need', 'pay', 'project', 'start', 'take', 'work',

    # Vague/common descriptors
    'average', 'big', 'high', 'low', 'max', 'medium',

    # Conversational noise
    'answer', 'ask', 'help', 'hi', 'live', 'open', 'question', 'talk', 'tell',
    'you',

    # Artifacts
    'com', 'dot', 'free', 'inc',

    # Domain words are deliberately NOT filtered for now:
    # 'stock', 'market', 'price', 'company', 'trading', 'shares', 'buy', 'sell', 'crypto'
}

# Stop words used by the LDA/NMF pipeline: NLTK defaults plus the extras above
# (negations are filtered too whenever either source lists them).
custom_stopwords = default_stopwords | additional_stopwords


In [3]:
def remove_markdown(text):
    """Strip common Markdown formatting from *text*, keeping the inner content.

    Handles horizontal rules, images, bold/italic emphasis, headers,
    blockquotes, links (link text is kept), inline code and code fences,
    strikethrough, and list markers.

    Newlines are kept as line separators and the result is not stripped;
    the caller is expected to normalise whitespace afterwards. Several of the
    line-anchored patterns can absorb adjacent blank lines, so the number of
    consecutive newlines is not guaranteed to be preserved.

    Args:
        text: Raw Markdown-flavoured string.

    Returns:
        The string with Markdown markers removed.
    """
    # Horizontal rules (---, ***, ___) must go first. The emphasis patterns
    # below would otherwise consume two of the three characters of "***" or
    # "___" and leave a stray "*" or "_" that this rule no longer matches.
    text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)

    # Images are dropped entirely and must go before links; otherwise
    # "![alt](src)" is rewritten by the link pattern and leaves "!alt" behind.
    text = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', text)

    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)            # Bold (**text**)
    text = re.sub(r'__(.*?)__', r'\1', text)                # Bold (__text__)
    text = re.sub(r'\*(.*?)\*', r'\1', text)                # Italic (*text*)
    text = re.sub(r'_(.*?)_', r'\1', text)                  # Italic (_text_) - may affect underscores in words
    text = re.sub(r'^\s*#+\s*(.*?)\s*#*\s*$', r'\1', text, flags=re.MULTILINE)  # Headers (# Header)
    text = re.sub(r'^\s*>\s?(.*)', r'\1', text, flags=re.MULTILINE)  # Blockquotes (> quote)
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)         # Links: [text](url) -> text
    text = re.sub(r'`{1,3}(.*?)`{1,3}', r'\1', text, flags=re.DOTALL)  # Code ticks/fences
    text = re.sub(r'~~(.*?)~~', r'\1', text)                # Strikethrough (~~text~~)
    text = re.sub(r'^\s*[\*\-\+]\s+', '', text, flags=re.MULTILINE)  # Unordered list markers (*, -, +)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)     # Ordered list markers (1., 2.)

    # No strip() here: final whitespace normalisation is the caller's job.
    return text

def replace_jargon(text, jargon_dict):
    """Replace each phrase in *jargon_dict* with its standardized form.

    Matching is case-insensitive and anchored on word boundaries, so a phrase
    is not replaced when it only appears inside a longer word. Phrases are
    applied in the dict's iteration order, and a later phrase may match text
    produced by an earlier replacement.

    Args:
        text: The string to rewrite.
        jargon_dict: Mapping of phrase -> replacement string.

    Returns:
        The text with every matched phrase replaced.
    """
    for phrase, replacement in jargon_dict.items():
        # An empty phrase would compile to r'\b\b' and insert the replacement
        # at every word boundary, so skip it.
        if not phrase:
            continue
        pattern = re.compile(r'\b' + re.escape(phrase) + r'\b', re.IGNORECASE)
        # Pass a callable so the replacement is inserted literally. A plain
        # string would be parsed as a regex template, where a backslash
        # raises re.error or is read as a group reference.
        text = pattern.sub(lambda _match, _value=replacement: _value, text)
    return text

def initial_clean(text):
    """Run the cleaning steps shared by the preprocessing pipelines.

    In order: strip HTML tags, unescape HTML entities, drop r/ and u/
    references, remove Markdown, remove Markdown tables, remove URLs, expand
    contractions, and collapse all whitespace into single spaces.
    "U.S." / "US" are swapped for a placeholder while contractions are
    expanded and restored as "U.S." at the end.

    Args:
        text: Raw post text.

    Returns:
        The cleaned, single-line, stripped string.
    """
    placeholder = "__US_PLACEHOLDER__"

    # HTML: drop tags first, then decode entities. Decoding runs twice so
    # double-encoded entities (e.g. "&amp;amp;") are fully resolved.
    cleaned = re.sub(r'<.*?>', '', text)
    for _ in range(2):
        cleaned = html.unescape(cleaned)

    # Reddit references such as r/name and u/name.
    cleaned = re.sub(r'\b[/\\]?[ur]/\w+\b', '', cleaned)

    # Inline and block Markdown formatting.
    cleaned = remove_markdown(cleaned)

    # Markdown tables: separator rows first, then rows holding two or more pipes.
    for table_pattern in (r'^\s*[|: -]+\|?\s*$\n?', r'^.*\|(?:.*\|)+.*$\n?'):
        cleaned = re.sub(table_pattern, '', cleaned, flags=re.MULTILINE)

    # URLs.
    cleaned = re.sub(r'http\S+|www\.\S+', '', cleaned)

    # Shield "U.S." (any case) and upper-case "US" behind a placeholder so the
    # contraction expansion below leaves them alone.
    cleaned = re.sub(r'U\.S\.', placeholder, cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\bUS\b', placeholder, cleaned)

    # Best effort: if contraction expansion fails, keep the text as it is.
    try:
        cleaned = contractions.fix(cleaned)
    except Exception:
        pass

    # Turn real newlines and literal backslash-n sequences into spaces, then
    # squeeze runs of whitespace.
    cleaned = re.sub(r'\\n+|\n+', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Put "U.S." back in place of the placeholder.
    return cleaned.replace(placeholder, "U.S.")


In [4]:
# Map Treebank POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag is inspected: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing instead of indexing keeps an empty tag on the noun fallback.
    return pos_by_initial.get(treebank_tag[:1], wordnet.NOUN)

def resolve_us_token(token, pos):
    """Return 'united_states' for a proper-noun 'US' / 'U.S.' / 'U.S' token.

    The token is compared case-insensitively and is only rewritten when its
    POS tag is NNP or NNPS; every other token is returned untouched.
    """
    is_proper_noun = pos in ("NNP", "NNPS")
    if is_proper_noun and token.lower() in ("us", "u.s.", "u.s"):
        return "united_states"
    return token

def tokenize_lemmatize_lowercase(text):
    """Tokenize *text* and return lowercased, lemmatized tokens.

    Steps: word-tokenize, POS-tag, map proper-noun "US"/"U.S." tokens to
    "united_states", lowercase, lemmatize with the WordNet POS derived from
    the tag, strip leading apostrophes, and drop tokens that end up empty.

    Args:
        text: Cleaned text to tokenize.

    Returns:
        A list of processed token strings.
    """
    tokens = word_tokenize(text)
    # Tag the tokens with their original casing; lowercasing happens afterwards.
    tagged = pos_tag(tokens)

    final_tokens = []
    for token, pos in tagged:
        # Lowercase BEFORE lemmatizing: WordNet lookup is case-sensitive, so a
        # capitalised token such as "Stocks" would otherwise come back
        # unchanged and end up as "stocks" next to the lemma "stock".
        normalized = resolve_us_token(token, pos).lower()
        lemma = lemmatizer.lemmatize(normalized, get_wordnet_pos(pos)).lower()

        # Only leading apostrophes are stripped; trailing ones are kept.
        lemma = lemma.lstrip("'")
        if lemma:
            final_tokens.append(lemma)

    return final_tokens


In [5]:
def remove_stopwords_topic_model(tokens, stopwords_set):
    """Filter a token list down to the tokens worth keeping for LDA/NMF.

    A token is dropped when it is a stop word, a known punctuation artifact,
    a number (optionally signed, with '.' or ',' as thousands or decimal
    separators), a single character, or contains no word character.

    Args:
        tokens: Iterable of token strings.
        stopwords_set: Collection of stop words to exclude.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    numeric_re = re.compile(r'^\s*[+-]?(\d{1,3}(?:[.,]\d{3})*|\d+)(?:[.,]\d+)?\s*$')
    word_char_re = re.compile(r'\w')
    artifacts = {"''", "'s", "``", "--", "-", "\\\\-", "...", "`", "p."}

    return [
        tok
        for tok in tokens
        if tok not in stopwords_set             # stop words
        and tok not in artifacts                # tokenizer punctuation leftovers
        and not numeric_re.fullmatch(tok)       # purely numeric tokens
        and len(tok) > 1                        # single characters
        and word_char_re.search(tok)            # must hold at least one word character
    ]

def preprocess_for_bertopic(text):
    """Produce cleaned running text for BERTopic.

    Applies initial_clean only, then turns slashes into spaces and collapses
    whitespace. No tokenization or lemmatization is applied, so the text
    remains a single string.
    """
    without_slashes = initial_clean(text).replace('/', ' ')
    return re.sub(r'\s+', ' ', without_slashes).strip()

def preprocess_for_lda_nmf(text, stopwords_set):
    """Produce the token list used by the LDA/NMF models.

    Pipeline: initial_clean -> slashes to spaces and whitespace collapse ->
    tokenize, lemmatize and lowercase -> topic-model stop word filter.
    Jargon replacement and financial token merging are intentionally skipped.

    Args:
        text: Raw post text.
        stopwords_set: Stop words handed to remove_stopwords_topic_model.

    Returns:
        The list of filtered tokens.
    """
    base_text = initial_clean(text)

    # Split slash-joined words apart before tokenizing.
    tokenizer_input = re.sub(r'\s+', ' ', base_text.replace('/', ' ')).strip()

    tokens = tokenize_lemmatize_lowercase(tokenizer_input)
    return remove_stopwords_topic_model(tokens, stopwords_set)


In [6]:
def process_all_files():
    """Preprocess every 'filtered_r_*.json' file for both topic-model pipelines.

    Each input file is loaded as JSON and iterated as a sequence of posts.
    For every post that has an 'id', the title and selftext are joined and
    run through preprocess_for_bertopic and preprocess_for_lda_nmf (with
    custom_stopwords). The results are written as JSON to the BERTopic and
    LDA/NMF output directories, with the input's leading 'filtered_' swapped
    for 'bertopic_' / 'lda_nmf_'.

    Errors while handling one file are reported and the remaining files are
    still processed. Nothing is returned.
    """
    if not os.path.exists(filtered_posts_dir):
        print(f"Error: Filtered posts directory does not exist: {filtered_posts_dir}")
        return

    lda_nmf_data_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'LDA_NMF_Data')
    bertopic_data_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'BERTopic_Data')

    # Ensure output directories exist
    os.makedirs(lda_nmf_data_dir, exist_ok=True)
    os.makedirs(bertopic_data_dir, exist_ok=True)

    print(f"LDA/NMF Data directory: {lda_nmf_data_dir}")
    print(f"BERTopic Data directory: {bertopic_data_dir}")

    input_prefix = "filtered_"
    # Sorted so the processing order (and the log) is the same on every run;
    # os.listdir returns entries in arbitrary order.
    files = sorted(
        f for f in os.listdir(filtered_posts_dir)
        if f.endswith('.json') and f.startswith('filtered_r_')
    )
    if not files:
        print(f"No 'filtered_r_*.json' files found in {filtered_posts_dir}")
        return

    print(f"\nFound {len(files)} files in {filtered_posts_dir}. Processing for Topic Modeling...")

    for file in files:
        input_filepath = os.path.join(filtered_posts_dir, file)
        # Swap only the LEADING "filtered_" prefix. str.replace would also
        # rewrite the same substring inside the subreddit name
        # (e.g. "filtered_r_unfiltered_news.json").
        name_without_prefix = file[len(input_prefix):]
        lda_nmf_output_filepath = os.path.join(lda_nmf_data_dir, "lda_nmf_" + name_without_prefix)
        bertopic_output_filepath = os.path.join(bertopic_data_dir, "bertopic_" + name_without_prefix)

        try:
            with open(input_filepath, 'r', encoding='utf-8') as infile:
                data = json.load(infile)

            lda_nmf_output_data = []
            bertopic_output_data = []

            for post in data:
                post_id = post.get("id")
                if not post_id:
                    print(f"Warning: Post in {file} missing 'id'. Skipping this post.")
                    continue

                # "or ''" also covers fields that are present but null.
                title = post.get("title", "") or ""
                selftext = post.get("selftext", "") or ""
                combined_text = f"{title} {selftext}".strip()

                # --- Run the BERTopic Pipeline ---
                bertopic_output_data.append({
                    "id": post_id,
                    "processed_text_bertopic": preprocess_for_bertopic(combined_text),
                })

                # --- Run the LDA/NMF Pipeline ---
                lda_nmf_output_data.append({
                    "id": post_id,
                    "processed_tokens_lda_nmf": preprocess_for_lda_nmf(combined_text, custom_stopwords),
                })

            # --- Write Outputs (only when there is something to write) ---
            if lda_nmf_output_data:
                with open(lda_nmf_output_filepath, 'w', encoding='utf-8') as outfile:
                    json.dump(lda_nmf_output_data, outfile, ensure_ascii=False, indent=2)
                print(f"Saved {len(lda_nmf_output_data)} LDA/NMF token sets from {file} -> {lda_nmf_output_filepath}")

            if bertopic_output_data:
                with open(bertopic_output_filepath, 'w', encoding='utf-8') as outfile:
                    json.dump(bertopic_output_data, outfile, ensure_ascii=False, indent=2)
                print(f"Saved {len(bertopic_output_data)} BERTopic texts from {file} -> {bertopic_output_filepath}")

            if not lda_nmf_output_data and not bertopic_output_data:
                print(f"No processable posts found in {file} (all might have been missing IDs).")

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file}: {e}")
        except Exception as e:
            # Per-file boundary: report the failure with a traceback and move
            # on to the next file instead of aborting the whole run.
            print(f"An unexpected error occurred processing {file}: {e}")
            import traceback
            traceback.print_exc()


In [None]:
# Run the preprocessing over every filtered posts file, then report completion.
process_all_files()
print("\nProcessing complete.")


In [None]:
# # --- Test Cases ---
# test_cases = [
#     # Basic Cleaning
#     ("  Test &amp; example with   extra spaces. Don't forget!  ", "Test & example with extra spaces. Do not forget!"),
#     ("Visit www.example.com or https://test.org/path?q=1", "Visit or"),
#     ("Here's some <b>bold</b> text and <a href='#'>a link</a>.", "Here is some bold text and a link."),
#     ("It&apos;s double encoded: &amp;amp;", "It is double encoded: &"),

#     # Markdown Specific
#     ("# Header\n*italic* and **bold** ~~strike~~ `code`", "Header italic and bold strike code"),
#     ("A line.\n\n---\n\nAnother line.", "A line. Another line."), # Horizontal rule
#     ("> Blockquote here", "Blockquote here"),
#     ("Check [this link](http://example.com) and ![alt text](img.jpg)", "Check this link and"),

#     # Markdown Tables
#     ("Text before.\n| Header 1 | Header 2 |\n|---|---|\n| Cell 1 | Cell 2 |\nText after.", "Text before. Text after."),
#     ("Table at start:\n| A | B |\n|---|---|\n| 1 | 2 |", "Table at start:"),
#     ("| C | D |\n|---|---|\n| 3 | 4 |\nEnd table.", "End table."),
#     ("No table here, just | pipes | in text.", "No table here, just | pipes | in text."), # Should not remove this
#     ("Text\n\n| Head |\n|:--|\n| Val |\n\nMore text", "Text More text"), # Table with surrounding newlines

#     # Edge Cases & Combinations
#     ("It's <br>broken &amp; messy www.site.com\n\n| T1 | T2 |\n|--|--|\n| V1 | V2 |", "It is broken & messy"), # Combo
#     ("", ""), # Empty string
#     ("  ", ""), # Only whitespace
#     ("`code` don't remove &lt;tag&gt;", "code do not remove <tag>"), # Mixed entities/markdown/contractions
#     ("Line with / slash", "Line with / slash"), # Slash preserved (assuming slash replacement is removed)

#     # Test case similar to the problematic Bogleheads post (simplified)
#     ("First $100,000 saved!\n\n| Salary | Date |\n|---|---|\n| $32,000 | 12/31/2015 |\n| $60,000 | 12/31/2016 |\n\nTotal Growth $27,000", "First $100,000 saved! Total Growth $27,000") # Note: Symbols ($ ,) are removed later
# ]

# # --- Run Tests ---
# print("--- Running initial_clean Tests ---")
# passed = 0
# failed = 0
# for i, (input_text, expected_output) in enumerate(test_cases):
#     actual_output = initial_clean(input_text)
#     if actual_output == expected_output:
#         print(f"[PASS] Test Case {i+1}")
#         passed += 1
#     else:
#         print(f"[FAIL] Test Case {i+1}")
#         print(f"  Input:    '{input_text}'")
#         print(f"  Expected: '{expected_output}'")
#         print(f"  Actual:   '{actual_output}'")
#         failed += 1
# print("--- Test Summary ---")
# print(f"Passed: {passed}")
# print(f"Failed: {failed}")
# print("--- End Tests ---")


In [9]:
# lda_nmf_test_cases = [
#     # Basic cases
#     ("He said 'hello'", ["hello"]),          # Leading ' removed, 'hello' kept

#     # Previous number tests (should still pass if number logic is correct)
#     ("Price is 15.000.000 dollars", ["price", "dollar"]),
#     ("Gained +300.000 points", ["gained", "point"]),
#     ("Increase of 1,500,000", ["increase"]),
#     ("Profit: $50,000.50", ["profit"]),
#     ("Down -50 points", ["point"]),
#     ("Only 1 left", ["leave"]),
#     ("Number 000", ["number"]),
#     ("First $100,000 saved!", ["first", "save"]),
#     ("Stock ABC went up by 5.5% to 123.456", ["stock", "abc"]),
#     ("go 123 45.678 hello", ["hello"]),

#     # New tests for specific punctuation/symbols (with updated expectations)
#     ("A quick break -- then continue", ["quick", "break", "continue"]), # Expect -- removed
#     ("Check item \\- it is important", ["check", "item", "important"]), # Expect \- removed
#     ("Check item - it is important", ["check", "item", "important"]),      # Expect - removed
#     ("Wait ... what happened?", ["wait", "happen"]),                       # Expect ... removed, happened -> happen
#     ("He said ''wait''", ["wait"]),                                        # Expect `` removed (was '')
#     ("She mentioned \"stop\"", ["mention", "stop"]),                       # Expect `` removed (was "), mentioned -> mention
#     ("It's a test", ["test"]),                                             # Expect 's removed
#     ("Symbols like : or ; or _", ["symbols"]),

#     # --- NEW TEST CASES FOR SINGLE LETTERS ---
#     ("Go to r/subreddit", []),                 # Expect 'r' removed (assuming 'go', 'to' are stopwords)
#     ("Can u tell me?", []),                         # Expect 'u' removed (assuming 'can', 'me' are stopwords)
#     ("See page p. 5", ["page"]),                   # Expect 'p' removed (assuming '.' and '5' are removed) - 'see' kept if not stopword
#     ("Can u see p. 1 of r/stocks?", []),    # Combine 'u', 'p', 'r' (assuming 'can', 'of', '.' '1' are removed/stopwords)
#     ("a b c d e f g h i j k l m n o p q r s t u v w x y z", []), # Every single-character token is dropped by the len(token) > 1 filter (stop word or not), so nothing survives

#     # --- NEW TEST CASES FOR U.S. / you.s ---
#     ("The U.S. economy is strong.", ["united_states", "economy", "strong"]), # Expected resolution
#     ("He visited the U.S.", ["visit", "united_states"]), # Expected resolution
#     ("Focus on u.s. stocks", ["focus", "united_states", "stock"]), # Lowercase input
#     ("U.S. dollars are used", ["united_states", "dollar"]), # With another word
#     ("Compare U.S. and China", ["compare", "united_states", "china"]), # Plural context?
#     ("Is this related to us?", ["related"]), # Pronoun 'us', should be removed by stopwords
# ]

# # --- Test Runner Code ---
# print("--- Running preprocess_for_lda_nmf Number Tests ---")
# passed_lda = 0
# failed_lda = 0
# for i, (input_text, expected_tokens) in enumerate(lda_nmf_test_cases):
#     # Pass the custom_stopwords set defined earlier in the notebook
#     actual_tokens = preprocess_for_lda_nmf(input_text, custom_stopwords)
#     # Sort for comparison consistency
#     if sorted(actual_tokens) == sorted(expected_tokens):
#         print(f"[PASS] LDA/NMF Test Case {i+1}")
#         passed_lda += 1
#     else:
#         print(f"[FAIL] LDA/NMF Test Case {i+1}")
#         print(f"  Input:    '{input_text}'")
#         print(f"  Expected: {sorted(expected_tokens)}")
#         print(f"  Actual:   {sorted(actual_tokens)}")
#         failed_lda += 1

# print("--- LDA/NMF Test Summary ---")
# print(f"Passed: {passed_lda}")
# print(f"Failed: {failed_lda}")
# print("--- End LDA/NMF Tests ---")
