In [3]:
import json
import re
import html
import os
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk

# Ensure the NLTK data used by this notebook is available, downloading a
# package only when none of its known resource paths can be located.
#
# Maps each downloadable package name to the resource paths that indicate it
# is already installed. WordNet is probed under two paths: the recorded output
# of this cell shows the plain 'corpora/wordnet' lookup failing (and triggering
# a download) even though NLTK then reports the package as up to date.
# NOTE(review): presumably the corpus is kept as a zip archive on disk, which
# is why the '.zip' path is also tried -- confirm against the installed NLTK.
_required_nltk_resources = {
    'punkt': ('tokenizers/punkt',),
    'stopwords': ('corpora/stopwords',),
    'wordnet': ('corpora/wordnet', 'corpora/wordnet.zip'),
    'averaged_perceptron_tagger': ('taggers/averaged_perceptron_tagger',),
}

for _package, _candidate_paths in _required_nltk_resources.items():
    for _resource_path in _candidate_paths:
        try:
            nltk.data.find(_resource_path)
            break  # Found an installed copy; nothing to download
        except LookupError:
            continue  # Try the next known location for this package
    else:
        # No candidate path resolved, so fetch the package
        print(f"Downloading NLTK resource: {_package}")
        nltk.download(_package)


# Single WordNet lemmatizer instance reused across the notebook
lemmatizer = WordNetLemmatizer()

# Project root, expressed relative to the notebook's working directory
project_root = '../../'

# Input and output folders, all located under the project root
filtered_posts_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'Filtered Posts')
ml_data_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'ML_Data')
finbert_data_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'FinBERT_Data')
lexicon_data_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'Lexicon_Data')

# Create each output folder that is not there yet
for _out_dir in (ml_data_dir, finbert_data_dir, lexicon_data_dir):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

# Echo the resolved locations so a wrong working directory is easy to spot
print(f"Filtered posts directory: {filtered_posts_dir}")
print(f"ML Data directory: {ml_data_dir}")
print(f"FinBERT Data directory: {finbert_data_dir}")
print(f"Lexicon Data directory: {lexicon_data_dir}")


Downloading NLTK resource: wordnet
Filtered posts directory: ../../Data\Historical Reddit\Filtered Posts
ML Data directory: ../../Data\Historical Reddit\ML_Data
FinBERT Data directory: ../../Data\Historical Reddit\FinBERT_Data
Lexicon Data directory: ../../Data\Historical Reddit\Lexicon_Data


[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Dictionary for lexicon-based sentiment.
# Every phrase maps to itself with spaces and hyphens turned into underscores,
# so the mapping is derived from the phrase list instead of being spelled out.
sentiment_jargon_dict = {
    phrase: phrase.replace(" ", "_").replace("-", "_")
    for phrase in (
        # 🚀 Bullish Sentiment / Positive
        "diamond hands", "hodl", "hodler",
        "wagmi", "ape in", "btfd", "buy the dip",
        "go long", "long it", "going long",
        "bull run", "bullish", "long call",
        "long position", "long stock",
        "to the moon", "moonshot", "ath",
        "stonks", "dca", "flippening",
        "wen lambo", "bear trap",

        # 😬 Bearish Sentiment / Negative
        "paper hands", "bagholder", "ngmi",
        "rekt", "gn", "bearish", "long put",
        "exit scam", "shill", "rug pull",
        "pump and dump", "margin call",
        "liquidated", "overleveraged",
        "capitulate", "bull trap",
        "short position", "short sell",
        "shorting", "shorts", "shorted",
        "short interest", "short call",
        "short it", "short-selling",
        "short selling",
    )
}

# Comprehensive dictionary for ML preprocessing:
# starts as a copy of the sentiment jargon, then gains neutral financial terms.
ml_jargon_dict = dict(sentiment_jargon_dict)
ml_jargon_dict.update({
    # Multi-word metrics collapsed to their common abbreviations
    "market cap": "market_cap",
    "price target": "price_target",
    "earnings per share": "eps",
    "price to earnings": "pe_ratio",
    "return on equity": "roe",
    "return on investment": "roi",
    # Trading styles and analysis approaches
    "dollar cost averaging": "dollar_cost_averaging",
    "day trading": "day_trading",
    "swing trading": "swing_trading",
    "technical analysis": "technical_analysis",
    "fundamental analysis": "fundamental_analysis",
    "market sentiment": "market_sentiment",
    # Price and volume vocabulary
    "trading volume": "trading_volume",
    "price action": "price_action",
    "support level": "support_level",
    "resistance level": "resistance_level",
    "moving average": "moving_average",
    "relative strength": "relative_strength",
    "market trend": "market_trend",
    # Strategy and risk vocabulary
    "trading strategy": "trading_strategy",
    "investment portfolio": "investment_portfolio",
    "risk management": "risk_management",
    "position sizing": "position_sizing",
    "stop loss": "stop_loss",
    "take profit": "take_profit",
    # Orders, options and valuation
    "market order": "market_order",
    "limit order": "limit_order",
    "options chain": "options_chain",
    "implied volatility": "implied_volatility",
    "time value": "time_value",
    "intrinsic value": "intrinsic_value",
    "dividend yield": "dividend_yield",
    # Market structure and sessions
    "market maker": "market_maker",
    "bid ask spread": "bid_ask_spread",
    "liquidity": "liquidity",
    "market depth": "market_depth",
    "order book": "order_book",
    "market hours": "market_hours",
    "pre market": "pre_market",
    "after hours": "after_hours",
    "market open": "market_open",
    "market close": "market_close",
})


In [5]:
# Extra stopwords for sentiment analysis, added on top of NLTK's English list.
# Each group is a whitespace-separated string split into individual words.
additional_stopwords = set().union(
    # Generic conversational noise
    "say like make think know people use want".split(),
    # Reddit meta noise
    "post link click reddit user thread discussion karma".split(),
    "rule comment mod topic share list".split(),
    # Generic time words
    "day week month year today january april friday thursday".split(),
    # More conversational noise ('well', 'feel', 'much' deliberately left out)
    "also back come even every find get give go".split(),
    "keep let look lot many may must new one".split(),
    "part really right see seem something still thing".split(),
    "try way would".split(),
    # More Reddit meta noise
    "edit general information meta op poll prior read".split(),
    "sub subreddit welcome please".split(),
    # More time references
    "currently daily et last monthly pm recent since".split(),
    "time tomorrow yesterday".split(),
    # Common, mostly neutral actions / verbs / nouns
    "account around article data different example move".split(),
    "need pay project start take work".split(),
    # Question-and-answer chatter
    "answer ask help hi live open question talk tell you".split(),
    # Leftover artifacts
    "com dot inc".split(),
)

# Negations and contrast words that must survive stopword removal,
# since they matter for sentiment
stopwords_to_keep = {
    "not", "no", "but", "without", "against",
    "never", "neither", "nor", "cannot",
}

# Final set: NLTK English stopwords plus the extras, minus the protected words
default_stopwords = set(stopwords.words('english'))
custom_stopwords = (default_stopwords | additional_stopwords) - stopwords_to_keep


In [6]:
def remove_markdown(text):
    """Strip common markdown syntax while leaving newlines in place.

    Newlines are intentionally preserved and the result is not stripped, so
    callers can still run line-anchored patterns afterwards.

    Args:
        text: Raw text that may contain markdown.

    Returns:
        The text with markdown markers removed and their inner content kept.
    """
    # (pattern, replacement, flags), applied strictly in this order:
    # double markers must be handled before their single-character variants.
    substitutions = (
        (r'\*\*(.*?)\*\*', r'\1', 0),                           # **bold**
        (r'__(.*?)__', r'\1', 0),                               # __bold__
        (r'\*(.*?)\*', r'\1', 0),                               # *italic*
        (r'_(.*?)_', r'\1', 0),                                 # _italic_ (also hits underscores inside words)
        (r'^\s*#+\s*(.*?)\s*#*\s*$', r'\1', re.MULTILINE),      # # Header
        (r'^\s*>\s?(.*)', r'\1', re.MULTILINE),                 # > blockquote
        (r'\[(.*?)\]\(.*?\)', r'\1', 0),                        # [text](url) -> text
        (r'`{1,3}(.*?)`{1,3}', r'\1', re.DOTALL),               # `code` / ```code```
        (r'~~(.*?)~~', r'\1', 0),                               # ~~strikethrough~~
        (r'^\s*[\*\-\+]\s+', '', re.MULTILINE),                 # unordered list markers
        (r'^\s*\d+\.\s+', '', re.MULTILINE),                    # ordered list markers
        (r'^\s*[-*_]{3,}\s*$', '', re.MULTILINE),               # horizontal rules
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text

def replace_jargon(text, jargon_dict):
    """Substitute every jargon phrase in the text with its standardized token.

    Phrases are matched case-insensitively and only on word boundaries, so a
    phrase embedded inside a longer word is left alone. Substitutions run in
    the dictionary's iteration order.

    Args:
        text: Text to scan.
        jargon_dict: Mapping of phrase -> replacement string.

    Returns:
        The text with all matching phrases replaced.
    """
    for phrase in jargon_dict:
        bounded_phrase = rf'\b{re.escape(phrase)}\b'
        text = re.sub(bounded_phrase, jargon_dict[phrase], text, flags=re.IGNORECASE)
    return text

def initial_clean(text):
    """Performs initial text cleaning common to most pipelines.

    Steps, in order: strip HTML tags, unescape HTML entities (twice), drop
    subreddit/user references, remove markdown, remove table rows, remove
    URLs, expand contractions, then flatten newlines and collapse whitespace.

    Args:
        text: Raw post text (title and body).

    Returns:
        A single-line string with normalized whitespace.
    """
    # Strip HTML tags BEFORE unescaping, so entities decoded below are not
    # mistaken for tags
    text = re.sub(r'<.*?>', '', text)

    # Unescape twice in case some entities were doubly encoded (&amp;amp;)
    text = html.unescape(text)
    text = html.unescape(text)

    # Remove subreddit (r/) and user (u/) references in all of these forms:
    # r/subreddit, /r/subreddit, u/user, /u/user.
    # The word boundary sits AFTER the optional leading slash: placed before
    # it, the slash was only consumed when a word character preceded it, so
    # " /r/sub" left a stray "/" behind.
    text = re.sub(r'[/\\]?\b[ur]/\w+\b', '', text)

    # Markdown removal keeps newlines, which the table patterns below rely on
    text = remove_markdown(text)

    # --- Table removal (needs newlines for ^ / $ with MULTILINE) ---
    # Separator rows such as |:---|:---| or :--|--:
    text = re.sub(r'^\s*[|: -]+\|?\s*$\n?', '', text, flags=re.MULTILINE)
    # Any remaining line that contains at least two pipe characters
    text = re.sub(r'^.*\|(?:.*\|)+.*$\n?', '', text, flags=re.MULTILINE)

    # Remove URLs (http/https or www.)
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Expand contractions (e.g., "don't" -> "do not"). This is best effort:
    # if the library fails on an edge case, the text is kept unexpanded.
    try:
        text = contractions.fix(text)
    except Exception:
        pass

    # --- Final cleanup ---
    # Replace literal '\n' sequences and real newlines with a space
    text = re.sub(r'\\n+|\n+', ' ', text)
    # Collapse whitespace runs to a single space and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
# Translate Treebank POS tags into the POS constants WordNet expects
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant matching a Treebank tag.

    The decision is made on the tag's first letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(treebank_tag[:1], wordnet.NOUN)

def resolve_us_token(token, pos_tag):
    """Tell the country 'US' apart from the pronoun 'us'.

    Args:
        token: The token to inspect (any casing).
        pos_tag: Its Treebank POS tag.

    Returns:
        "united_states" when the token is some casing of "us" and is tagged
        as a singular proper noun (NNP); otherwise the token unchanged.
    """
    refers_to_country = token.lower() == "us" and pos_tag == "NNP"
    return "united_states" if refers_to_country else token

def tokenize_lemmatize_lowercase(text):
    """Tokenize text, then POS-tag, lemmatize and lowercase each token.

    Lowercasing happens last, after lemmatization and the US/us resolution,
    so both of those steps see the token's original casing.

    Returns:
        A list of lowercase lemmas, one per token.
    """
    processed = []
    for word, tag in pos_tag(word_tokenize(text)):
        lemma = lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        processed.append(resolve_us_token(lemma, tag).lower())
    return processed


In [9]:
def merge_financial_tokens(tokens):
    """Re-join financial tokens that were split apart during tokenization.

    Handles, in order of precedence for a leading "$":
      "$" + "100,000" -> "$100000"   (commas removed)
      "$" + "100"     -> "$100"
      "$" + "30K"     -> "$30k"      (suffix lowercased)
      "$" + "aapl"    -> "$aapl"     (2-7 lowercase letters, not a stopword)
    and for numbers:
      "5" + "%"       -> "5%"
    Stand-alone comma-formatted numbers are also stripped of their commas.

    Args:
        tokens: List of string tokens.

    Returns:
        A new list with the merges applied.
    """
    comma_number = r'^\d{1,3}(,\d{3})+(\.\d+)?$'
    plain_number = r'^\d+(\.\d+)?$'
    shorthand_amount = r'^\d+(\.\d+)?[kKmMbB]$'
    ticker = r'^[a-z]{1,7}$'

    result = []
    total = len(tokens)
    pos = 0
    while pos < total:
        tok = tokens[pos]
        following = tokens[pos + 1] if pos + 1 < total else None
        combined = None  # Set when tok and following form one financial token

        if following:
            if tok == "$":
                if re.match(comma_number, following):
                    combined = "$" + following.replace(',', '')
                elif re.match(plain_number, following):
                    combined = f"${following}"
                elif re.match(shorthand_amount, following, re.IGNORECASE):
                    # Checked before the ticker rule; suffix standardized to lowercase
                    combined = f"${following.lower()}"
                elif re.match(ticker, following) and following not in custom_stopwords and len(following) > 1:
                    # Guard against "$" followed by an ordinary stopword or a single letter
                    combined = f"${following}"
            elif re.match(plain_number, tok) and following == "%":
                combined = f"{tok}%"

        if combined is not None:
            result.append(combined)
            pos += 2  # Both tokens were consumed by the merge
            continue

        # No merge: keep the token, dropping commas from formatted numbers
        if re.match(comma_number, tok):
            result.append(tok.replace(',', ''))
        else:
            result.append(tok)
        pos += 1

    return result


In [10]:
# Stopword removal for ML (keeps numbers, $, %, tickers, etc.)
def remove_stopwords_ml(tokens, stopwords_set):
    """Removes stopwords while keeping relevant financial/numeric tokens for ML.

    A token is kept when it is not a stopword and matches one of:
      - a merged dollar amount such as "$100000" or "$30k"
      - a percentage such as "5%" or "2.5%"
      - a merged lowercase ticker such as "$aapl"
      - a plain (optionally signed) number
      - a word of letters/digits/underscores/hyphens with at least one letter
    Everything else (punctuation, stray symbols, malformed tokens) is dropped.

    Args:
        tokens: Iterable of string tokens (expected to be lowercased already).
        stopwords_set: Collection of stopwords to discard.

    Returns:
        A list of the tokens that were kept, in their original order.
    """
    # NOTE: these are raw strings, so each escape uses a SINGLE backslash.
    # Doubled backslashes (r'^\\$\\d+') match a literal backslash character
    # and would silently discard every "$..." and "...%" token.
    keep_patterns = (
        r'^\$\d+(\.\d+)?[kKmMbB]?$',                       # dollar amounts
        r'^\d+(\.\d+)?%$',                                 # percentages
        r'^\$[a-z]{1,7}$',                                 # $tickers
        r'^[+-]?\d+(\.\d+)?$',                             # plain numbers
        r'^[a-zA-Z0-9_-]*[a-zA-Z]+[a-zA-Z0-9_-]*$',        # words / underscore jargon
    )

    cleaned_tokens = []
    for token in tokens:
        if token in stopwords_set:
            continue
        if any(re.match(pattern, token) for pattern in keep_patterns):
            cleaned_tokens.append(token)
    return cleaned_tokens

# Stopword removal for Lexicon (lowercase words plus key financial tokens)
def remove_stopwords_lexicon(tokens, stopwords_set):
    """Filter tokens down to what the lexicon-based analysis consumes.

    A token survives when it is not a stopword and is one of:
      - a merged dollar amount ("$100000", "$30k")
      - a percentage ("5%", "2.5%")
      - a merged lowercase ticker ("$aapl")
      - a word made only of lowercase letters and underscores
    Stand-alone numbers, other symbols and punctuation are dropped.

    Returns:
        A list of the surviving tokens in their original order.
    """
    allowed_shapes = (
        r'^\$\d+(\.\d+)?[kKmMbB]?$',   # dollar amounts
        r'^\d+(\.\d+)?%$',             # percentages
        r'^\$[a-z]{1,7}$',             # $tickers
        r'^[a-z_]+$',                  # lowercase words / underscore_jargon
    )
    return [
        tok
        for tok in tokens
        if tok not in stopwords_set
        and any(re.match(shape, tok) for shape in allowed_shapes)
    ]


In [11]:
def preprocess_for_finbert_pipeline(text):
    """Produce the cleaned text that is fed to FinBERT.

    Only the shared initial cleaning is applied: no jargon replacement,
    tokenization or stopword removal, so the text stays close to the original.
    """
    return initial_clean(text)

def preprocess_for_lexicon_pipeline(text, jargon_dict, stopwords_set):
    """Pipeline for generating tokens for lexicon-based analysis.

    Args:
        text: Raw post text.
        jargon_dict: Phrase -> token mapping applied after cleaning.
        stopwords_set: Stopwords handed to the lexicon-specific filter.

    Returns:
        The list of tokens left after the lexicon stopword filter.
    """
    # 1. Initial clean (HTML, markdown, URLs, contractions)
    cleaned_text = initial_clean(text)
    # 2. Replace jargon using the supplied dictionary
    jargon_replaced_text = replace_jargon(cleaned_text, jargon_dict)
    # 3. Turn slashes into spaces so "a/b" tokenizes as two words
    jargon_replaced_text = re.sub(r'[/]', ' ', jargon_replaced_text)
    # Collapse the extra whitespace introduced above. In a raw string this
    # must be r'\s+'; r'\\s+' matches a literal backslash followed by "s".
    jargon_replaced_text = re.sub(r'\s+', ' ', jargon_replaced_text).strip()
    # 4. Tokenize, lemmatize, lowercase
    lemmatized_tokens = tokenize_lemmatize_lowercase(jargon_replaced_text)
    # 5. Merge financial tokens
    merged_tokens = merge_financial_tokens(lemmatized_tokens)
    # 6. Remove stopwords using the lexicon-specific filter
    final_tokens = remove_stopwords_lexicon(merged_tokens, stopwords_set)
    return final_tokens

def preprocess_for_ml_pipeline(text, jargon_dict, stopwords_set):
    """Pipeline for generating tokens for traditional ML models.

    Args:
        text: Raw post text.
        jargon_dict: Phrase -> token mapping applied after cleaning.
        stopwords_set: Stopwords handed to the ML-specific filter.

    Returns:
        The list of tokens left after the ML stopword filter.
    """
    # 1. Initial clean (HTML, markdown, URLs, contractions)
    cleaned_text = initial_clean(text)
    # 2. Replace jargon using the supplied dictionary
    jargon_replaced_text = replace_jargon(cleaned_text, jargon_dict)
    # 3. Turn slashes into spaces so "a/b" tokenizes as two words
    jargon_replaced_text = re.sub(r'[/]', ' ', jargon_replaced_text)
    # Collapse the extra whitespace introduced above. In a raw string this
    # must be r'\s+'; r'\\s+' matches a literal backslash followed by "s".
    jargon_replaced_text = re.sub(r'\s+', ' ', jargon_replaced_text).strip()
    # 4. Tokenize, lemmatize, lowercase
    lemmatized_tokens = tokenize_lemmatize_lowercase(jargon_replaced_text)
    # 5. Merge financial tokens
    merged_tokens = merge_financial_tokens(lemmatized_tokens)
    # 6. Remove stopwords using the ML-specific filter
    final_tokens = remove_stopwords_ml(merged_tokens, stopwords_set)
    return final_tokens


In [12]:
def process_all_files():
    """Run all three preprocessing pipelines over every filtered subreddit file.

    Reads each 'filtered_r_*.json' file in `filtered_posts_dir`, builds the
    FinBERT text, the lexicon tokens and the ML tokens for every post that has
    an 'id', and writes one JSON list per pipeline into `finbert_data_dir`,
    `lexicon_data_dir` and `ml_data_dir`. A failure in one file is reported
    and does not stop the remaining files.
    """
    def _dump_json(records, path):
        # Shared writer: UTF-8, non-ASCII characters kept as-is, 2-space indent
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(records, handle, ensure_ascii=False, indent=2)

    if not os.path.exists(filtered_posts_dir):
        print(f"Error: Filtered posts directory does not exist: {filtered_posts_dir}")
        return

    # Make sure every output folder exists before any file is processed
    for out_dir in (ml_data_dir, finbert_data_dir, lexicon_data_dir):
        os.makedirs(out_dir, exist_ok=True)

    files = [
        name
        for name in os.listdir(filtered_posts_dir)
        if name.endswith('.json') and name.startswith('filtered_r_')
    ]
    if not files:
        print(f"No 'filtered_r_*.json' files found in {filtered_posts_dir}")
        return

    print(f"Found {len(files)} files in {filtered_posts_dir}. Processing...")

    for file in files:
        input_filepath = os.path.join(filtered_posts_dir, file)
        # Output names swap the "filtered_" prefix for a per-pipeline prefix
        ml_output_filepath = os.path.join(ml_data_dir, file.replace("filtered_", "ml_"))
        finbert_output_filepath = os.path.join(finbert_data_dir, file.replace("filtered_", "finbert_"))
        lexicon_output_filepath = os.path.join(lexicon_data_dir, file.replace("filtered_", "lexicon_"))

        try:
            with open(input_filepath, 'r', encoding='utf-8') as infile:
                data = json.load(infile)

            # One result list per pipeline, each entry keyed by post id
            ml_output_data, finbert_output_data, lexicon_output_data = [], [], []

            for post in data:
                post_id = post.get("id")
                if not post_id:
                    print(f"Warning: Post in {file} missing 'id'. Skipping this post.")
                    continue

                # Missing or null title/selftext are treated as empty strings
                title = post.get("title", "") or ""
                selftext = post.get("selftext", "") or ""
                combined_text = f"{title} {selftext}".strip()

                # FinBERT pipeline: cleaned text
                finbert_output_data.append({
                    "id": post_id,
                    "processed_text_finbert": preprocess_for_finbert_pipeline(combined_text)
                })

                # Lexicon pipeline: sentiment jargon dictionary
                lexicon_output_data.append({
                    "id": post_id,
                    "processed_tokens_lexicon": preprocess_for_lexicon_pipeline(
                        combined_text, sentiment_jargon_dict, custom_stopwords
                    )
                })

                # ML pipeline: comprehensive jargon dictionary (id + tokens only)
                ml_output_data.append({
                    "id": post_id,
                    "processed_tokens_ml": preprocess_for_ml_pipeline(
                        combined_text, ml_jargon_dict, custom_stopwords
                    )
                })

            # --- Write outputs (a file is only written when it has entries) ---
            if ml_output_data:
                _dump_json(ml_output_data, ml_output_filepath)
                print(f"Saved {len(ml_output_data)} ML token sets from {file} -> {ml_output_filepath}")
            else:
                print(f"No processable posts found in {file} (all might have been missing IDs).")

            if finbert_output_data:
                _dump_json(finbert_output_data, finbert_output_filepath)
                print(f"Saved {len(finbert_output_data)} FinBERT texts -> {finbert_output_filepath}")

            if lexicon_output_data:
                _dump_json(lexicon_output_data, lexicon_output_filepath)
                print(f"Saved {len(lexicon_output_data)} Lexicon tokens -> {lexicon_output_filepath}")

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred processing {file}: {e}")
            import traceback
            traceback.print_exc()  # Full traceback for unexpected errors


In [13]:
# Run the preprocessing pass over every filtered subreddit file, then print a
# final marker so the end of the run is visible in the cell output.
process_all_files()
print("\nProcessing complete.") # Optional: confirms when it's done


Found 7 files in ../../Data\Historical Reddit\Filtered Posts. Processing...
Saved 691 ML token sets from filtered_r_bogleheads.json -> ../../Data\Historical Reddit\ML_Data\ml_r_bogleheads.json
Saved 691 FinBERT texts -> ../../Data\Historical Reddit\FinBERT_Data\finbert_r_bogleheads.json
Saved 691 Lexicon tokens -> ../../Data\Historical Reddit\Lexicon_Data\lexicon_r_bogleheads.json
Saved 22142 ML token sets from filtered_r_cryptocurrency.json -> ../../Data\Historical Reddit\ML_Data\ml_r_cryptocurrency.json
Saved 22142 FinBERT texts -> ../../Data\Historical Reddit\FinBERT_Data\finbert_r_cryptocurrency.json
Saved 22142 Lexicon tokens -> ../../Data\Historical Reddit\Lexicon_Data\lexicon_r_cryptocurrency.json
Saved 11471 ML token sets from filtered_r_investing.json -> ../../Data\Historical Reddit\ML_Data\ml_r_investing.json
Saved 11471 FinBERT texts -> ../../Data\Historical Reddit\FinBERT_Data\finbert_r_investing.json
Saved 11471 Lexicon tokens -> ../../Data\Historical Reddit\Lexicon_Data\l