# In [None]:  (cell marker left over from the Jupyter notebook export)
import os
import csv
import json
import time
import sys
import re
import requests
import argparse
import logging
from typing import Dict, List, Any, Optional, Tuple
from collections import defaultdict
import concurrent.futures  # Added for parallel processing

# Configuration with fixed Chatbox and deepseek8b
class Config:
    """Module-wide settings for the 8-K product-announcement extractor.

    All values are plain class attributes. ``parse_arguments`` overwrites
    some of them (input/output paths, product cap, log directory, worker
    count, batch size) when the matching command-line option is supplied.
    """

    # --- Input / output -------------------------------------------------
    # Directory with downloaded JSON 8-K filings
    INPUT_DIR = "8k_filings_json"
    # Output CSV file
    OUTPUT_FILE = "product_announcements.csv"
    # Maximum number of products to find
    MAX_PRODUCTS = 100
    # Directory for logging LLM responses
    LOG_DIR = "llm_logs"
    # File to track processing progress
    PROGRESS_FILE = "processing_progress.json"

    # --- LLM endpoint ---------------------------------------------------
    # Base URL, model name and chat route are concatenated/used by LLMClient;
    # the route follows the OpenAI-compatible chat-completions layout.
    LLM_API_URL = "http://127.0.0.1:11434"
    LLM_MODEL = "deepseek-r1:8b"
    LLM_CHAT_PATH = "/v1/chat/completions"

    # --- Parallel processing --------------------------------------------
    # Maximum number of parallel workers
    MAX_WORKERS = 4
    # Number of filings to process in a batch
    BATCH_SIZE = 4

def is_running_in_notebook():
    """Return True when an IPython/Jupyter shell is active in this process.

    If IPython cannot be imported the answer is always False.
    """
    try:
        from IPython import get_ipython
        # get_ipython() yields None when no interactive shell is running.
        return get_ipython() is not None
    except ImportError:
        return False

def load_processing_progress():
    """Read the set of already-processed filings from ``Config.PROGRESS_FILE``.

    Returns an empty set when the file is absent or cannot be read/parsed;
    in the latter case the error is printed rather than raised.
    """
    if not os.path.exists(Config.PROGRESS_FILE):
        return set()

    already_done = set()
    try:
        with open(Config.PROGRESS_FILE, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        already_done = set(payload.get('processed_files', []))
        print(f"📋 Loaded progress data: {len(already_done)} filings already processed")
    except Exception as e:
        # Best effort: a damaged progress file just means starting over.
        print(f"⚠️ Error loading progress file: {str(e)}")

    return already_done

def save_processing_progress(processed_files):
    """Persist the processed-filing list to ``Config.PROGRESS_FILE``.

    The data is first written to a sibling ``.tmp`` file and then moved over
    the real progress file with ``os.replace``. Writing in place would leave
    a truncated, unparseable JSON file if the run were interrupted mid-write,
    which would throw away all resume information.

    Args:
        processed_files: Iterable of filing identifiers handled so far.

    Failures are printed, not raised, so that a progress-save problem never
    aborts the main processing run.
    """
    tmp_path = f"{Config.PROGRESS_FILE}.tmp"
    try:
        progress_data = {
            'processed_files': list(processed_files),
            'last_update': time.strftime("%Y-%m-%d %H:%M:%S")
        }

        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(progress_data, f, indent=2)

        # Swap the fully written file into place in a single step.
        os.replace(tmp_path, Config.PROGRESS_FILE)

        print(f"💾 Saved progress data: {len(progress_data['processed_files'])} filings processed so far")
    except Exception as e:
        print(f"⚠️ Error saving progress file: {str(e)}")
        # Do not leave a half-written temp file behind; the previous
        # progress file (if any) is still intact at this point.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # temp file was never created or is already gone

def parse_arguments():
    """Parse command-line options and copy the supplied ones onto ``Config``.

    Inside a notebook/IPython session (see ``is_running_in_notebook``) the
    process arguments are ignored and every option takes its default value.

    Returns:
        The ``argparse.Namespace`` with all parsed options. Options that are
        not mirrored onto ``Config`` (chunk size, test-only, verbose, reset,
        prioritize_sp500) are only available through this namespace.
    """
    parser = argparse.ArgumentParser(description="Extract product announcements from JSON 8-K filings using a local LLM")

    # Input/Output settings
    parser.add_argument("--input-dir", type=str, help="Directory with JSON 8-K filings")
    parser.add_argument("--output-file", type=str, help="Output CSV file")
    parser.add_argument("--max-products", type=int, help="Maximum number of products to find")
    parser.add_argument("--log-dir", type=str, help="Directory for LLM response logs")

    # Additional options
    parser.add_argument("--chunk-size", type=int, default=6000, help="Maximum text chunk size for the LLM")
    parser.add_argument("--test-only", action="store_true", help="Just test LLM connection without processing files")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
    parser.add_argument("--reset", action="store_true", help="Reset progress and start from the beginning")
    parser.add_argument("--prioritize-sp500", action="store_true", default=True,
                        help="Prioritize S&P 500 companies (default: True)")
    # A store_true flag that already defaults to True can never be switched
    # off, so provide an explicit negative form writing to the same dest.
    parser.add_argument("--no-prioritize-sp500", dest="prioritize_sp500", action="store_false",
                        help="Do not prioritize S&P 500 companies")

    # Parallel processing options
    parser.add_argument("--workers", type=int, default=Config.MAX_WORKERS,
                        help=f"Number of parallel workers (default: {Config.MAX_WORKERS})")
    parser.add_argument("--batch-size", type=int, default=Config.BATCH_SIZE,
                        help=f"Number of filings to process in a batch (default: {Config.BATCH_SIZE})")

    if is_running_in_notebook():
        # sys.argv belongs to the notebook kernel there, so parse nothing.
        args = parser.parse_args([])
    else:
        args = parser.parse_args()

    # Apply arguments to Config if provided (falsy values such as 0 or ""
    # are treated as "not provided" and leave the Config default in place).
    if args.input_dir:
        Config.INPUT_DIR = args.input_dir
    if args.output_file:
        Config.OUTPUT_FILE = args.output_file
    if args.max_products:
        Config.MAX_PRODUCTS = args.max_products
    if args.log_dir:
        Config.LOG_DIR = args.log_dir
    if args.workers:
        Config.MAX_WORKERS = args.workers
    if args.batch_size:
        Config.BATCH_SIZE = args.batch_size

    return args

# S&P 500 company identification functions
def get_sp500_tickers():
    """Build and return a fresh set of hard-coded large-cap ticker symbols.

    Berkshire Hathaway class B appears in both its dotted and dashed
    spellings. A new set object is created on every call.
    """
    # Core S&P 500 companies (top 100 by market cap as of early 2025)
    symbols = """
        AAPL MSFT AMZN NVDA GOOGL GOOG META BRK.B BRK-B
        LLY TSLA V UNH JPM XOM AVGO MA PG HD
        COST MRK CVX ABBV KO PEP WMT ACN ADBE
        MCD BAC CRM TXN LIN AMD TMO CSCO ABT
        DHR CMCSA NKE INTC NFLX VZ PM WFC DIS
        COP INTU IBM ORCL QCOM SPGI CAT GE HON
        AMGN LOW AXP BA DE UPS RTX MS TJX GS
        BLK PLD AMAT SBUX MDT CVS GILD SYK AMT
        MDLZ ELV C ADI BKNG ADP ISRG MMC LRCX
        TGT REGN VRTX CI SO PGR ZTS BSX CB MO
    """
    return set(symbols.split())

def is_sp500_company(ticker, company_name=None, json_data=None):
    """Heuristically decide whether a filer should be treated as S&P 500.

    Three checks are tried in order; the first hit wins:
    1. ``ticker`` (case-insensitive, surrounding whitespace ignored) is in
       the set returned by ``get_sp500_tickers``.
    2. ``json_data`` is a dict with a truthy ``"isSP500"`` entry.
    3. ``company_name`` contains one of a few well-known company keywords.

    Args:
        ticker: Ticker symbol; ``None``, empty or non-string values simply
            skip check 1 instead of raising.
        company_name: Optional company name used for the keyword check.
        json_data: Optional filing JSON.

    Returns:
        True if any check matches, otherwise False.

    Note:
        Check 3 is a plain substring test, so a keyword such as "meta" also
        matches unrelated names that merely contain it.
    """
    # Method 1: predefined ticker list (skipped when no usable ticker given)
    normalized_ticker = ticker.strip().upper() if isinstance(ticker, str) else ""
    if normalized_ticker and normalized_ticker in get_sp500_tickers():
        return True

    # Method 2: flag stored in the filing JSON
    if isinstance(json_data, dict) and json_data.get("isSP500", False):
        return True

    # Method 3: keyword match on the company name. Both spellings of
    # JPMorgan are listed because the name is commonly written as one word,
    # which the spaced keyword alone can never match.
    major_companies = (
        "apple", "microsoft", "amazon", "nvidia", "google", "meta", "berkshire",
        "eli lilly", "tesla", "visa", "unitedhealth", "jp morgan", "jpmorgan",
        "exxon", "broadcom",
    )
    if isinstance(company_name, str) and company_name:
        lowered_name = company_name.lower()
        return any(keyword in lowered_name for keyword in major_companies)

    return False

class LLMClient:
    """Client for interacting with Chatbox LLM API"""
    
    def __init__(self, verbose: bool = False):
        """Capture endpoint settings from ``Config`` and open an HTTP session.

        Args:
            verbose: When True, methods print diagnostic output.
        """
        # Endpoint pieces are snapshotted at construction time.
        self.base_url, self.chat_path, self.model = (
            Config.LLM_API_URL,
            Config.LLM_CHAT_PATH,
            Config.LLM_MODEL,
        )
        self.verbose = verbose
        # Read by query_llm_for_product: when set, raw responses are logged there.
        self.log_dir = None

        # One shared session so repeated requests can reuse connections.
        self._session = requests.Session()
    
    def test_connection(self) -> bool:
        """Test connection to the Chatbox LLM service"""
        try:
            if self.verbose:
                print(f"Testing connection to Chatbox at {self.base_url} using model {self.model}")
            
            # OpenAI-compatible API test
            test_payload = {
                "model": self.model,
                "messages": [
                    {"role": "user", "content": "Respond with 'Chatbox is working.'"}
                ],
                "temperature": 0.1
            }
            
            response = self._session.post(
                f"{self.base_url}{self.chat_path}", 
                json=test_payload, 
                headers={"Content-Type": "application/json"},
                timeout=10
            )
            
            if response.status_code == 200:
                if self.verbose:
                    print("✓ Successfully connected to Chatbox LLM")
                return True
            else:
                print(f"✗ Failed to connect to Chatbox LLM: {response.status_code}")
                print(f"  Response: {response.text[:200]}")
                
                # Suggest installation if model not found
                if "not found" in response.text:
                    print(f"\nTo use {self.model}, you might need to install it first:")
                    print(f"Run: 'ollama pull {self.model}'")
                    print("For help with other models, visit: https://ollama.com/library")
                
                return False
                
        except Exception as e:
            print(f"✗ Error connecting to Chatbox LLM: {str(e)}")
            print("  Make sure Chatbox is running and the API URL is correct")
            return False
    
    def query(self, prompt: str, system_prompt: str = None, temperature: float = 0.1) -> Dict[str, Any]:
        """
        Query the LLM with a prompt and optionally a system prompt
        Returns a dictionary with status and content/error
        """
        try:
            # Prepare the request
            messages = []
            
            # Add system prompt if provided
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            
            # Add user prompt
            messages.append({"role": "user", "content": prompt})
            
            # Prepare the API call
            payload = {
                "model": self.model,
                "messages": messages,
                "temperature": temperature  # Low temperature for more deterministic responses
            }
            
            # Make the API call
            response = self._session.post(
                f"{self.base_url}{self.chat_path}",
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=30  # Longer timeout for complex queries
            )
            
            # Process the response
            if response.status_code == 200:
                result = response.json()
                content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
                return {"status": "success", "content": content}
            else:
                error_msg = f"API error: {response.status_code}, {response.text[:200]}"
                if self.verbose:
                    print(error_msg)
                return {"status": "error", "error": error_msg}
                
        except Exception as e:
            error_msg = f"Query error: {str(e)}"
            if self.verbose:
                print(error_msg)
            return {"status": "error", "error": error_msg}
    
    def extract_json_from_llm_response(self, response_text: str) -> Dict[str, Any]:
        """Enhanced function to extract JSON data from LLM response with better error handling"""
        import json
        import re
        
        # Check for NO_PRODUCT_FOUND response
        if "NO_PRODUCT_FOUND" in response_text:
            if self.verbose:
                print("LLM explicitly reported NO_PRODUCT_FOUND")
            return {"product_found": False}
        
        # Debug log the response
        if self.verbose:
            print("\n----- LLM RESPONSE EXCERPT -----")
            print(response_text[:200] + "..." if len(response_text) > 200 else response_text)
            print("----- END EXCERPT -----\n")
        
        # Multi-stage JSON extraction approach
        
        # Stage 1: Try to find JSON using regex
        json_pattern = r'\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\}))*\}))*\}'
        json_matches = re.findall(json_pattern, response_text)
        
        if json_matches:
            # Sort by length to prioritize larger JSON objects, which are more likely to be our target
            json_matches.sort(key=len, reverse=True)
            
            for json_str in json_matches:
                try:
                    # Try to parse each potential JSON string
                    result = json.loads(json_str)
                    # If it contains product_found field, it's likely our target
                    if "product_found" in result:
                        if self.verbose:
                            print(f"✓ Successfully extracted JSON with product_found field")
                        return result
                    # If it has product_name, create a compatible structure
                    elif "product_name" in result:
                        if self.verbose:
                            print(f"✓ Found JSON with product_name but no product_found field, adding it")
                        result["product_found"] = True
                        return result
                except json.JSONDecodeError:
                    # Continue to next match if this one failed
                    continue
        
        # Stage 2: Look for JSON fields with regex
        product_found_match = re.search(r'"product_found"\s*:\s*(true|false)', response_text, re.IGNORECASE)
        product_name_match = re.search(r'"product_name"\s*:\s*"([^"]+)"', response_text)
        product_desc_match = re.search(r'"product_description"\s*:\s*"([^"]+)"', response_text)
        
        if product_name_match:
            # We found at least the product name, construct a result
            if self.verbose:
                print(f"⚠️ Constructing JSON from regex matches (no valid JSON object found)")
            
            # Default to true if product name is found but product_found field isn't
            product_found = True
            if product_found_match:
                product_found = product_found_match.group(1).lower() == "true"
            
            result = {
                "product_found": product_found,
                "product_name": product_name_match.group(1)
            }
            
            # Add description if found
            if product_desc_match:
                result["product_description"] = product_desc_match.group(1)
            else:
                result["product_description"] = ""
                
            return result
        
        # Stage 3: Full text analysis for product information
        if "new product" in response_text.lower() or "product announcement" in response_text.lower():
            # Attempt to extract product name from the response
            # Look for patterns like "Product X is..." or "announced Product X"
            product_patterns = [
                r'announced ([A-Z][A-Za-z0-9\s\-]+)(?:,|\.|;)',
                r'launched ([A-Z][A-Za-z0-9\s\-]+)(?:,|\.|;)',
                r'released ([A-Z][A-Za-z0-9\s\-]+)(?:,|\.|;)',
                r'new product(?:\s+called)?\s+([A-Z][A-Za-z0-9\s\-]+)(?:,|\.|;)',
                r'new ([A-Z][A-Za-z0-9\s\-]+)(?:\s+platform|\s+service|\s+solution|\s+product)(?:,|\.)'
            ]
            
            for pattern in product_patterns:
                match = re.search(pattern, response_text)
                if match:
                    product_name = match.group(1).strip()
                    if len(product_name) > 2 and len(product_name) < 100:  # Sanity check on length
                        if self.verbose:
                            print(f"⚠️ Extracted product name from text analysis: {product_name}")
                        return {
                            "product_found": True,
                            "product_name": product_name,
                            "product_description": "Description not available"
                        }
        
        # If everything failed, assume no product was found
        if self.verbose:
            print("❌ Failed to extract valid product JSON from response")
        return {"product_found": False}
    
    def validate_product(self, product_info: Dict[str, Any], full_text: str, company_name: str = "") -> bool:
        """Enhanced validation to catch valid products and filter out competitor products"""
        import re
        
        # Get product name
        product_name = product_info.get("product_name", "")
        if not product_name or len(product_name) < 2:
            if self.verbose:
                print(f"❌ Validation failed: Empty or too short product name")
            return False
        
        # Known competitor products to explicitly filter out
        known_competitor_products = [
            "apple vision pro", "iphone", "airpods", "macbook", "apple watch",
            "galaxy", "pixel", "surface", "playstation", "xbox", "nest", "alexa"
        ]
        
        # Check if this is a known competitor product
        for comp_product in known_competitor_products:
            if comp_product in product_name.lower():
                # Only reject if the company name doesn't match
                if "apple" not in company_name.lower() and (
                    "vision pro" in product_name.lower() or "iphone" in product_name.lower()
                ):
                    if self.verbose:
                        print(f"❌ Filtered out competitor product: '{product_name}' (belongs to Apple, not {company_name})")
                    return False
                if "samsung" not in company_name.lower() and "galaxy" in product_name.lower():
                    if self.verbose:
                        print(f"❌ Filtered out competitor product: '{product_name}' (belongs to Samsung, not {company_name})")
                    return False
                if "microsoft" not in company_name.lower() and "surface" in product_name.lower():
                    if self.verbose:
                        print(f"❌ Filtered out competitor product: '{product_name}' (belongs to Microsoft, not {company_name})")
                    return False
        
        # Filter out known false positives (keep this strict)
        known_false_positives = [
            "xbrl viewer", "edgar", "inline xbrl", "sec reporting", "sec filing",
            "financial statement", "quarterly report", "annual report", 
            "10-q", "10-k", "8-k", "form 8-k", "press release",
            "fiscal quarter", "fiscal year", "earnings release"
        ]
        
        if any(fp.lower() in product_name.lower() for fp in known_false_positives):
            if self.verbose:
                print(f"❌ Filtered out known false positive: '{product_name}'")
            return False
        
        # Financial terms check (can be less strict if needed)
        financial_terms = ["stock", "share", "bond", "note", "security", "securities", 
                           "dividend", "credit facility", "loan", "debt"]
        if any(term in product_name.lower() for term in financial_terms):
            if self.verbose:
                print(f"❌ Filtered out financial term: '{product_name}'")
            return False
        
        # OWNERSHIP CHECK: Stronger verification that this is the company's product
        
        # First, find instances of the product name in text
        context_windows = []
        
        # Prepare company name variations for checking
        company_terms = []
        if company_name:
            # Add the full company name
            company_terms.append(company_name.lower())
            
            # Add shortened versions if applicable (e.g., "Apple Inc." -> "Apple")
            parts = company_name.split()
            if len(parts) > 1:
                company_terms.append(parts[0].lower())
            
            # Remove common corporate suffixes for matching
            for suffix in [" inc", " corp", " corporation", " llc", " ltd", " co", " company"]:
                if company_name.lower().endswith(suffix):
                    company_terms.append(company_name.lower().replace(suffix, ""))
        
        # Create regex pattern for product name with word boundaries
        # Handle special characters in product names
        clean_product_name = re.escape(product_name.lower())
        pattern = r'\b' + clean_product_name + r'\b'
        
        try:
            for match in re.finditer(pattern, full_text.lower()):
                # Get 300 characters before and after the match
                start = max(0, match.start() - 300)
                end = min(len(full_text), match.end() + 300)
                context = full_text[start:end].lower()
                context_windows.append(context)
        except Exception as e:
            if self.verbose:
                print(f"Error in regex search: {e}")
            # Fallback to simpler search
            if product_name.lower() in full_text.lower():
                index = full_text.lower().find(product_name.lower())
                start = max(0, index - 300)
                end = min(len(full_text), index + len(product_name) + 300)
                context = full_text[start:end].lower()
                context_windows.append(context)
        
        # Ownership indicators - phrases that suggest the company owns the product
        ownership_indicators = [
            "our", "we", "the company", "introduced", "announced", "launched",
            "unveiled", "released", "developed", "created", "built",
            "proud to", "excited to", "pleased to", "happy to",
            "new", "latest", "next generation", "innovative"
        ]
        
        # Stronger competitor indicators - phrases that suggest it's NOT the company's product
        competitor_indicators = [
            "competitor", "competing", "rival", "alternative to",
            "similar to", "compared to", "versus", "vs.", "vs", "like",
            "such as", "other products", "other vendors", "competition"
        ]
        
        # Check each context window for ownership and competitor indicators
        for context in context_windows:
            # Check for company name near product mention
            company_mentioned = any(term in context for term in company_terms)
            
            # Check for ownership indicators
            ownership_found = any(indicator in context for indicator in ownership_indicators)
            
            # Check for competitor indicators
            competitor_indicator_found = any(indicator in context for indicator in competitor_indicators)
            
            if self.verbose:
                print(f"Product: {product_name}")
                print(f"Company mentioned in context: {company_mentioned}")
                print(f"Ownership indicator found: {ownership_found}")
                print(f"Competitor indicator found: {competitor_indicator_found}")
                
            # Strong positive signal: company name AND ownership indicator present
            if company_mentioned and ownership_found and not competitor_indicator_found:
                if self.verbose:
                    print(f"✓ Validated product '{product_name}' as belonging to {company_name}")
                return True
                
            # Strong negative signal: competitor indicator present
            if competitor_indicator_found:
                if self.verbose:
                    print(f"❌ Rejected product '{product_name}' due to competitor indicators")
                return False
        
        # If we've checked all contexts without a clear result, make a final decision
        # Default to more permissive validation to catch more products
        if len(context_windows) > 0:
            if self.verbose:
                print(f"⚠️ No strong ownership signals for '{product_name}' but mentioned in filing")
            return True
        
        # If no mentions found at all, reject
        if self.verbose:
            print(f"❌ Product '{product_name}' not clearly found in filing text")
        return False
    
    def query_llm_for_product(self, text: str, filing_date: str, company_name: str, ticker: str) -> Dict[str, Any]:
        """Enhanced query with improved prompt for better product detection"""
        import json
        import os
        
        # Add industry context based on company/ticker
        industry_context = self.add_industry_context(company_name, ticker)
        
        # Improved prompt with explicit instructions and examples for the specific company
        prompt = f"""
        Analyze this 8-K filing from {company_name} ({ticker}) filed on {filing_date}.

        TASK: Identify if there are any NEW PRODUCT or SERVICE ANNOUNCEMENTS BY {company_name} ITSELF in the text. Keep looking until you find new 
        product name.

        IMPORTANT ABOUT {company_name.upper()}: 
        1. ONLY extract products that are CLEARLY owned by {company_name} 
        2. DO NOT extract competitor products that are merely mentioned
        3. The product must be specifically ANNOUNCED or LAUNCHED in this filing
        4. {industry_context}

        OWNERSHIP IDENTIFICATION - Look for these indicators:
        - "our new product/solution/platform/service"
        - "we are launching/introducing/announcing"
        - "{company_name} today announced its new..."
        - "developed/created/built by {company_name}"
        - "the company is releasing/unveiling"

        EXAMPLES OF GOOD PRODUCT ANNOUNCEMENTS (For {company_name}):
        - "Today we announced [Product Name], our new enterprise platform"
        - "{company_name} is proud to introduce [Product Name]"
        - "We are excited to launch our newest offering, [Product Name]"
        - "The Company released [Product Name], a new solution for..."

        EXAMPLES OF WHAT NOT TO INCLUDE:
        - Competitor products that are merely mentioned
        - Products where ownership is unclear
        - Vague references to "new products" without specific names
        - SEC filing components like XBRL viewers or EDGAR systems
        - Discussion of existing products without announcing anything new
        - Financial instruments like bonds, shares, notes, or credit facilities

        If you find a new product announcement, extract:
        1. The EXACT product/service name as mentioned in the text (be precise and specific)
        2. A concise description of what it does (1-2 sentences)

        If no new products are announced, respond with exactly "NO_PRODUCT_FOUND".

        If a new product is found, respond with ONLY this JSON structure:
        {{
            "product_found": true,
            "product_name": "Exact name of the product",
            "product_description": "Brief description of what it does/is (max 180 chars)"
        }}
        """
        
        # System prompt to help the LLM stay on task
        system_prompt = f"""You are a specialized financial analyst who extracts product announcements from SEC 8-K filings.
        You are an expert at identifying when companies announce THEIR OWN new products/services.
        You can distinguish between a company's own products vs competitor products they mention.
        You understand {company_name}'s business and its industry.
        You avoid extracting any competitor products.
        You provide structured JSON output exactly in the requested format.
        When in doubt about whether something is a product, be conservative and respond with "NO_PRODUCT_FOUND"."""
        
        # Define chunk_size if not already defined
        chunk_size = getattr(self, 'chunk_size', 8000)
        
        # Query the LLM
        if self.verbose:
            print(f"Querying LLM with enhanced product detection prompt...")
        
        # Try with multiple temperature settings if needed
        result = self.query(prompt + "\n\nHere is the 8-K text:\n" + text[:chunk_size], system_prompt)
        
        if result["status"] == "success":
            content = result["content"]
            
            # Process the LLM response
            parsed_result = self.extract_json_from_llm_response(content)
            
            # For debugging - save raw response to a log file if log_dir is specified
            if self.log_dir:
                os.makedirs(self.log_dir, exist_ok=True)
                log_filename = f"{self.log_dir}/llm_response_{ticker}_{filing_date.replace('-', '')}.json"
                with open(log_filename, "w", encoding='utf-8') as log_file:
                    log_data = {
                        "ticker": ticker,
                        "filing_date": filing_date,
                        "prompt": prompt,
                        "response": content,
                        "parsed_result": parsed_result
                    }
                    json.dump(log_data, log_file, indent=2)
            
            # Return the parsed result
            return parsed_result
        else:
            return {"product_found": False, "error": result.get("error", "Unknown error")}
            
    def extract_multiple_products(self, text: str, filing_date: str, company_name: str, ticker: str, max_products: int = 3) -> List[Dict[str, Any]]:
        """Extract multiple products from a single filing"""
        products = []
        
        # Define chunk_size if not already defined
        chunk_size = getattr(self, 'chunk_size', 6000)
        
        # First attempt: try to get the main product
        main_product_info = self.query_llm_for_product(text, filing_date, company_name, ticker)
        
        if main_product_info.get("product_found", False):
            if self.validate_product(main_product_info, text, company_name):
                products.append(main_product_info)
        
        # If we want more than one product and found the first one, try to find additional ones
        if len(products) > 0 and max_products > 1:
            # Create a prompt specifically asking for additional products
            additional_prompt = f"""
            Analyze this 8-K filing from {company_name} ({ticker}) and identify ANY ADDITIONAL product 
            announcements DIFFERENT from: {main_product_info.get('product_name', '')}
            
            Focus ONLY on finding different products that were also announced in this filing.
            
            If you find another distinct product, respond with:
            {{
                "product_found": true,
                "product_name": "Name of the additional product",
                "product_description": "Brief description of what it does"
            }}
            
            If no additional products are found, respond with "NO_ADDITIONAL_PRODUCTS".
            """
            
            # Try to find additional products until we hit max_products
            attempts = 0
            max_attempts = max_products + 1  # Allow for some failures
            
            while len(products) < max_products and attempts < max_attempts:
                attempts += 1
                
                # Query for additional product
                result = self.query(additional_prompt + "\n\nHere is the 8-K text:\n" + text[:chunk_size])
                
                if result["status"] == "success":
                    content = result["content"]
                    
                    if "NO_ADDITIONAL_PRODUCTS" in content:
                        break
                    
                    # Parse response
                    additional_product = self.extract_json_from_llm_response(content)
                    
                    if additional_product.get("product_found", False):
                        # Make sure it's not a duplicate of already found products
                        new_product_name = additional_product.get("product_name", "").lower()
                        if not any(p.get("product_name", "").lower() == new_product_name for p in products):
                            # Validate the new product
                            if self.validate_product(additional_product, text, company_name):
                                products.append(additional_product)
                                # Update the prompt to exclude this product too
                                additional_prompt += f" and different from: {new_product_name}"
                else:
                    # Error with LLM, stop trying
                    break
        
        return products

    def add_industry_context(self, company_name: str, ticker: str) -> str:
        """Add industry-specific context to improve product recognition"""
        
        # Define industry mappings based on common tickers or keywords
        tech_companies = {"AAPL", "MSFT", "GOOGL", "AMZN", "META", "NVDA", "IBM"}
        healthcare_companies = {"JNJ", "PFE", "MRK", "UNH", "ABBV", "LLY"}
        finance_companies = {"JPM", "BAC", "WFC", "C", "GS", "MS", "V", "MA"}
        consumer_companies = {"PG", "KO", "PEP", "MCD", "NKE", "SBUX"}
        industrial_companies = {"GE", "HON", "MMM", "CAT", "DE", "BA"}
        energy_companies = {"XOM", "CVX", "COP", "SLB", "EOG"}
        
        industry_context = ""
        
        if ticker.upper() in tech_companies or any(kw in company_name.lower() for kw in ["tech", "software", "electronics"]):
            industry_context = """
            In the technology industry, products can include:
            - Software applications, platforms, or services
            - Hardware devices or components
            - Cloud services or infrastructure
            - Cybersecurity solutions
            - AI/ML technologies or models
            - Data analytics platforms
            - Consumer electronics
            - Enterprise IT solutions
            """
        elif ticker.upper() in healthcare_companies or any(kw in company_name.lower() for kw in ["health", "pharma", "medical", "bio"]):
            industry_context = """
            In the healthcare industry, products can include:
            - Pharmaceutical drugs or treatments
            - Medical devices or equipment
            - Diagnostic tools or tests
            - Healthcare IT platforms
            - Telehealth solutions
            - Patient monitoring systems
            - Wellness or health management products
            """
        elif ticker.upper() in finance_companies or any(kw in company_name.lower() for kw in ["bank", "financial", "invest", "capital"]):
            industry_context = """
            In the financial industry, products can include:
            - Digital banking platforms
            - Payment processing systems
            - Investment or trading platforms
            - Wealth management tools
            - Financial planning software
            - Risk management systems
            - Lending or credit products (beyond basic loans)
            """
        elif ticker.upper() in consumer_companies or any(kw in company_name.lower() for kw in ["consumer", "retail", "food", "beverage"]):
            industry_context = """
            In the consumer goods industry, products can include:
            - Food and beverage items
            - Personal care products
            - Household goods
            - Apparel or footwear
            - Consumer electronics
            - Entertainment products
            - Subscription services
            """
        elif ticker.upper() in industrial_companies or any(kw in company_name.lower() for kw in ["industrial", "manufacturing", "machine"]):
            industry_context = """
            In the industrial sector, products can include:
            - Manufacturing equipment
            - Industrial machinery
            - Automation systems
            - Industrial components
            - Maintenance systems
            - Monitoring solutions
            - Safety equipment
            """
        elif ticker.upper() in energy_companies or any(kw in company_name.lower() for kw in ["energy", "oil", "gas", "power"]):
            industry_context = """
            In the energy sector, products can include:
            - Energy production systems
            - Resource extraction equipment
            - Energy management solutions
            - Renewable energy technologies
            - Power distribution systems
            - Efficiency optimization tools
            """
        else:
            # Generic context for other industries
            industry_context = """
            Products can take many forms depending on the industry:
            - Physical goods or merchandise
            - Software or digital services
            - Platforms or solutions
            - Tools or equipment
            - Systems or frameworks
            """
        
        return industry_context

def is_tech_company(company_name: str, ticker: str) -> bool:
    """Check if a company is likely a technology company based on name and ticker.

    The ticker is checked first against a set of well-known tech tickers
    (case-insensitive). Otherwise the lower-cased company name is scanned for
    tech keywords. Keywords of three characters or fewer (currently "ai") must
    appear as whole words, because as raw substrings they hit unrelated names
    such as "Retail" or "Air Lines"; longer keywords still match as substrings
    so that e.g. "tech" matches "Technologies" and "Biotech".

    Args:
        company_name: Company name; ``None`` or empty is treated as no name.
        ticker: Stock ticker; ``None`` or empty is treated as no ticker.

    Returns:
        True if the ticker or the name indicates a technology company.
    """
    tech_keywords = [
        "tech", "software", "hardware", "semiconductor", "electronics", "digital", 
        "cyber", "cloud", "data", "ai", "artificial intelligence", "robot", 
        # ... rest of the keywords
    ]
    
    # List of well-known tech company tickers
    tech_tickers = {
        "AAPL", "MSFT", "GOOGL", "GOOG", "AMZN", "META", "NVDA", "TSLA", 
        # ... rest of the tickers
    }
    
    # Check ticker first (most reliable)
    if (ticker or "").upper() in tech_tickers:
        return True
    
    # Check company name for tech keywords
    name = (company_name or "").lower()
    for keyword in tech_keywords:
        if len(keyword) <= 3:
            # Very short keywords are too ambiguous for substring matching.
            if re.search(r'\b' + re.escape(keyword) + r'\b', name):
                return True
        elif keyword in name:
            return True
        
    return False

class JSONProductExtractor:
    """Class to extract product announcements from JSON 8-K filings"""
    
    def __init__(self, llm_client: LLMClient, chunk_size: int = 6000, log_dir: str = None, verbose: bool = False):
        self.llm_client = llm_client
        self.chunk_size = chunk_size
        self.log_dir = log_dir
        self.verbose = verbose
        
        # Also set the log_dir for the LLM client
        self.llm_client.log_dir = log_dir
    
    def extract_text_from_json(self, json_data: Dict) -> str:
        """Enhanced method to extract relevant text content from JSON 8-K data"""
        import json
        import re
        from html.parser import HTMLParser
        
        extracted_text = []
        total_chars = 0
        
        # Add company name and filing date info
        company_name = json_data.get("companyName", "")
        if company_name:
            extracted_text.append(f"COMPANY: {company_name}")
        
        filing_date = json_data.get("filingDate", "")
        if filing_date:
            extracted_text.append(f"FILING DATE: {filing_date}")
        
        # Display debug info about top-level keys if verbose
        if self.verbose:
            print(f"Top-level JSON keys: {list(json_data.keys())}")
        
        # APPROACH 1: Try standard structured extraction first
        
        # Extract data from filingDetails if available
        if "filingDetails" in json_data and json_data["filingDetails"]:
            filing_details = json_data["filingDetails"]
            
            # Get form information
            form_info = filing_details.get("formInfo", {})
            form_desc = form_info.get("formDesc", "")
            if form_desc:
                extracted_text.append(f"FORM: {form_desc}")
            
            # Get document information and look for exhibits
            documents = filing_details.get("documents", [])
            
            for doc in documents:
                doc_type = doc.get("documentType", "")
                doc_desc = doc.get("documentDescription", "")
                doc_text = doc.get("text", "")
                
                # Focus on exhibits that might contain press releases
                is_exhibit = doc_type.startswith("EX") or "EXHIBIT" in doc_type.upper()
                
                if is_exhibit and doc_text:
                    extracted_text.append(f"\nEXHIBIT {doc_type}: {doc_desc}")
                    extracted_text.append(doc_text)
                    total_chars += len(doc_text)
                elif doc_type == "8-K" and doc_text:
                    extracted_text.append(f"\n8-K MAIN DOCUMENT:")
                    extracted_text.append(doc_text)
                    total_chars += len(doc_text)
        
        # Look for press release content specifically
        if "pressReleases" in json_data:
            press_releases = json_data.get("pressReleases", [])
            for pr_idx, pr in enumerate(press_releases):
                pr_title = pr.get("title", f"Press Release {pr_idx+1}")
                pr_text = pr.get("text", "")
                if pr_text:
                    extracted_text.append(f"\nPRESS RELEASE: {pr_title}")
                    extracted_text.append(pr_text)
                    total_chars += len(pr_text)
        
        # APPROACH 2: Try alternative common structures for SEC filings
        
        # Check for 'content' at top level (some providers use this)
        if "content" in json_data and isinstance(json_data["content"], str) and len(json_data["content"]) > 100:
            extracted_text.append("\nCONTENT:")
            extracted_text.append(json_data["content"])
            total_chars += len(json_data["content"])
        
        # Check for 'text' at top level
        if "text" in json_data and isinstance(json_data["text"], str) and len(json_data["text"]) > 100:
            extracted_text.append("\nTEXT:")
            extracted_text.append(json_data["text"])
            total_chars += len(json_data["text"])
        
        # Check for 'filing' field which sometimes contains the full filing
        if "filing" in json_data and isinstance(json_data["filing"], str) and len(json_data["filing"]) > 100:
            extracted_text.append("\nFILING TEXT:")
            extracted_text.append(json_data["filing"])
            total_chars += len(json_data["filing"])
        
        # Check for 'htmlDocument' which sometimes contains the full HTML filing
        if "htmlDocument" in json_data and isinstance(json_data["htmlDocument"], str) and len(json_data["htmlDocument"]) > 100:
            # Strip HTML tags for better text extraction
            class MLStripper(HTMLParser):
                def __init__(self):
                    super().__init__()
                    self.reset()
                    self.strict = False
                    self.convert_charrefs = True
                    self.text = []
                
                def handle_data(self, d):
                    self.text.append(d)
                
                def get_data(self):
                    return ''.join(self.text)
            
            def strip_tags(html):
                s = MLStripper()
                s.feed(html)
                return s.get_data()
            
            html_text = strip_tags(json_data["htmlDocument"])
            if len(html_text) > 100:
                extracted_text.append("\nHTML DOCUMENT TEXT:")
                extracted_text.append(html_text)
                total_chars += len(html_text)
        
        # APPROACH 3: Look for 'items' which might contain the 8-K items
        if "items" in json_data and isinstance(json_data["items"], list):
            for item in json_data["items"]:
                if isinstance(item, dict):
                    item_number = item.get("itemNumber", "Unknown")
                    item_text = item.get("itemText", "")
                    if item_text and len(item_text) > 100:
                        extracted_text.append(f"\nITEM {item_number}:")
                        extracted_text.append(item_text)
                        total_chars += len(item_text)
        
        # APPROACH 4: Deep recursive search for substantial text fields
        if total_chars < 1000:  # Only do this if we haven't found much text yet
            deep_text = self.deep_search_for_text(json_data)
            if deep_text:
                extracted_text.append("\nADDITIONAL TEXT:")
                extracted_text.append(deep_text)
                total_chars += len(deep_text)
        
        # If we still don't have enough text, try a last-ditch raw search
        if total_chars < 1000:
            json_str = json.dumps(json_data)
            large_text_pattern = r'"([^"]{100,})"'  # Find any JSON string values > 100 chars
            large_texts = re.findall(large_text_pattern, json_str)
            
            if large_texts:
                extracted_text.append("\nEXTRACTED LARGE TEXT FIELDS:")
                for text in large_texts[:10]:  # Limit to first 10 to avoid excessive output
                    extracted_text.append(text)
                    total_chars += len(text)
        
        # Debug info
        if self.verbose or total_chars < 1000:
            print(f"Total extracted text: {total_chars} characters from {len(extracted_text)} sections")
            if total_chars < 1000:
                print("WARNING: Limited text extracted, filing may have unusual structure")
        
        # Join all extracted text
        result = "\n\n".join(extracted_text)
        return result
    
    def deep_search_for_text(self, data, max_depth=5, current_depth=0, min_length=100) -> str:
        """Recursively search for substantial text content in complex JSON structures"""
        if current_depth > max_depth:
            return ""
        
        found_texts = []
        
        if isinstance(data, dict):
            for key, value in data.items():
                # Check if this value is a substantial text
                if isinstance(value, str) and len(value) > min_length:
                    found_texts.append(value)
                # Recursively search nested structures
                elif isinstance(value, (dict, list)):
                    nested_text = self.deep_search_for_text(value, max_depth, current_depth + 1, min_length)
                    if nested_text:
                        found_texts.append(nested_text)
        
        elif isinstance(data, list):
            for item in data:
                if isinstance(item, str) and len(item) > min_length:
                    found_texts.append(item)
                elif isinstance(item, (dict, list)):
                    nested_text = self.deep_search_for_text(item, max_depth, current_depth + 1, min_length)
                    if nested_text:
                        found_texts.append(nested_text)
        
        return "\n\n".join(found_texts)
    
    def get_relevant_sections(self, text: str, max_chars: int = None) -> str:
        """
        Enhanced section detection that targets product announcements in 8-K filings
        """
        import re
        
        if not max_chars:
            max_chars = self.chunk_size
        
        if self.verbose:
            print(f"Finding relevant sections in {len(text)} characters of text...")
        
        # Prioritize sections that commonly contain product info
        key_sections = []
        
        # Look for press release sections first
        press_release_pattern = r'(?i)(PRESS\s+RELEASE|NEWS\s+RELEASE).*?(?=ITEM|\Z)'
        pr_matches = re.findall(press_release_pattern, text, re.DOTALL)
        for match in pr_matches:
            key_sections.append(("PRESS RELEASE", match))
        
        # Look for Exhibit sections
        exhibit_pattern = r'(?i)(EXHIBIT\s+\d+\.\d+).*?(?=EXHIBIT|\Z)'
        ex_matches = re.findall(exhibit_pattern, text, re.DOTALL)
        for match in ex_matches:
            key_sections.append(("EXHIBIT", match))
        
        # Look for Form 8-K Item 7.01 (Regulation FD Disclosure) and 8.01 (Other Events)
        # These often contain material news
        item_pattern = r'(?i)(ITEM\s+7\.01|ITEM\s+8\.01).*?(?=ITEM|\Z)'
        item_matches = re.findall(item_pattern, text, re.DOTALL)
        for match in item_matches:
            key_sections.append(("ITEM", match))
        
        # Look for product-related keywords in any section
        product_keywords = [
            "new product", "launch", "introduce", "unveil", "announce", "release",
            "new service", "new platform", "new solution", "new offering",
            "next generation", "latest version"
        ]
        
        # Combine sections up to max_chars
        combined_text = ""
        added_sections = set()
        
        # First add sections with product keywords
        for keyword in product_keywords:
            if len(combined_text) >= max_chars * 0.8:  # Leave some room for other sections
                break
                
            # Find chunks of text containing the keyword
            positions = []
            for match in re.finditer(r'(?i)\b' + re.escape(keyword) + r'\b', text):
                start = max(0, match.start() - 1000)
                end = min(len(text), match.end() + 1000)
                positions.append((start, end))
            
            # Merge overlapping chunks
            if positions:
                positions.sort()
                merged = [positions[0]]
                for current in positions[1:]:
                    prev_start, prev_end = merged[-1]
                    curr_start, curr_end = current
                    if curr_start <= prev_end:
                        # Merge overlapping chunks
                        merged[-1] = (prev_start, max(prev_end, curr_end))
                    else:
                        merged.append(current)
                
                # Add merged chunks to combined text
                for start, end in merged:
                    chunk_id = f"{start}-{end}"
                    if chunk_id not in added_sections and len(combined_text) + (end - start) <= max_chars:
                        chunk = text[start:end]
                        combined_text += f"\n\n--- SECTION WITH KEYWORD '{keyword}' ---\n\n{chunk}\n\n"
                        added_sections.add(chunk_id)
        
        # Then add the prioritized sections
        for section_type, section in key_sections:
            section_id = hash(section[:100] + section[-100:])  # Use first and last 100 chars as ID
            if str(section_id) not in added_sections and len(combined_text) + len(section) <= max_chars:
                combined_text += f"\n\n--- {section_type} SECTION ---\n\n{section}\n\n"
                added_sections.add(str(section_id))
        
        # If no sections found or combined text is too small, use beginning of text plus samples
        if len(combined_text) < 2000:  # Minimum threshold for useful text
            if len(text) <= max_chars:
                # If text is small enough, use all of it
                combined_text = text
            else:
                # Create a hybrid approach: beginning + samples from throughout
                beginning = text[:int(max_chars * 0.6)]  # First 60% of allowed chars
                
                # Sample from different parts of the document
                remaining_chars = max_chars - len(beginning)
                
                # Take 3 samples from different parts of the document
                doc_length = len(text)
                sample_size = remaining_chars // 3
                
                middle_start = doc_length // 3
                middle_sample = text[middle_start:middle_start + sample_size]
                
                later_start = (doc_length * 2) // 3
                later_sample = text[later_start:later_start + sample_size]
                
                # Combine everything
                combined_text = f"{beginning}\n\n--- SAMPLE FROM MIDDLE ---\n\n{middle_sample}\n\n--- SAMPLE FROM LATER PART ---\n\n{later_sample}"
        
        if self.verbose:
            print(f"Found {len(combined_text)} characters of relevant text to analyze")
        
        return combined_text
    
    def validate_product(self, product_info: Dict[str, Any], full_text: str, company_name: str = "") -> bool:
        """Less strict validation to catch more valid products"""
        import re
        
        # Get product name
        product_name = product_info.get("product_name", "")
        if not product_name or len(product_name) < 2:
            return False
                
        # Filter out known false positives (keep this strict)
        known_false_positives = [
            "xbrl viewer", "edgar", "inline xbrl", "sec reporting", "sec filing",
            "financial statement", "quarterly report", "annual report", 
            "10-q", "10-k", "8-k", "form 8-k"
        ]
        
        if any(fp.lower() in product_name.lower() for fp in known_false_positives):
            print(f"⚠️ Filtered out known false positive: '{product_name}'")
            return False
        
        # Financial terms check (can be less strict if needed)
        financial_terms = ["stock", "share", "bond", "note", "security", "securities"]
        if any(term in product_name.lower() for term in financial_terms):
            return False
        
        # RELAXED: Verify the product name using fuzzy matching and word presence
        # This helps with minor variations in how the product is mentioned
        product_words = product_name.lower().split()
        significant_words = [w for w in product_words if len(w) > 3 and w not in ['the', 'and', 'for', 'with', 'from']]
        
        # If we have significant words to check
        if significant_words:
            # Check if most significant words appear somewhere in the text
            matches = sum(1 for word in significant_words if word in full_text.lower())
            match_ratio = matches / len(significant_words)
            
            # If most words match (>60%), consider it a match
            if match_ratio >= 0.6:
                # Do a more thorough check for company ownership if company_name provided
                if company_name:
                    # Look for ownership indicators within 500 chars of any significant word
                    for word in significant_words:
                        for match in re.finditer(r'\b' + re.escape(word) + r'\b', full_text.lower()):
                            match_start = max(0, match.start() - 500)
                            match_end = min(len(full_text), match.end() + 500)
                            context = full_text[match_start:match_end].lower()
                            
                            # EXPANDED ownership phrases to catch more cases
                            ownership_phrases = [
                                "our", "we", "company", 
                                "announced", "introducing", "launching", "unveiled", "released",
                                "proud to", "excited to", "pleased to", "happy to", 
                                "new", "latest", "next generation", "innovative",
                                company_name.lower(), "today", "introducing"
                            ]
                            
                            if any(phrase in context for phrase in ownership_phrases):
                                return True
                    
                    # If we've checked all words and contexts without finding ownership, reject
                    return False
                else:
                    # If no company name provided, just rely on the word matching
                    return True
        
        # Fall back to exact name check from original validation
        return product_name.lower() in full_text.lower()
    
    def process_filing(self, json_data: Dict, filing_meta: Dict[str, str]) -> Optional[Dict[str, str]]:
        """Process a single 8-K filing in JSON format and extract product information with multiple fallback approaches"""
        import re
        from typing import Optional, Dict, Any, List
        
        ticker = filing_meta.get("Ticker", "")
        filing_date = filing_meta.get("Filing Date", "")
        company_name = filing_meta.get("Company Name", ticker)  # Use ticker as fallback

        try:
            # Always show these steps regardless of verbose setting
            print(f"🔍 Analyzing filing for {ticker} ({filing_date})...")
        
            # Extract relevant text from JSON
            extracted_text = self.extract_text_from_json(json_data)
        
            # Report on data quality
            if len(extracted_text) > 1000:
                print(f"📄 Extracted {len(extracted_text)} characters of text for analysis")
            else:
                print(f"⚠️ Limited text extracted ({len(extracted_text)} characters). Results may be affected.")
        
            # Get most relevant sections using the enhanced method
            relevant_sections = self.get_relevant_sections(extracted_text)
        
            # Query LLM for product information
            print("🤖 Querying LLM to identify products...")
            
            # APPROACH 1: Standard product detection
            product_info = self.llm_client.query_llm_for_product(relevant_sections, filing_date, company_name, ticker)

            # Check if product was found
            if product_info.get("product_found", False):
                # Add validation here:
                if self.llm_client.validate_product(product_info, extracted_text, company_name):
                    product_name = product_info.get("product_name", "Unknown")
                    product_description = product_info.get("product_description", "")
        
                    # Always show when product is found, regardless of verbose setting
                    print(f"✓ Found product: {product_name} - By {company_name}")
        
                    # Return the product information
                    return {
                        "company_name": company_name,
                        "stock_name": ticker,
                        "filing_time": filing_date,
                        "new_product": product_name,
                        "product_description": product_description
                    }
                else:
                    print("❌ Product validation failed - likely false positive or competitor product")
            else:
                # APPROACH 2: Try with a simpler prompt and higher temperature
                print("🔄 First attempt found no products. Trying with alternative approach...")
                
                # Simplified prompt that focuses just on finding product names
                alt_prompt = f"""
                Analyze this 8-K filing from {company_name} ({ticker}) filed on {filing_date}.
                
                TASK: Find any NEW PRODUCTS or SERVICES announced by {company_name} in this filing.
                
                Focus on looking for:
                - Sentences containing "announced", "introduced", "launched", "unveiled", "released"
                - New products, services, platforms, or solutions created by {company_name}
                - Look especially in press release sections
                
                If you find ANY product announcement, respond with JSON:
                {{
                    "product_found": true,
                    "product_name": "Exact name of the product",
                    "product_description": "Brief description"
                }}
                
                If no products found, respond with exactly "NO_PRODUCT_FOUND".
                """
                
                # Use higher temperature for more exploratory results
                result = self.llm_client.query(alt_prompt + "\n\n" + relevant_sections, temperature=0.7)
                
                if result["status"] == "success":
                    content = result["content"]
                    alt_product_info = self.llm_client.extract_json_from_llm_response(content)
                    
                    if alt_product_info.get("product_found", False):
                        if self.llm_client.validate_product(alt_product_info, extracted_text, company_name):
                            product_name = alt_product_info.get("product_name", "Unknown")
                            product_description = alt_product_info.get("product_description", "")
                            
                            print(f"✓ Second attempt found product: {product_name} - By {company_name}")
                            
                            return {
                                "company_name": company_name,
                                "stock_name": ticker,
                                "filing_time": filing_date,
                                "new_product": product_name,
                                "product_description": product_description
                            }
                        else:
                            print("❌ Second attempt product validation failed")
                    else:
                        # APPROACH 3: Try pattern matching as last resort
                        print("🔍 Trying direct pattern matching for product mentions...")
                        
                        # Simple pattern matching for common product announcement structures
                        patterns = [
                            rf'(?i){re.escape(company_name)}(?:.*?)(?:announced|introduced|launched|unveiled) (?:its|their|the|our)?\s+new\s+([A-Z][A-Za-z0-9\s\-]+?)(?:,|\.)',
                            r'(?i)(?:we|our company)(?:.*?)(?:announced|introduced|launched|unveiled) (?:our|the)?\s+new\s+([A-Z][A-Za-z0-9\s\-]+?)(?:,|\.)',
                            r'(?i)introducing\s+(?:our|the)?\s+new\s+([A-Z][A-Za-z0-9\s\-]+?)(?:,|\.)',
                            r'(?i)new\s+(?:product|platform|solution|service)\s+(?:called|named)?\s+([A-Z][A-Za-z0-9\s\-]+?)(?:,|\.)'
                        ]
                        
                        for pattern in patterns:
                            matches = re.finditer(pattern, extracted_text)
                            for match in matches:
                                product_name = match.group(1).strip()
                                # Validate to prevent common false positives
                                if (product_name and 3 <= len(product_name) <= 50 and 
                                        not any(fp.lower() in product_name.lower() for fp in [
                                            "quarterly report", "annual report", "financial", "exhibit", 
                                            "10-q", "10-k", "8-k", "press release"
                                        ])):
                                    
                                    # Extract a description from context if possible
                                    start = max(0, match.start() - 100)
                                    end = min(len(extracted_text), match.end() + 200)
                                    context = extracted_text[start:end]
                                    
                                    print(f"✓ Found product via pattern matching: {product_name} - By {company_name}")
                                    
                                    return {
                                        "company_name": company_name,
                                        "stock_name": ticker,
                                        "filing_time": filing_date,
                                        "new_product": product_name,
                                        "product_description": f"Product found in context: {context[:180]}..."
                                    }
                
                # No product found in any approach
                print("❌ No products found by any method")
        
            return None
            
        except Exception as e:
            if self.verbose:
                print(f"Error processing {ticker} - {filing_date}: {str(e)}")
            return None

# Function to process a batch of filings in parallel (added)
def process_filing_batch(batch, llm_client, args):
    """Run product extraction over every filing described in ``batch``.

    Args:
        batch: Iterable of filing-metadata mappings. The keys read here are
            "Local File Path", "Ticker" and "Company Name".
        llm_client: Client object handed to ``JSONProductExtractor``.
        args: Parsed options; ``chunk_size`` and ``verbose`` are read.

    Returns:
        A ``(products, handled_paths)`` tuple. ``products`` holds the
        extracted product dicts, each tagged with an ``"is_sp500"`` flag.
        ``handled_paths`` lists the path of every filing visited, including
        ones that were missing, unparseable or raised during extraction.
    """
    extractor = JSONProductExtractor(
        llm_client=llm_client,
        chunk_size=args.chunk_size,
        log_dir=Config.LOG_DIR,
        verbose=args.verbose,
    )

    hits, handled = [], []

    for meta in batch:
        path = meta.get("Local File Path", "")
        symbol = meta.get("Ticker", "unknown")
        in_index = is_sp500_company(symbol, meta.get("Company Name", symbol))

        # Every outcome below counts as "handled", so record the path once
        # up front instead of on each individual exit.
        handled.append(path)

        if not os.path.exists(path):
            continue

        try:
            with open(path, 'r', encoding='utf-8', errors='replace') as handle:
                # Only a parse failure of the file itself is silent; a
                # decode error raised later by the extractor is reported.
                try:
                    document = json.load(handle)
                except json.JSONDecodeError:
                    continue

            extracted = extractor.process_filing(document, meta)
            if extracted:
                extracted["is_sp500"] = in_index
                hits.append(extracted)
        except Exception as exc:
            print(f"Error in worker processing {path}: {str(exc)}")

    return hits, handled

def sort_filings_by_sp500_priority(filings_metadata):
    """Return the filings reordered so S&P 500 companies lead.

    Each filing is classified once with ``is_sp500_company`` using its
    "Ticker" and "Company Name" entries (empty string when absent). The
    relative order inside each group is unchanged, and a one-line count of
    both groups is printed.
    """
    # Index True collects S&P 500 filings, index False collects the rest.
    groups = {True: [], False: []}

    for entry in filings_metadata:
        symbol = entry.get("Ticker", "")
        issuer = entry.get("Company Name", "")
        groups[bool(is_sp500_company(symbol, issuer))].append(entry)

    leading, trailing = groups[True], groups[False]

    print(f"📊 Prioritizing S&P 500 companies: {len(leading)} S&P 500 filings, {len(trailing)} other filings")

    return leading + trailing

def _append_product_row(product_info):
    """Append one extracted product as a row of ``Config.OUTPUT_FILE``."""
    with open(Config.OUTPUT_FILE, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([
            product_info["company_name"],
            product_info["stock_name"],
            product_info["filing_time"],
            product_info["new_product"],
            product_info["product_description"],
            "Yes" if product_info.get("is_sp500", False) else "No",
        ])


def _row_is_sp500(row):
    """Return True when an output-CSV row is flagged ``is_sp500 == "yes"``."""
    # The column may be absent, and csv.DictReader fills columns missing from
    # a short row with None, so normalise to "" before lowercasing.
    return (row.get("is_sp500") or "").lower() == "yes"


def _load_filings_metadata(csv_file="8k_filings.csv"):
    """Return filing metadata from ``csv_file`` or a scan of ``Config.INPUT_DIR``.

    When the CSV exists its rows are returned as dicts. Otherwise every
    ``<ticker>_<YYYYMMDD>_<...>.json`` file name in ``Config.INPUT_DIR`` is
    turned into a minimal metadata dict. A missing input directory yields an
    empty list instead of raising.
    """
    print("📊 Loading filing data...")
    if os.path.exists(csv_file):
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            filings_metadata = list(csv.DictReader(f))
        print(f"✅ Loaded metadata for {len(filings_metadata)} filings from {csv_file}")
        return filings_metadata

    print("📂 No metadata CSV found. Scanning directory...")
    filings_metadata = []
    if not os.path.isdir(Config.INPUT_DIR):
        # Let the caller report "no filings" rather than crash in listdir.
        print(f"⚠️ Input directory not found: {Config.INPUT_DIR}")
        return filings_metadata

    # Sorted so the processing order is reproducible between runs.
    for filename in sorted(os.listdir(Config.INPUT_DIR)):
        if not filename.endswith(".json"):
            continue
        parts = filename.split('_')
        if len(parts) < 3:
            continue
        date_str = parts[1]
        filings_metadata.append({
            "Ticker": parts[0],
            # Format date as YYYY-MM-DD
            "Filing Date": f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}",
            "Local File Path": os.path.join(Config.INPUT_DIR, filename),
        })
    print(f"✅ Found {len(filings_metadata)} filings in directory {Config.INPUT_DIR}")
    return filings_metadata


def main():
    """Main function to process JSON 8-K filings and extract product information.

    Workflow:
        1. Parse options, optionally reset the progress file, and load the
           set of already processed file paths.
        2. Check the LLM connection (and stop here for ``--test-only``).
        3. Load filing metadata, optionally put S&P 500 filings first, and
           queue each not-yet-processed file path once.
        4. Process the queue in batches, either through a thread pool or
           sequentially, appending every product found to
           ``Config.OUTPUT_FILE`` and saving progress after each batch.
        5. Print totals and the ten most recent rows of the output file.

    Processing stops early once ``Config.MAX_PRODUCTS`` new products have
    been found; the check runs after each completed batch.
    """
    # Show welcome banner
    print("\n" + "="*80)
    print("🔍 JSON 8-K PRODUCT FINDER 🔍")
    print("This tool analyzes SEC 8-K JSON filings to identify product announcements")
    print("="*80 + "\n")

    # Parse command line arguments
    args = parse_arguments()

    # Honor the reset flag by discarding saved progress
    if getattr(args, 'reset', False) and os.path.exists(Config.PROGRESS_FILE):
        os.remove(Config.PROGRESS_FILE)
        print("🔄 Reset progress: Starting from scratch")

    # Guarantee args.verbose exists (it is read below and by the workers);
    # default to verbose only inside Jupyter/IPython.
    if not hasattr(args, 'verbose'):
        args.verbose = is_running_in_notebook()

    # Load processing progress to continue from where we left off
    processed_file_paths = load_processing_progress()

    # Initialize LLM client with fixed Chatbox configuration
    print("Connecting to LLM service...")
    llm_client = LLMClient(verbose=args.verbose)

    # Test LLM connection
    if not llm_client.test_connection():
        print("\n❌ FAILED TO CONNECT TO LLM")
        print("Please check that:")
        print(f"1. Chatbox is running at {Config.LLM_API_URL}")
        print(f"2. The {Config.LLM_MODEL} model is installed")
        print("3. You have internet access if pulling the model for the first time")
        return

    # If just testing, exit now
    if args.test_only:
        print("✅ LLM connection test successful. Exiting.")
        return

    # Make sure the directory for raw LLM responses exists
    if Config.LOG_DIR:
        os.makedirs(Config.LOG_DIR, exist_ok=True)
        print(f"📁 Created log directory: {Config.LOG_DIR}")

    filings_metadata = _load_filings_metadata()

    # Check if we have filings to process
    if not filings_metadata:
        print("\n❌ ERROR: No filings found to process.")
        print(f"Check that directory '{Config.INPUT_DIR}' contains 8-K JSON files.")
        return

    # Sort filings to prioritize S&P 500 companies if enabled
    if getattr(args, 'prioritize_sp500', False):
        filings_metadata = sort_filings_by_sp500_priority(filings_metadata)

    # Create output file if it doesn't exist yet
    if not os.path.exists(Config.OUTPUT_FILE):
        with open(Config.OUTPUT_FILE, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(
                ["company_name", "stock_name", "filing_time", "new_product", "product_description", "is_sp500"]
            )
        print(f"📄 Created output file: {Config.OUTPUT_FILE}")

    # Queue each unprocessed file path once. Rows that repeat a path already
    # queued would otherwise be analysed again in this run (and skipped on a
    # resumed run), producing duplicate product rows.
    filings_to_process = []
    queued_paths = set()
    skipped_count = 0
    duplicate_count = 0
    for filing_meta in filings_metadata:
        file_path = filing_meta.get("Local File Path", "")
        if file_path in processed_file_paths:
            skipped_count += 1
        elif file_path in queued_paths:
            duplicate_count += 1
        else:
            queued_paths.add(file_path)
            filings_to_process.append(filing_meta)

    if skipped_count > 0:
        print(f"⏩ Skipping {skipped_count} already processed filings")
    if duplicate_count > 0:
        print(f"⏩ Skipping {duplicate_count} metadata rows that repeat a queued file path")

    # Running totals for this invocation
    products_found = 0
    processed_count = 0
    sp500_processed = 0
    other_processed = 0
    sp500_products = 0
    other_products = 0

    print("\n" + "="*80)
    print(f"Starting to process {len(filings_to_process)} new filings...")
    print("="*80)

    # Prefer the parsed --batch-size / --workers values and fall back to the
    # Config constants. NOTE(review): assumes parse_arguments exposes them as
    # args.batch_size / args.workers (argparse's default dest) - confirm.
    batch_size = max(1, int(getattr(args, 'batch_size', None) or Config.BATCH_SIZE))
    max_workers = max(1, int(getattr(args, 'workers', None) or Config.MAX_WORKERS))

    filing_batches = [
        filings_to_process[i:i+batch_size]
        for i in range(0, len(filings_to_process), batch_size)
    ]

    print(f"Processing {len(filing_batches)} batches of up to {batch_size} filings each")
    print(f"Using {max_workers} parallel workers")

    # Process batches
    for batch_idx, batch in enumerate(filing_batches):
        print(f"\nProcessing batch {batch_idx+1}/{len(filing_batches)} ({len(batch)} filings)")

        if max_workers > 1:
            # Parallel processing: split the batch into sub-batches, one
            # process_filing_batch call per sub-batch.
            result_list = []
            worker_batch_size = max(1, len(batch) // max_workers)
            worker_batches = [
                batch[i:i+worker_batch_size]
                for i in range(0, len(batch), worker_batch_size)
            ]

            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_batch = {
                    executor.submit(process_filing_batch, worker_batch, llm_client, args): i
                    for i, worker_batch in enumerate(worker_batches)
                }

                # Collect results as they complete; a failed sub-batch is
                # reported and its filings stay unprocessed for a later run.
                for future in concurrent.futures.as_completed(future_to_batch):
                    batch_num = future_to_batch[future]
                    try:
                        result_list.append(future.result())
                    except Exception as e:
                        print(f"⚠️ Error in worker batch {batch_num}: {str(e)}")

            # Path -> metadata lookup for the S&P 500 tally below
            meta_by_path = {}
            for filing_meta in batch:
                meta_by_path.setdefault(filing_meta.get("Local File Path", ""), filing_meta)

            # Combine results from all workers
            for batch_products, batch_processed in result_list:
                for product_info in batch_products:
                    _append_product_row(product_info)
                    products_found += 1
                    if product_info.get("is_sp500", False):
                        sp500_products += 1
                    else:
                        other_products += 1

                for file_path in batch_processed:
                    processed_file_paths.add(file_path)
                    filing_meta = meta_by_path.get(file_path)
                    if filing_meta is None:
                        continue
                    if is_sp500_company(filing_meta.get("Ticker", ""), filing_meta.get("Company Name", "")):
                        sp500_processed += 1
                    else:
                        other_processed += 1

                processed_count += len(batch_processed)
        else:
            # Sequential processing (original behavior)
            product_extractor = JSONProductExtractor(
                llm_client=llm_client,
                chunk_size=args.chunk_size,
                log_dir=Config.LOG_DIR,
                verbose=args.verbose
            )

            for filing_meta in batch:
                file_path = filing_meta.get("Local File Path", "")
                ticker = filing_meta.get("Ticker", "")
                company_name = filing_meta.get("Company Name", ticker)

                # Determine if this is an S&P 500 company
                is_sp500 = is_sp500_company(ticker, company_name)
                sp500_label = "[S&P 500]" if is_sp500 else ""

                try:
                    if not os.path.exists(file_path):
                        print(f"Warning: File not found: {file_path}")
                        continue

                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        try:
                            json_data = json.load(f)
                        except json.JSONDecodeError as e:
                            print(f"Error: Could not parse JSON file {file_path}: {str(e)}")
                            continue

                    product_info = product_extractor.process_filing(json_data, filing_meta)

                    # If product found, write to CSV and show notification
                    if product_info:
                        product_info["is_sp500"] = is_sp500
                        _append_product_row(product_info)
                        products_found += 1

                        if is_sp500:
                            sp500_products += 1
                        else:
                            other_products += 1

                        print("\n" + "*"*50)
                        print(f"✅ PRODUCT FOUND #{products_found}: {product_info['new_product']}")
                        print(f"   Company: {product_info['company_name']} {sp500_label}")
                        print(f"   Description: {product_info['product_description']}")
                        print("*"*50 + "\n")
                except Exception as e:
                    print(f"Error processing {file_path}: {str(e)}")
                finally:
                    # Every outcome (missing, unparseable, failed, done) marks
                    # the filing processed and counts it, so the S&P 500 and
                    # other tallies add up to processed_count just as they do
                    # on the parallel path.
                    processed_file_paths.add(file_path)
                    processed_count += 1
                    if is_sp500:
                        sp500_processed += 1
                    else:
                        other_processed += 1

        # Save progress after each batch
        save_processing_progress(processed_file_paths)

        # Show batch stats
        print(f"Batch {batch_idx+1} completed:")
        print(f"- Products found: {products_found} ({sp500_products} from S&P 500)")
        print(f"- Filings processed: {processed_count}/{len(filings_to_process)}")

        # Exit if we've found the maximum number of products
        if products_found >= Config.MAX_PRODUCTS:
            print(f"\nReached maximum of {Config.MAX_PRODUCTS} products. Stopping.")
            break

    # Save final progress (also covers the case of zero batches)
    save_processing_progress(processed_file_paths)

    # Read the output file once; the rows feed both the totals and the summary
    output_rows = []
    if os.path.exists(Config.OUTPUT_FILE):
        try:
            with open(Config.OUTPUT_FILE, 'r', newline='', encoding='utf-8') as f:
                output_rows = list(csv.DictReader(f))
        except (OSError, csv.Error, ValueError) as e:
            print(f"⚠️ Error reading existing products: {str(e)}")

    existing_products = len(output_rows)
    existing_sp500_products = sum(1 for row in output_rows if _row_is_sp500(row))
    existing_other_products = existing_products - existing_sp500_products

    if existing_products > 0:
        print(f"📊 Found {existing_products} products already in output file")

    # Show final results with clear formatting
    print("\n" + "="*80)
    print("📊 PROCESSING RESULTS 📊")
    print("="*80)
    print(f"✅ New filings processed:  {processed_count}/{len(filings_to_process)}")
    print(f"   - S&P 500 companies:   {sp500_processed}")
    print(f"   - Other companies:     {other_processed}")
    print(f"🎯 New products found:     {products_found}")
    print(f"   - From S&P 500:        {sp500_products}")
    print(f"   - From other:          {other_products}")
    print(f"📚 Total products found:   {existing_products}")
    print(f"   - From S&P 500:        {existing_sp500_products}")
    print(f"   - From other:          {existing_other_products}")
    print(f"📄 Results saved to:       {Config.OUTPUT_FILE}")

    # Show product list if any found
    if existing_products > 0:
        print("\n📋 PRODUCT SUMMARY (most recent first):")
        print("-"*70)
        try:
            # Rows are appended chronologically, so the tail is the most recent
            recent_rows = output_rows[::-1][:10]
            print(f"Showing {len(recent_rows)} most recent products:")

            for i, row in enumerate(recent_rows, 1):
                is_sp500_str = "[S&P 500]" if _row_is_sp500(row) else ""
                print(f"{i}. {row['new_product']} - {row['stock_name']} {is_sp500_str} ({row['filing_time']})")

            if existing_products > len(recent_rows):
                print(f"...and {existing_products - len(recent_rows)} more (total: {existing_products} products)")
        except KeyError as e:
            # Output file written with a different header
            print(f"Error reading results file: {str(e)}")

    print("\n✅ Processing complete!")

# This allows the script to be run from both command line and Jupyter/IPython
def run_main(reset=False, prioritize_sp500=True, workers=Config.MAX_WORKERS, batch_size=Config.BATCH_SIZE):
    """
    Run the main function with options

    The options are translated into command-line flags and placed in
    ``sys.argv`` for ``main()`` to parse. Whatever arguments ``sys.argv``
    held before the call (for example Jupyter kernel options) are not passed
    on, and the original ``sys.argv`` is put back once ``main()`` returns or
    raises.

    Args:
        reset (bool): If True, reset processing progress and start from scratch
        prioritize_sp500 (bool): If True, prioritize S&P 500 companies
        workers (int): Number of parallel workers to use
        batch_size (int): Number of filings to process in a batch
    """
    saved_argv = sys.argv

    # Start from the program name only, then add the flags for this run.
    argv = [saved_argv[0]] if len(saved_argv) >= 1 else ['']

    if reset:
        argv.append('--reset')

    if prioritize_sp500:
        argv.append('--prioritize-sp500')

    # Add parallel processing parameters
    argv.extend(['--workers', str(workers)])
    argv.extend(['--batch-size', str(batch_size)])

    print(f"Starting JSON 8-K Product Finder with {workers} workers and batch size {batch_size}...")

    sys.argv = argv
    try:
        main()
    finally:
        # Do not leave the rewritten argument list behind for later code
        # in the same interpreter or notebook session.
        sys.argv = saved_argv

if __name__ == "__main__":
    # Single entry-point call: the pipeline must run once per execution.
    # Put custom settings here by editing the arguments of this call.
    run_main(workers=4, batch_size=4)

Starting JSON 8-K Product Finder with 4 workers and batch size 4...

🔍 JSON 8-K PRODUCT FINDER 🔍
This tool analyzes SEC 8-K JSON filings to identify product announcements

📋 Loaded progress data: 5073 filings already processed
Connecting to LLM service...
📁 Created log directory: llm_logs
📊 Loading filing data...
✅ Loaded metadata for 16446 filings from 8k_filings.csv
📊 Prioritizing S&P 500 companies: 646 S&P 500 filings, 15800 other filings
⏩ Skipping 8571 already processed filings

Starting to process 7875 new filings...
Processing 1969 batches of up to 4 filings each
Using 4 parallel workers

Processing batch 1/1969 (4 filings)
🔍 Analyzing filing for ADC (2024-10-02)...
📄 Extracted 252426 characters of text for analysis
🤖 Querying LLM to identify products...
🔍 Analyzing filing for ADC (2024-10-28)...
📄 Extracted 1031416 characters of text for analysis
🔍 Analyzing filing for ADC (2024-10-25)...
📄 Extracted 1087252 characters of text for analysis
🤖 Querying LLM to identify products...