In [None]:
import json
import re
import os
from tqdm.notebook import tqdm # Use notebook version for progress bar


In [None]:
# --- Configuration ---

# Which corpus to convert: 'reddit' or 'news'.
data_source = 'reddit' # Or 'news'

# (input, output) file pair for each supported source.
#   input  - labeled data from OpenAI (entity text + label, no spans)
#   output - the same records with character spans, ready for spaCy training
_SOURCE_PATHS = {
    'reddit': (
        "../../../Data/Historical Reddit/NER_Data/Labeled/ner_labeled_reddit_dataset.json",
        "../../../Data/Historical Reddit/NER_Data/Labeled/ner_labeled_reddit_dataset_spacy.json",
    ),
    'news': (
        "../../../Data/Historical News/NER_Data/Labeled/ner_labeled_news_dataset.json",
        "../../../Data/Historical News/NER_Data/Labeled/ner_labeled_news_dataset_spacy.json",
    ),
}

# Fail fast on an unsupported source rather than continuing with no paths set.
if data_source not in _SOURCE_PATHS:
    raise ValueError("data_source must be 'reddit' or 'news'")

input_json_path, output_json_path = _SOURCE_PATHS[data_source]

print(f"Processing source: {data_source}")
print(f"Input path: {input_json_path}")
print(f"Output path: {output_json_path}")

# Create the output folder up front so the save step cannot fail on a missing directory.
_output_dir = os.path.dirname(output_json_path)
os.makedirs(_output_dir, exist_ok=True)


In [None]:
# Load the labeled data.
#
# Outcome contract for the following cells:
#   - success: `data` is the parsed JSON, guaranteed to be a list (one item per article)
#   - failure: `data` is None (missing file, invalid JSON, wrong top-level type, I/O error)
data = None # Stays None unless loading AND validation both succeed

if not os.path.exists(input_json_path):
    print(f"Error: Input file not found at {input_json_path}")
    # Or raise FileNotFoundError("Input file not found...")
else:
    print(f"Loading data from {input_json_path}...")
    try:
        with open(input_json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {input_json_path}: {e}")
        data = None
    except Exception as e:
        print(f"An unexpected error occurred loading {input_json_path}: {e}")
        data = None
    else:
        # The processing step iterates over `data` and calls .get() on each item,
        # so anything other than a list (e.g. a dict or a bare string) would only
        # fail later with a confusing error. Reject it here instead.
        if isinstance(data, list):
            print(f"Loaded {len(data)} articles.")
        else:
            print(f"Error: Expected a JSON list of articles in {input_json_path}, got {type(data).__name__}.")
            data = None

# Proceed only if data was loaded successfully
if data is None:
    print("Cannot proceed without loaded data.")


In [None]:
def _locate_entity_span(article_text, entity_text, assigned_spans, search_start_pos):
    """Pick a not-yet-used character span for ``entity_text`` in ``article_text``.

    Strategies are tried in this order and the first hit wins:

    1. ``"sequential"`` - first case-sensitive occurrence at or after
       ``search_start_pos`` (keeps entities in reading order).
    2. ``"exact"`` - first case-sensitive occurrence anywhere in the text whose
       span has not been handed out yet.
    3. ``"ci"`` - first case-insensitive occurrence whose span is free. Only
       attempted when the text contains no case-sensitive occurrence at all, so
       a differently-cased mention never stands in for an exact one.

    Args:
        article_text: Full text of the article being searched.
        entity_text: Non-empty entity string to look for (matched literally).
        assigned_spans: Set of ``(start, end)`` tuples already given to earlier
            entities of the same article. Not modified here.
        search_start_pos: Offset where the sequential search begins.

    Returns:
        ``(span, strategy)``. On success ``span`` is a ``(start, end)`` tuple and
        ``strategy`` is one of the three names above. On failure ``span`` is
        ``None`` and ``strategy`` is ``"missing"`` (no occurrence in any casing)
        or ``"unassigned"`` (occurrences exist but none could be used).
    """
    # The entity is matched as a literal, so escape regex metacharacters.
    # Compile once: the same pattern is used by several strategies below.
    escaped_entity_text = re.escape(entity_text)
    exact_pattern = re.compile(escaped_entity_text)
    ci_pattern = re.compile(escaped_entity_text, re.IGNORECASE)

    # 1. Sequential, case-sensitive. Passing `pos` avoids copying the text tail
    #    and yields absolute offsets directly.
    if search_start_pos < len(article_text):
        seq_match = exact_pattern.search(article_text, search_start_pos)
        if seq_match is not None and seq_match.span() not in assigned_spans:
            return seq_match.span(), "sequential"

    # 2. Any free case-sensitive occurrence in the whole text.
    has_exact_match_anywhere = False
    for exact_match in exact_pattern.finditer(article_text):
        has_exact_match_anywhere = True
        if exact_match.span() not in assigned_spans:
            return exact_match.span(), "exact"

    # 3. Case-insensitive, but only when no exact occurrence exists anywhere.
    if not has_exact_match_anywhere:
        for ci_match in ci_pattern.finditer(article_text):
            same_length = len(ci_match.group()) == len(entity_text)
            if same_length and ci_match.span() not in assigned_spans:
                return ci_match.span(), "ci"

    # Nothing usable: tell the caller whether the text is absent or merely taken.
    if ci_pattern.search(article_text) is None:
        return None, "missing"
    return None, "unassigned"


if data is not None: # Check if data loaded successfully
    processed_data_with_spans = []
    total_entities_processed = 0
    total_find_errors = 0 # Errors finding *any* match
    total_assignment_errors = 0 # Occurrence(s) exist, but none could be assigned
    total_validation_errors = 0 # Never incremented here; kept so existing references still resolve

    # NOTE(review): a span counts as "taken" only when the identical (start, end)
    # pair was already assigned, so partially overlapping spans (e.g. "York" inside
    # "New York") can still be emitted. spaCy's doc.ents rejects overlapping
    # entities - confirm the training step filters them.

    print("\nProcessing articles to find unique entity spans...")
    # Use tqdm for progress bar over articles
    for article_index, article in enumerate(tqdm(data, desc="Processing Articles")):
        # A malformed record must not abort the whole run; skip it and keep going.
        if not isinstance(article, dict):
            print(f"Warning [article #{article_index}]: Skipping invalid article format (not a dictionary).")
            continue

        article_id = article.get("id")
        article_text = article.get("text", "")
        # Ensure entities is a list, default to empty list if missing or null
        raw_entities = article.get("entities")
        entities = raw_entities if isinstance(raw_entities, list) else []

        if not article_text:
            print(f"Warning [{article_id}]: Skipping article with empty text.")
            processed_data_with_spans.append({
                "id": article_id,
                "text": article_text,
                "entities": [] # Ensure entities key exists even if skipped
            })
            continue

        processed_entities = []
        assigned_spans = set() # (start, end) tuples already assigned in this article
        search_start_pos = 0 # Where the next sequential search starts

        for entity_index, entity in enumerate(entities):
            # Ensure entity is a dictionary
            if not isinstance(entity, dict):
                print(f"Warning [{article_id}/{entity_index}]: Skipping invalid entity format (not a dictionary): {entity}")
                continue

            entity_text = entity.get("text")
            entity_label = entity.get("label")

            # Validate entity text and label
            if not isinstance(entity_text, str) or not entity_text or not isinstance(entity_label, str) or not entity_label:
                print(f"Warning [{article_id}/{entity_index}]: Skipping entity with missing/invalid text ('{entity_text}') or label ('{entity_label}').")
                continue

            total_entities_processed += 1

            try:
                span, strategy = _locate_entity_span(
                    article_text, entity_text, assigned_spans, search_start_pos
                )

                if span is not None:
                    start_char, end_char = span
                    # For the exact strategies this equals entity_text; for the
                    # case-insensitive one it is the casing found in the article.
                    matched_text_in_article = article_text[start_char:end_char]
                    processed_entities.append({
                        "text": matched_text_in_article,
                        "label": entity_label,
                        "start": start_char,
                        "end": end_char
                    })
                    assigned_spans.add(span)

                    if strategy == "sequential":
                        # Only a sequential hit advances the cursor; fallback hits
                        # broke reading order and must not move it.
                        search_start_pos = end_char
                    elif strategy == "ci":
                        print(f"Info [{article_id}/{entity_index}]: Used case-insensitive match '{matched_text_in_article}' for entity '{entity_text}' at {span} (no exact match found anywhere).")
                elif strategy == "missing":
                    print(f"Error [{article_id}/{entity_index}]: Could not find any occurrence (CS/CI) of entity text '{entity_text}'. Skipping.")
                    total_find_errors += 1
                else:
                    # It exists, but all occurrences were already assigned or failed validation (e.g., case mismatch when exact existed)
                    print(f"Warning [{article_id}/{entity_index}]: Could not assign a span for entity text '{entity_text}'. All occurrences might be assigned or invalid (e.g. case mismatch). Skipping.")
                    total_assignment_errors += 1

            except re.error as e:
                print(f"Regex Error [{article_id}/{entity_index}] for entity '{entity_text}': {e}")
                total_find_errors += 1 # Count regex errors as find errors
            except Exception as e:
                print(f"Unexpected Error processing entity [{article_id}/{entity_index}] '{entity_text}': {e}")
                # Depending on severity, you might count this differently or re-raise

        # Add the article with processed entities (including spans) to the output list
        processed_data_with_spans.append({
            "id": article_id,
            "text": article_text,
            # Sort processed entities by start index for consistency before saving
            "entities": sorted(processed_entities, key=lambda x: x.get('start', -1))
        })
    print("\nFinished processing articles.") # Add completion message for the loop
# End of `if data is not None:` block


In [None]:
if data is not None: # Only save and print summary if processing happened
    # Save the processed data with spans to the output file
    # (pretty-printed, non-ASCII characters written as-is).
    print(f"\nSaving data with spans to {output_json_path}...")
    save_succeeded = False # Remembered so the closing message reflects the real outcome
    try:
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(processed_data_with_spans, f, indent=2, ensure_ascii=False)
        save_succeeded = True
        print("Save successful.")
    except Exception as e:
        # Keep going: the summary below is still useful even if the write failed.
        print(f"ERROR: Failed to save output file {output_json_path}: {e}")


    print("\n--- Processing Summary ---")
    print(f"Total articles loaded: {len(data)}")
    print(f"Total entities processed (attempted): {total_entities_processed}")
    # Count what actually ended up in the output rather than trusting a running counter.
    total_assigned = sum(len(art['entities']) for art in processed_data_with_spans)
    print(f"Total entities successfully assigned spans: {total_assigned}")
    print(f"  - Errors finding any occurrence: {total_find_errors}")
    print(f"  - Warnings (occurrence found but could not be assigned): {total_assignment_errors}")

    # Rates are only meaningful (and division only safe) when something was attempted.
    if total_entities_processed > 0:
        # Consider find errors and assignment warnings as issues
        total_issues = total_find_errors + total_assignment_errors
        issue_rate = (total_issues / total_entities_processed) * 100
        success_rate = (total_assigned / total_entities_processed) * 100
        print(f"Overall entity assignment success rate: {success_rate:.2f}%")
        print(f"Overall entity assignment issue rate: {issue_rate:.2f}%")

    # Do not claim the file was written when the save above failed.
    if save_succeeded:
        print(f"Output saved to {output_json_path}")
    else:
        print(f"Output was NOT saved to {output_json_path} (see error above).")
else:
    print("\nNo data was processed or saved due to loading errors.")
