In [None]:
# Base Imports
import spacy
import json
import os
import random
import itertools
import shutil
import traceback
import time # Added for timing

# spaCy Specific Imports
from spacy.util import compounding, minibatch
from tqdm.notebook import tqdm # Use notebook version
from spacy.tokens import DocBin
from spacy.training.example import Example
from spacy.scorer import Scorer


In [None]:
def convert_json_to_spacy_blank(json_file, output_file):
    """
    Convert JSON labeled data to spaCy's binary DocBin format (blank-model training).

    Labels are used exactly as they appear in the JSON (no remapping). Each entity
    is aligned to token boundaries with ``alignment_mode="strict"``; entities that
    do not align, that overlap an already accepted entity of the same document, or
    that lack a label/start/end are skipped and counted. Conversion statistics are
    printed at the end.

    Args:
        json_file: Path to a JSON file holding a list of items. Each item is read
            via ``item.get("text")`` and ``item.get("entities")``; each entity via
            ``ent.get("label")``, ``ent.get("start")`` and ``ent.get("end")``
            (start/end are passed to ``Doc.char_span`` as character offsets).
        output_file: Destination path of the serialized DocBin. Its parent
            directory is created if it does not exist.

    Raises:
        FileNotFoundError: If ``json_file`` does not exist.
    """
    if not os.path.exists(json_file):
        print(f"ERROR: Input JSON file not found: {json_file}")
        raise FileNotFoundError(f"Input JSON file not found: {json_file}")

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    nlp = spacy.blank("en")  # Blank pipeline, used only for tokenization
    db = DocBin()
    labels_used = set()
    skipped_docs = 0
    total_entities_found = 0
    skipped_entities = 0

    print(f"Converting data from {json_file} to SpaCy format for blank model...")
    for item in tqdm(data, desc="Converting JSON (Blank)"):
        text = item.get("text", "")
        # `or []` also covers an explicit `"entities": null` in the JSON.
        entities = item.get("entities") or []
        if not text:
            skipped_docs += 1
            continue
        total_entities_found += len(entities)

        try:
            doc = nlp.make_doc(text)
        except Exception as e:
            print(f"\nWarning: Skipping doc due to nlp.make_doc error: {e}. Text: '{text[:50]}...'")
            skipped_docs += 1
            skipped_entities += len(entities)
            continue

        ents = []
        seen_tokens = set()  # token indices already covered by an accepted entity
        for ent in entities:
            label = ent.get("label")
            start = ent.get("start")
            end = ent.get("end")

            if label is None or start is None or end is None:
                skipped_entities += 1
                continue

            try:
                # Strict alignment: char_span returns None unless the offsets
                # fall exactly on token boundaries.
                span = doc.char_span(start, end, label=label, alignment_mode="strict")
            except Exception as e:
                print(f"\nWarning: Error creating span for entity {ent} (label: {label}) in text: '{text[:50]}...'. Error: {e}")
                span = None

            if span is None:
                # Either alignment failed or span creation raised.
                skipped_entities += 1
                continue

            # doc.ents cannot hold overlapping spans; the first entity wins.
            token_indices = {tok.i for tok in span}
            if token_indices & seen_tokens:
                skipped_entities += 1
                continue

            ents.append(span)
            seen_tokens.update(token_indices)

        try:
            ents.sort(key=lambda s: s.start_char)
            doc.ents = ents
            db.add(doc)
        except ValueError as e:  # Catches potential overlap errors spaCy might find
            print(f"\nSkipping doc due to ValueError during final assignment (likely overlap): {e}. Text: '{text[:50]}...'")
            print(f"Entities attempted: {[(s.start_char, s.end_char, s.label_) for s in ents]}")
            skipped_docs += 1
            skipped_entities += len(ents)  # Entities of a rejected doc count as skipped
            continue
        except Exception as e:
            print(f"\nSkipping doc due to unexpected error during final assignment: {e}. Text: '{text[:50]}...'")
            skipped_docs += 1
            skipped_entities += len(ents)
            continue

        # Record labels only once the entities really made it into the output,
        # so the report below reflects what the DocBin contains.
        labels_used.update(s.label_ for s in ents)

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    db.to_disk(output_file)

    total_docs = len(data)
    saved_docs = len(db)
    print(f"\n✅ Saved {saved_docs}/{total_docs} documents to {output_file}")
    if skipped_docs > 0:
        print(f"  Skipped {skipped_docs} documents due to errors during conversion.")
    print(f"🧾 Labels used in conversion: {sorted(labels_used)}")

    # Print entity statistics
    processed_entities = total_entities_found - skipped_entities
    print("\n📊 Entity Conversion Stats:")
    print(f"  Total entities in JSON: {total_entities_found}")
    print(f"  Skipped entities (invalid/overlap/error): {skipped_entities}")
    print(f"  Entities successfully converted: {processed_entities}")
    if total_entities_found > 0:
        success_rate = (processed_entities / total_entities_found) * 100
        print(f"  Entity success rate: {success_rate:.2f}%")
    else:
        print("  No entities found to calculate success rate.")


In [None]:
# Based on the split_data helper in 01a_spacy_pretrained_training.ipynb.
# It's included here for completeness of the notebook.
def split_data(full_data_path, train_ratio=0.9, prefix="blank", seed=None):
    """
    Split a full .spacy file into shuffled training and test sets.

    The documents are shuffled, the first ``int(len(docs) * train_ratio)`` become
    the training set and the remainder the test set. Both sets are written as
    DocBin files into the current working directory.

    Args:
        full_data_path: Path to the .spacy (DocBin) file holding all documents.
        train_ratio: Fraction of documents used for training; must lie strictly
            between 0 and 1.
        prefix: Suffix used in the temporary file names so that several
            notebooks/runs do not overwrite each other's files.
        seed: Optional seed for a reproducible shuffle. With ``None`` (default)
            the module-level ``random`` state is used, as before.

    Returns:
        Tuple ``(train_path, test_path)`` of the written temporary files.

    Raises:
        ValueError: If ``train_ratio`` is out of range, the file cannot be
            loaded, it contains no documents, or one side of the split is empty.
        FileNotFoundError: If ``full_data_path`` does not exist.
        IOError: If the temporary files cannot be written.
    """
    # A negative ratio would otherwise yield a negative slice index and silently
    # produce a wrong (but non-empty) split.
    if not 0 < train_ratio < 1:
        raise ValueError(f"train_ratio must be strictly between 0 and 1, got {train_ratio}.")

    if not os.path.exists(full_data_path):
        raise FileNotFoundError(f"Full data file not found: {full_data_path}")

    try:
        doc_bin = DocBin().from_disk(full_data_path)
        # Use blank model vocab for splitting
        data = list(doc_bin.get_docs(spacy.blank("en").vocab))
    except Exception as e:
        raise ValueError(f"Failed to load documents from {full_data_path}. Check file integrity. Error: {e}") from e

    if not data:
        raise ValueError(f"No documents loaded from {full_data_path}.")

    # A dedicated generator keeps a seeded split independent of other random use.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(data)
    split_point = int(len(data) * train_ratio)
    train_data = data[:split_point]
    test_data = data[split_point:]

    if not train_data or not test_data:
        raise ValueError(f"Could not split data. Train size: {len(train_data)}, Test size: {len(test_data)}. Check train_ratio ({train_ratio}).")

    # Use temporary file names with prefix in the current working directory
    train_path = f"temp_train_data_{prefix}.spacy"
    test_path = f"temp_test_data_{prefix}.spacy"

    try:
        DocBin(docs=train_data).to_disk(train_path)
        DocBin(docs=test_data).to_disk(test_path)
    except Exception as e:
        # Clean up files if writing fails
        for path in (train_path, test_path):
            if os.path.exists(path):
                os.remove(path)
        raise IOError(f"Failed to write temporary split data files ('{train_path}', '{test_path}'). Error: {e}") from e

    print(f"Split data with ratio {train_ratio}: {len(train_data)} train, {len(test_data)} test (prefix: {prefix})")
    return train_path, test_path


In [None]:
def train_blank_spacy_model(train_data_path, output_model_path=None, n_iter=10,
                            dropout=0.1, batch_start=4.0, batch_end=32.0, batch_compound=1.5):
    """
    Train a spaCy NER model starting from a blank English pipeline.

    Entity labels are collected from the training documents and registered on a
    fresh ``ner`` component. The model is then updated for ``n_iter`` passes over
    the shuffled data using compounding batch sizes.

    Args:
        train_data_path: Path to the training .spacy (DocBin) file.
        output_model_path: Directory to save the trained pipeline to; nothing is
            saved when falsy. A failure to save is reported but not raised.
        n_iter: Number of passes over the training data.
        dropout: Dropout rate passed to ``nlp.update``.
        batch_start: Initial batch size of the compounding schedule.
        batch_end: Maximum batch size of the compounding schedule.
        batch_compound: Compounding factor of the schedule.

    Returns:
        Tuple ``(nlp, all_losses)``: the trained pipeline and one NER loss per
        iteration (``None`` for an iteration in which no loss was recorded).

    Raises:
        FileNotFoundError: If ``train_data_path`` does not exist.
        ValueError: If the training data cannot be loaded or is empty.
    """
    print("\n--- Training Model (Base: blank 'en') ---")
    print(f"Params: n_iter={n_iter}, dropout={dropout}, batch=({batch_start}, {batch_end}, {batch_compound})")

    # Fail fast, before any model objects are constructed.
    if not os.path.exists(train_data_path):
        raise FileNotFoundError(f"Training data file not found: {train_data_path}")

    print("🧼 Loading blank English model...")
    nlp = spacy.blank("en")

    # Add NER pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")

    # Load training data
    try:
        doc_bin = DocBin().from_disk(train_data_path)
        train_data = list(doc_bin.get_docs(nlp.vocab))
    except Exception as e:
        raise ValueError(f"Failed to load training documents from {train_data_path}. Error: {e}") from e

    if not train_data:
        raise ValueError(f"No training data loaded from {train_data_path}.")

    # Add entity labels dynamically from the training data
    labels = {ent.label_ for doc in train_data for ent in doc.ents}
    if not labels:
        print("Warning: No entity labels found in the training data!")
    else:
        for label in sorted(labels):
            ner.add_label(label)
        print(f"🏷️ Labels added to NER component: {sorted(ner.labels)}")

    print(f"\n🚀 Starting training from scratch for {n_iter} iterations...")
    # Train only the NER component (a blank pipeline is unlikely to hold others).
    # select_pipes replaces disable_pipes, which is deprecated in spaCy v3.
    with nlp.select_pipes(enable="ner"):
        try:
            # initialize() replaces begin_training(), which is deprecated in spaCy v3.
            optimizer = nlp.initialize()
        except Exception as e:
            print(f"Error during nlp.initialize: {e}")
            raise

        all_losses = []
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}  # accumulated by nlp.update over this iteration

            # The schedule is rebuilt every iteration, so each pass starts again
            # from batch_start.
            try:
                batch_config = compounding(batch_start, batch_end, batch_compound)
            except ValueError as e:
                print(f"Error creating compounding batch size: {e}. Using fixed size 8.")
                batch_config = 8  # Fallback

            batches = minibatch(train_data, size=batch_config)
            processed_batches = 0
            progress = tqdm(batches, desc=f"Iter {itn+1}", unit="batch", leave=False)

            for batch in progress:
                if not batch:
                    continue
                examples = []
                for doc in batch:
                    try:
                        # Prediction side: a fresh, unannotated doc of the same
                        # text; reference side: the gold-annotated doc.
                        examples.append(Example(nlp.make_doc(doc.text), doc))
                    except Exception as e:
                        print(f"\nWarning: Unexpected error creating Example: '{doc.text[:30]}...'. Error: {e}")
                        continue

                if not examples:
                    continue

                try:
                    nlp.update(examples, sgd=optimizer, losses=losses, drop=dropout)
                except Exception as e:
                    print(f"\n❌ Error during nlp.update: {e}")
                    print(f"Failed on batch with {len(examples)} examples. First text: '{examples[0].text[:50]}...'")
                    break  # Abandon the rest of this iteration; later iterations still run

                processed_batches += 1
                progress.set_postfix(loss=f"{losses.get('ner', 0.0):.3f}", batches=processed_batches)

            ner_loss = losses.get('ner')
            all_losses.append(ner_loss)
            loss_str = f"{ner_loss:.3f}" if ner_loss is not None else "N/A"
            print(f"  ✅ Iteration {itn + 1}/{n_iter} - NER Loss: {loss_str}")

    # Save the trained model if path provided
    if output_model_path:
        try:
            nlp.to_disk(output_model_path)
            print(f"\n📦 Blank model saved to {output_model_path}")
        except Exception as e:
            # Best effort: the in-memory model is still returned to the caller.
            print(f"\n❌ Error saving blank model to {output_model_path}: {e}")

    return nlp, all_losses  # Return trained nlp object and losses


In [None]:
# Based on the evaluate_model helper in 01a_spacy_pretrained_training.ipynb.
# It's included here for completeness.
def evaluate_model(model_path, test_data_path):
    """
    Evaluate a saved spaCy pipeline on a test DocBin and print NER scores.

    Every test document's text is run through the loaded pipeline and compared
    with the document's gold entities using ``Scorer.score_spans(..., "ents")``.
    All failures (missing paths, load errors, scoring errors) are reported on
    stdout and signalled by returning ``None`` instead of raising.

    Args:
        model_path: Directory of the saved spaCy pipeline.
        test_data_path: Path to the test .spacy (DocBin) file.

    Returns:
        The score dictionary (keys read here: ``ents_p``, ``ents_r``, ``ents_f``
        and ``ents_per_type``), or ``None`` if evaluation could not be performed.
    """
    def _metric(value):
        # A metric may be reported as None (nothing to score); show it as 0.0
        # rather than crashing the numeric formatting below.
        return 0.0 if value is None else value

    print(f"\n--- Evaluating Model: {model_path} ---")
    if not os.path.exists(model_path):
        print(f"❌ Model path does not exist: {model_path}")
        return None
    if not os.path.exists(test_data_path):
        print(f"❌ Test data path does not exist: {test_data_path}")
        return None

    try:
        nlp_eval = spacy.load(model_path)
    except Exception as e:
        print(f"❌ Error loading model {model_path} for evaluation: {e}")
        return None

    try:
        doc_bin = DocBin().from_disk(test_data_path)
        test_data = list(doc_bin.get_docs(nlp_eval.vocab))
    except Exception as e:
        print(f"❌ Error loading test documents from {test_data_path}: {e}")
        return None

    if not test_data:
        print(f"❌ No test data loaded from {test_data_path}.")
        return None

    print(f"Evaluating on {len(test_data)} test documents...")
    examples = []
    skipped_examples = 0
    for doc in tqdm(test_data, desc="Creating evaluation examples"):
        try:
            # Predicted doc comes from the model; the stored doc is the reference.
            examples.append(Example(nlp_eval(doc.text), doc))
        except Exception as e:
            print(f"\nWarning: Unexpected error creating Example for evaluation: '{doc.text[:30]}...'. Error: {e}")
            skipped_examples += 1
            continue

    if skipped_examples > 0:
        print(f"Skipped {skipped_examples} examples during evaluation preparation.")

    if not examples:
        print("❌ No valid examples created for evaluation.")
        return None

    try:
        # Scorer has no `score_examples` method; span-level P/R/F for doc.ents is
        # computed by the static Scorer.score_spans.
        scores = Scorer.score_spans(examples, "ents")  # Evaluate entities only

        # More detailed scores (optional but useful)
        per_entity_scores = scores.get("ents_per_type") or {}
    except Exception as e:
        print(f"❌ Error during scoring: {e}")
        return None

    print("\n--- Evaluation Results ---")
    print(f"Overall NER Precision: {_metric(scores.get('ents_p')):.4f}")
    print(f"Overall NER Recall:    {_metric(scores.get('ents_r')):.4f}")
    print(f"Overall NER F1-Score:  {_metric(scores.get('ents_f')):.4f}")

    if per_entity_scores:
        print("\nScores per Entity Type:")
        # Header
        print(f"{'Label':<15} {'P':<10} {'R':<10} {'F':<10}")
        print("-" * 45)
        # Sort by label for consistent output
        for label, metrics in sorted(per_entity_scores.items()):
            p = _metric(metrics.get('p'))
            r = _metric(metrics.get('r'))
            f = _metric(metrics.get('f'))
            print(f"{label:<15} {p:<10.4f} {r:<10.4f} {f:<10.4f}")
    else:
        print("\nNo per-entity scores available.")

    return scores  # Return the full scores dictionary


In [None]:
def grid_search_blank(train_data_path, test_data_path, grid_search_output_dir, param_grid):
    """
    Grid-search training hyperparameters for the blank-model NER pipeline.

    For every combination of the values in ``param_grid`` a model is trained with
    ``train_blank_spacy_model``, saved below ``grid_search_output_dir`` and scored
    with ``evaluate_model``. The combination with the highest entity F1-score is
    copied to ``<grid_search_output_dir>/best_model_blank_sequential`` and a JSON
    summary of all combinations is written to ``grid_search_summary.json``.

    WARNING: an existing ``grid_search_output_dir`` is deleted first.

    Args:
        train_data_path: Path to the training .spacy file.
        test_data_path: Path to the test .spacy file used for scoring.
        grid_search_output_dir: Directory receiving all models and the summary.
        param_grid: Mapping of parameter name to a list of candidate values.
            Recognised names: ``n_iter``, ``dropout``, ``batch_start``,
            ``batch_end``, ``batch_compound``; missing names use the defaults
            below. An empty mapping runs one combination with all defaults.

    Returns:
        Path of the best model's own directory inside ``grid_search_output_dir``,
        or ``None`` if no combination trained and evaluated successfully.
    """
    print("\n--- Starting Grid Search (Base: blank 'en') ---")

    # Build the combinations before touching the disk, so a malformed grid fails
    # before previous results are deleted.
    keys = list(param_grid)
    combinations = [
        dict(zip(keys, values))
        for values in itertools.product(*(param_grid[k] for k in keys))
    ]

    # Ensure grid search directory exists and is clean
    if os.path.exists(grid_search_output_dir):
        print(f"Clearing existing grid search directory: {grid_search_output_dir}")
        shutil.rmtree(grid_search_output_dir)
    os.makedirs(grid_search_output_dir, exist_ok=True)

    print(f"Total parameter combinations to test: {len(combinations)}")

    best_f1 = -1.0
    best_params = None
    best_model_path_overall = None
    results = []  # One record per combination, successful or not

    for i, params in enumerate(combinations):
        print(f"\n--- Combination {i+1}/{len(combinations)} ---")
        print(f"Parameters: {params}")
        start_time = time.time()

        # Construct a unique model path for this combination
        param_str = "_".join(f"{k}{v}" for k, v in sorted(params.items()))
        model_dir_name = f"model_{param_str}" if param_str else "model_defaults"
        current_model_output_path = os.path.join(grid_search_output_dir, model_dir_name)

        try:
            # Train the blank model with current parameters
            _, losses = train_blank_spacy_model(
                train_data_path=train_data_path,
                output_model_path=current_model_output_path,
                n_iter=params.get('n_iter', 10),
                dropout=params.get('dropout', 0.1),
                batch_start=params.get('batch_start', 4.0),
                batch_end=params.get('batch_end', 32.0),
                batch_compound=params.get('batch_compound', 1.5)
            )

            # Evaluate the model as saved on disk
            scores = evaluate_model(current_model_output_path, test_data_path)
            duration = time.time() - start_time

            if scores:
                # A metric may be None; treat it as 0.0 so the comparison below
                # cannot fail.
                current_f1 = scores.get('ents_f') or 0.0
                last_loss = losses[-1] if losses else None
                results.append({
                    "params": params,
                    "f1": current_f1,
                    "precision": scores.get('ents_p') or 0.0,
                    "recall": scores.get('ents_r') or 0.0,
                    "duration_seconds": duration,
                    "model_path": current_model_output_path,
                    # Cast so a non-builtin float (the loss may be a NumPy scalar)
                    # cannot break JSON serialisation.
                    "final_loss": float(last_loss) if last_loss is not None else None
                })

                print(f"Combination {i+1} F1-Score: {current_f1:.4f} (Duration: {duration:.2f}s)")

                # Check if this is the best model so far
                if current_f1 > best_f1:
                    best_f1 = current_f1
                    best_params = params
                    best_model_path_overall = current_model_output_path
                    print("🏆 New Best F1 Score Found!")
            else:
                print(f"Evaluation failed for combination {i+1}. Skipping.")
                results.append({
                    "params": params, "f1": None, "precision": None, "recall": None,
                    "duration_seconds": duration, "model_path": None, "final_loss": None
                })

        except Exception as e:
            print(f"\n❌❌ Error during grid search combination {i+1} ({params}):")
            traceback.print_exc()  # Print detailed traceback
            duration = time.time() - start_time
            results.append({
                "params": params, "f1": None, "precision": None, "recall": None,
                "duration_seconds": duration, "model_path": None, "final_loss": None,
                "error": str(e)
            })
            # Continue to the next combination

    print("\n--- Grid Search Complete ---")
    if best_model_path_overall is not None:
        print(f"Best F1-Score: {best_f1:.4f}")
        print(f"Best Parameters: {best_params}")
        print(f"Best Model Path: {best_model_path_overall}")

        # Copy the best model to a fixed "best_model" directory
        best_model_final_dir = os.path.join(grid_search_output_dir, "best_model_blank_sequential")
        try:
            if os.path.exists(best_model_final_dir):
                shutil.rmtree(best_model_final_dir)
            shutil.copytree(best_model_path_overall, best_model_final_dir)
            print(f"Copied best model to: {best_model_final_dir}")
        except Exception as e:
            print(f"Error copying best model: {e}")
    else:
        print("Grid search did not find a successful model.")

    # Save results summary: successful runs first (best F1 on top), followed by
    # failed runs so that their error messages are not lost.
    results_summary_path = os.path.join(grid_search_output_dir, "grid_search_summary.json")
    try:
        succeeded = sorted(
            (r for r in results if r['f1'] is not None),
            key=lambda r: r['f1'],
            reverse=True,
        )
        failed = [r for r in results if r['f1'] is None]
        # Serialise first so a failure cannot leave a truncated file behind.
        summary_json = json.dumps(succeeded + failed, indent=2)
        with open(results_summary_path, 'w', encoding='utf-8') as f:
            f.write(summary_json)
        print(f"Grid search summary saved to {results_summary_path}")
    except Exception as e:
        print(f"Error saving grid search summary: {e}")

    return best_model_path_overall  # Path to best model dir inside grid search output


In [None]:
# --- Configuration ---
# Source: News Data
SOURCE_NAME = "News"
PREFIX = "blank_news"  # Unique prefix for temp files

# Paths relative to this notebook (located in Model Training/News/)
INPUT_JSON_PATH = "../../../Data/Historical News/NER_Data/Labeled/ner_labeled_news_dataset_spacy.json"
# Output of conversion / input to split (data folder)
FULL_SPACY_DATA_PATH = "../../../Data/Historical News/NER_Data/spaCy_Format/full_data_blank.spacy"
# Output of grid search (outputs folder)
GRID_SEARCH_DIR = "../../../outputs/information_extraction/news/grid_search_models_blank_sequential"
BEST_MODEL_DIR = os.path.join(GRID_SEARCH_DIR, "best_model_blank_sequential")  # Final best model location

# Grid Search Parameters (Example - Adjust as needed)
param_grid = {
    'n_iter': [15, 25],  # Might need more iterations for blank models
    'dropout': [0.1, 0.25, 0.5],
    'batch_start': [4],
    'batch_end': [32, 64],
    'batch_compound': [1.001]
}

TRAIN_RATIO = 0.9

# The split, the per-iteration shuffling and the weight initialisation are all
# random; seeding once up front makes a Restart & Run All reproducible.
RANDOM_SEED = 42
spacy.util.fix_random_seed(RANDOM_SEED)

# --- Execution ---
temp_train_path = None
temp_test_path = None
best_model_path = None

# Ensure output directories exist before starting
os.makedirs(os.path.dirname(FULL_SPACY_DATA_PATH), exist_ok=True)
# Grid search function will handle creation of GRID_SEARCH_DIR

try:
    # 1. Convert JSON data to .spacy format
    print("--- Step 1: Converting JSON to spaCy format (Blank) ---")
    convert_json_to_spacy_blank(INPUT_JSON_PATH, FULL_SPACY_DATA_PATH)

    # 2. Split data into temporary train/test files (will be created in current dir)
    print("\n--- Step 2: Splitting data ---")
    temp_train_path, temp_test_path = split_data(FULL_SPACY_DATA_PATH, TRAIN_RATIO, prefix=PREFIX)

    # 3. Run Grid Search
    print("\n--- Step 3: Running Grid Search (Blank) ---")
    best_model_path = grid_search_blank(
        train_data_path=temp_train_path,
        test_data_path=temp_test_path,
        grid_search_output_dir=GRID_SEARCH_DIR,
        param_grid=param_grid
    )

    # 4. Final Evaluation of the Best Model (optional)
    # NOTE: this re-scores the selected model on the same test split that the
    # grid search used to pick it, so it is not an independent held-out estimate.
    if best_model_path and os.path.exists(BEST_MODEL_DIR):  # Best model was found and copied
        print("\n--- Step 4: Final Evaluation of Best Model (Blank) ---")
        evaluate_model(BEST_MODEL_DIR, temp_test_path)
    elif best_model_path:
        print("\n--- Step 4: Final Evaluation Skipped (Best model not copied?) ---")
    else:
        print("\n--- Step 4: Final Evaluation Skipped (No best model found) ---")

except FileNotFoundError as e:
    print(f"\n❌ ERROR: Required file not found. {e}")
except ValueError as e:
    print(f"\n❌ ERROR: Data loading or processing issue. {e}")
except Exception:
    print("\n❌ An unexpected error occurred during the main execution:")
    traceback.print_exc()
finally:
    # Clean up temporary files, whether or not the pipeline succeeded
    print("\n--- Cleaning up temporary files ---")
    for temp_path in (temp_train_path, temp_test_path):
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
            print(f"Removed {temp_path}")
    print("Cleanup complete.")
