# Notebook Title

## 🎯 Goal
- Describe the purpose of this notebook
- What analysis or processing will be done?

## 📚 Input/Output
- **Input**: `data/vdeh/processed/XX_input.parquet`
- **Output**: `data/vdeh/processed/XX_output.parquet`
- **Metadata**: `data/vdeh/processed/XX_metadata.json`

In [None]:
# 🛠️ SETUP: Initialize notebook environment
import sys
from pathlib import Path

# Bootstrap step: walk upward from the current working directory until a
# directory containing config.yaml is found. If none is found, the loop
# stops at the filesystem root (where parent == self).
project_root = Path.cwd()
while not (project_root / 'config.yaml').exists() and project_root.parent != project_root:
    project_root = project_root.parent

# Put <project_root>/src at the front of sys.path so `utils` can be imported.
# Any existing entry is removed first, so re-running this cell keeps the
# directory at the front without stacking duplicate entries.
src_dir = str(project_root / 'src')
if src_dir in sys.path:
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)

# Now use the utility function; it supplies the project root and the config
# object that the rest of the notebook uses.
from utils.notebook_utils import setup_notebook

project_root, config = setup_notebook()
print(f"✅ Project root: {project_root}")
print(f"✅ Project: {config.get('project.name')} v{config.get('project.version')}")

In [None]:
# 📦 IMPORTS
import json
import logging

import numpy as np
import pandas as pd

# Project imports
# from parsers.vdeh_parser import parse_bibliography
# from config_loader import load_config

# Configure logging
# With no handler installed, logging falls back to its last-resort handler,
# which only emits WARNING and above, so the logger.info(...) calls in this
# notebook would be dropped. basicConfig() attaches a stderr handler to the
# root logger and does nothing if the root logger already has handlers
# (e.g. if logging was configured earlier in the session). The root level is
# left untouched; only this notebook's logger is lowered to INFO.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [None]:
# 📋 CONFIGURATION
# Resolve the directory configured under 'paths.data.vdeh.processed'
# (relative to the project root), then derive the input, output and
# metadata file locations inside it.
processed_dir = config.project_root / config.get('paths.data.vdeh.processed')
input_path, output_path, metadata_path = (
    processed_dir / filename
    for filename in ('XX_input.parquet', 'XX_output.parquet', 'XX_metadata.json')
)

# Show the resolved input and output locations
print(f"Input:  {input_path}", f"Output: {output_path}", sep="\n")

In [None]:
# 📊 LOAD DATA
# Read the input parquet file into `df` and log how many records it holds.
logger.info(f"Loading data from {input_path}")
df = pd.read_parquet(input_path)
logger.info(f"Loaded {len(df):,} records")

# Quick overview: dimensions and column names
print(f"Shape: {df.shape}", f"Columns: {list(df.columns)}", sep="\n")

# Kept as the cell's final expression so the first rows are the cell's result
df.head()

In [None]:
# 🔧 PROCESSING
# Placeholder cell: add the data processing code between the two log calls.
logger.info("Starting data processing...")

# Example (commented out; uncomment and adapt):
# df_processed = df.copy()
# df_processed['new_column'] = ...

logger.info("Processing complete")

In [None]:
# 💾 SAVE RESULTS
logger.info(f"Saving results to {output_path}")
# The actual write is left commented out in this template; enable it once a
# `df_processed` frame exists.
# df_processed.to_parquet(output_path, index=False)

# Save metadata
# NOTE: the record count and column list below describe the loaded `df`.
# Once a processed frame is written above, point them at that frame so the
# metadata matches the output file.
metadata = {
    'stage': 'XX_stage_name',
    'input_file': str(input_path),
    'output_file': str(output_path),
    'records_processed': len(df),
    'columns': list(df.columns),
    'timestamp': pd.Timestamp.now().isoformat(),
}

# Explicit UTF-8 so the metadata file does not depend on the platform's
# default text encoding.
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

logger.info("✅ Results saved successfully")
print("➡️  Next step: XX_next_notebook.ipynb")