##  GEMINI OUTPUT, TABLES OUTPUT IN JSON AND MMD AS WELL, ANALYSIS FROM GEMINI ON ERRORS , CER , CONVERTED PDF OF GD TO JSON USING GEMINI SINCE LATEX OF GROUND WAS NOT RIGHT


# send the ground truth to gemini

In [None]:
import os
import json
import time
from datetime import datetime
from dotenv import load_dotenv
import google.generativeai as genai
import sys
from pathlib import Path

# Handle both notebook and script environments
try:
    # Try to get the script directory (works in .py files)
    script_dir = Path(__file__).parent
except NameError:
    # Fallback for Jupyter notebooks
    script_dir = Path.cwd()
    print("⚠️  Running in notebook environment, using current working directory")

# Add the path to access prompt_store.py using relative path
project_root = script_dir.parent.parent.parent
ocr_path = project_root / "ocr"
sys.path.append(str(ocr_path))

try:
    from prompt_store import v15
    print("✅ Successfully imported v15 prompt")
except ImportError as e:
    print(f"❌ Failed to import prompt_store: {e}")
    print(f"🔍 Tried to import from: {ocr_path}")
    print(f"📁 Current script directory: {script_dir}")
    print(f"📁 Project root: {project_root}")
    sys.exit(1)

# === Load API Key ===
load_dotenv()

model_name = "gemini-2.5-pro"
api_key = os.getenv("GOOGLE_GEMINI_API")

if not api_key:
    print("❌ GOOGLE_GEMINI_API environment variable not found!")
    print("Please make sure you have a .env file with your API key")
    sys.exit(1)

genai.configure(api_key=api_key)
model = genai.GenerativeModel(model_name)

class ProcessingTracker:
    def __init__(self):
        self.total_files = 0
        self.processed_files = 0
        self.successful_files = 0
        self.failed_files = 0
        self.json_files = 0
        self.text_files = 0
        self.errors = []
        self.start_time = None
        self.end_time = None
    
    def start_processing(self):
        self.start_time = datetime.now()
        print(f"🚀 Started processing at {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 60)
    
    def end_processing(self):
        self.end_time = datetime.now()
        duration = self.end_time - self.start_time
        print("\n" + "=" * 60)
        print("📊 PROCESSING SUMMARY")
        print("=" * 60)
        print(f"Total PDF files found:     {self.total_files}")
        print(f"Successfully processed:    {self.successful_files}")
        print(f"Failed to process:         {self.failed_files}")
        print(f"Valid JSON outputs:        {self.json_files}")
        print(f"Text outputs (invalid JSON): {self.text_files}")
        print(f"Processing time:           {duration}")
        print(f"Completed at:              {self.end_time.strftime('%Y-%m-%d %H:%M:%S')}")
        
        if self.errors:
            print(f"\n❌ ERRORS ENCOUNTERED ({len(self.errors)}):")
            print("-" * 40)
            for i, error in enumerate(self.errors, 1):
                print(f"{i}. {error}")
        else:
            print(f"\n✅ No errors encountered!")
        print("=" * 60)
    
    def add_error(self, error_msg):
        self.errors.append(error_msg)
        self.failed_files += 1

def send_pdf_to_gemini_and_save_json(pdf_path, prompt, output_base_dir, tracker, file_index):
    try:
        pdf_filename = os.path.basename(pdf_path)
        print(f"\n📄 [{file_index}/{tracker.total_files}] Processing: {pdf_filename}")
        
        # Upload the PDF file
        print("   ⬆️  Uploading PDF to Gemini...")
        file_resource = genai.upload_file(pdf_path, mime_type="application/pdf")
        
        # Compose the prompt and file
        print("   🤖 Generating content with Gemini...")
        start_time = time.time()
        response = model.generate_content([prompt, file_resource])
        end_time = time.time()
        generated_text = response.text
        
        processing_time = end_time - start_time
        print(f"   ⏱️  Gemini processing time: {processing_time:.2f} seconds")

        # Extract the filename and set output directory for JSON file
        pdf_stem = os.path.splitext(pdf_filename)[0]
        output_dir = os.path.join(output_base_dir, pdf_stem)
        os.makedirs(output_dir, exist_ok=True)

        # Try to validate JSON before saving
        try:
            # Attempt to parse as JSON to validate
            json_data = json.loads(generated_text)
            output_json_path = os.path.join(output_dir, f"{pdf_stem}.json")
            # Save as properly formatted JSON
            with open(output_json_path, 'w', encoding='utf-8') as output_file:
                json.dump(json_data, output_file, indent=2, ensure_ascii=False)
            print(f"   ✅ Valid JSON saved: {output_json_path}")
            tracker.json_files += 1
            tracker.successful_files += 1
            
        except json.JSONDecodeError as json_error:
            # If not valid JSON, save as text file
            output_txt_path = os.path.join(output_dir, f"{pdf_stem}.json")
            with open(output_txt_path, 'w', encoding='utf-8') as output_file:
                output_file.write(generated_text)
            print(f"   ⚠️  Invalid JSON, saved as text: {output_txt_path}")
            print(f"   📝 JSON Error: {str(json_error)[:100]}...")
            tracker.text_files += 1
            tracker.successful_files += 1
        
        tracker.processed_files += 1
        
    except Exception as e:
        error_msg = f"File: {pdf_filename} - Error: {str(e)}"
        print(f"   ❌ Error processing {pdf_filename}: {str(e)}")
        tracker.add_error(error_msg)

# Function to process all PDF files in a directory
def process_all_pdfs(input_dir, output_dir, prompt):
    tracker = ProcessingTracker()
    
    if not os.path.exists(input_dir):
        print(f"❌ Input directory does not exist: {input_dir}")
        print(f"🔍 Tried path: {input_dir}")
        return tracker
    
    # First, count all PDF files
    print(f"🔍 Scanning for PDF files in: {input_dir}")
    pdf_files = []
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    
    tracker.total_files = len(pdf_files)
    print(f"📁 Found {tracker.total_files} PDF files")
    
    if tracker.total_files == 0:
        print("❌ No PDF files found in the specified directory")
        return tracker
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    print(f"📂 Output directory: {output_dir}")
    
    tracker.start_processing()
    
    # Process each PDF file
    for index, pdf_path in enumerate(pdf_files, 1):
        send_pdf_to_gemini_and_save_json(pdf_path, prompt, output_dir, tracker, index)
        
        # Show progress
        progress = (index / tracker.total_files) * 100
        print(f"   📈 Progress: {progress:.1f}% ({index}/{tracker.total_files})")
    
    tracker.end_processing()
    return tracker

# === Main Function for Easy Usage ===
def main(input_dir=None, output_dir=None):
    print("🎯 PDF to JSON Processor with Gemini AI")
    print("=" * 60)
    
    # Use provided paths or default relative paths
    if input_dir is None:
        input_pdf_dir = script_dir / "Physics_human" / "Physics_pdf_docx_human_ocr"
    else:
        input_pdf_dir = Path(input_dir)
    
    if output_dir is None:
        output_json_dir = script_dir / "Physics_human" / "pdf_docx_json"
    else:
        output_json_dir = Path(output_dir)
    
    # Convert to strings for compatibility
    input_pdf_dir = str(input_pdf_dir)
    output_json_dir = str(output_json_dir)
    
    print(f"📂 Input directory:  {input_pdf_dir}")
    print(f"📂 Output directory: {output_json_dir}")
    print(f"🤖 Using model:      {model_name}")
    print(f"📋 Using prompt:     v15")
    
    # Start processing all PDFs in the input directory
    result_tracker = process_all_pdfs(input_pdf_dir, output_json_dir, v15)
    
    # Final status
    if result_tracker.total_files > 0:
        success_rate = (result_tracker.successful_files / result_tracker.total_files) * 100
        print(f"\n🎉 Overall success rate: {success_rate:.1f}%")
        
        if result_tracker.failed_files > 0:
            print(f"⚠️  {result_tracker.failed_files} files failed to process")
        else:
            print("🎊 All files processed successfully!")
    else:
        print("❌ No files were processed")
    
    return result_tracker

# === Example Usage ===
if __name__ == "__main__":
    main()

⚠️  Running in notebook environment, using current working directory
✅ Successfully imported v15 prompt
🎯 PDF to JSON Processor with Gemini AI
📂 Input directory:  /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/Physics_pdf_docx_human_ocr
📂 Output directory: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/pdf_docx_json
🤖 Using model:      gemini-2.5-pro
📋 Using prompt:     v15
🔍 Scanning for PDF files in: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/Physics_pdf_docx_human_ocr
📁 Found 15 PDF files
📂 Output directory: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/pdf_docx_json
🚀 Started processing at 2025-07-30 15:13:33

📄 [1/15] Processing: 12_10021401989941211

## to clean the json 

In [7]:
import os
import json

def clean_json_content(file_path):
    """
    Cleans the JSON file by removing the first and last lines from the file.
    """
    try:
        # Check if the file is empty
        if os.path.getsize(file_path) == 0:
            print(f"Skipping empty file: {file_path}")
            return

        # Open the file to read the raw content
        with open(file_path, 'r') as file:
            lines = file.readlines()

        # Ensure the file has more than two lines (i.e., has content to remove from both ends)
        if len(lines) <= 2:
            print(f"Skipping file with not enough content to clean: {file_path}")
            return
        
        # Remove the first and last lines
        cleaned_lines = lines[1:-1]

        # Join the cleaned lines and load the cleaned data as JSON
        cleaned_data = "".join(cleaned_lines)
        try:
            data = json.loads(cleaned_data)
        except json.JSONDecodeError as e:
            print(f"Error reading JSON from file {file_path}: {e}")
            return

        # Save the cleaned JSON data back to the file
        with open(file_path, 'w') as file:
            json.dump(data, file, indent=2)
        
        print(f"Successfully cleaned and saved: {file_path}")
    
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

def clean_json_files_in_directory(directory_path):
    """
    Loops through the directory and all its subdirectories, cleaning each JSON file.
    """
    for subdir, _, files in os.walk(directory_path):
        for filename in files:
            if filename.endswith('.json'):
                file_path = os.path.join(subdir, filename)
                clean_json_content(file_path)

# Set the directory path where the JSON files are located
directory_path = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/pdf_docx_json"

# Clean all JSON files in the directory and its subdirectories
clean_json_files_in_directory(directory_path)


Successfully cleaned and saved: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/pdf_docx_json/12_1002140198994121111692513661/12_1002140198994121111692513661.json
Successfully cleaned and saved: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/pdf_docx_json/09_1002114885961841111690700733/09_1002114885961841111690700733.json
Successfully cleaned and saved: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/pdf_docx_json/10_10021138351083421111694954514/10_10021138351083421111694954514.json
Successfully cleaned and saved: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/pdf_docx_json/02_1002137635994121111692517447/02_1002137635994121111692517447.json
Successfully clean

## to break the json into each question for each chapter

In [42]:
import os
import json

def create_solution_jsons(input_file_path, output_directory):
    """
    Create individual solution JSON files for each question in the input JSON.
    """
    try:
        # Read the input JSON file
        with open(input_file_path, 'r') as file:
            data = json.load(file)
        
        # Extract the file name without extension and create a new directory for the solution files
        directory_name = os.path.splitext(os.path.basename(input_file_path))[0]
        
        # Extract the first part of the directory name (before the first '_')
        folder_prefix = directory_name.split('_')[0]  # Gets the first part before '_'
        
        # Define output folder
        output_folder = os.path.join(output_directory, directory_name)
        os.makedirs(output_folder, exist_ok=True)
        
        # Loop through each question and create a new JSON for each
        for entry in data:
            question_number = entry.get('question_number')
            solution = [{
                "question_number": question_number,
                "ocr_text": entry.get('ocr_text'),
                "diagrams": entry.get('diagrams', []),
                "pages": entry.get('pages', [])
            }]
            
            # Define the solution file path with the new naming convention
            solution_file_path = os.path.join(output_folder, f"{folder_prefix}_solution_{question_number}.json")
            
            # Write the solution JSON to the file
            with open(solution_file_path, 'w') as solution_file:
                json.dump(solution, solution_file, indent=2)
            
            print(f"Solution {question_number} saved in {solution_file_path}")
    
    except Exception as e:
        print(f"Error processing file {input_file_path}: {e}")

def process_json_files_in_directory(directory_path, output_directory):
    """
    Process all JSON files in the specified directory and its subdirectories.
    """
    for subdir, _, files in os.walk(directory_path):
        for filename in files:
            if filename.endswith('.json'):
                file_path = os.path.join(subdir, filename)
                create_solution_jsons(file_path, output_directory)


# Set the directory path where the JSON files are located
directory_path = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/pdf_docx_json"  # Replace with the actual path

# Set the output directory where you want to save the solutions
output_directory = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter"

# Process all JSON files in the directory and its subdirectories
process_json_files_in_directory(directory_path, output_directory)


Solution 1 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter/12_1002140198994121111692513661/12_solution_1.json
Solution 2 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter/12_1002140198994121111692513661/12_solution_2.json
Solution 3 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter/12_1002140198994121111692513661/12_solution_3.json
Solution 4 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter/12_1002140198994121111692513661/12_solution_4.json
Solution 1 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_

## send the PREDICTION IMAGES TO GEMINI PRO

In [61]:
# 🚀 COMPLETE OCR WITH CSV QUESTIONS - BATCH PROCESS ENTIRE PDF DIRECTORY
import sys, os, time, json, shutil, importlib.util, pandas as pd
from dotenv import load_dotenv
import glob

print("🚀 ENHANCED V13 OCR - BATCH PROCESS PDF DIRECTORY WITH HTML CLEANING!")
print("=" * 70)

# === SETUP ===
physics_dir = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics"
parent_dir = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading"
solution_dir = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement"

# CORRECTED CSV PATH
hw_solution_with_qb_meta_csv = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/hw_df_with_solutions_and_questions.csv"

# PDF DIRECTORY TO PROCESS
PDF_DIRECTORY = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/phy"

load_dotenv(os.path.join(solution_dir, ".env"))
sys.path.insert(0, physics_dir)
sys.path.insert(1, parent_dir)

class MockST:
    def __init__(self): self.secrets = {'GOOGLE_GEMINI_API': os.getenv('GOOGLE_GEMINI_API', '')}
    def error(self, m): print(f'❌ {m}')
    def info(self, m): print(f'ℹ️  {m}')
    def warning(self, m): print(f'⚠️  {m}')
    def success(self, m): print(f'✅ {m}')
sys.modules['streamlit'] = MockST()

# Import HTML cleaner from the specific file
try:
    from html_text_cleaner import extract_text_from_html
    print("✅ HTML cleaner imported successfully from html_text_cleaner.py")
except ImportError as e:
    print(f"❌ Failed to import HTML cleaner from html_text_cleaner.py: {e}")
    raise ImportError("html_text_cleaner.py is required for this script to work")

# Import other modules
try:
    from gemini import ocr_with_questions, ocr_pdf
    print("✅ Gemini modules imported successfully")
    print(f"📄 CSV path: {hw_solution_with_qb_meta_csv}")
    print(f"📄 CSV exists: {os.path.exists(hw_solution_with_qb_meta_csv)}")
    print(f"📁 PDF directory: {PDF_DIRECTORY}")
    print(f"📁 Directory exists: {os.path.exists(PDF_DIRECTORY)}")
except Exception as e:
    print(f"❌ Import error: {e}")

def remove_prefix(filename):
    """Remove numeric prefix like 01_, 02_, etc. from filename"""
    if '_' in filename:
        parts = filename.split('_', 1)
        if parts[0].isdigit():
            return parts[1]
    return filename

def extract_pdf_name_from_url(url):
    """Extract PDF name from UPLOADED_ANS URL"""
    if pd.isna(url) or not isinstance(url, str):
        return None
    
    # Extract filename from URL (last part after /)
    filename = url.split('/')[-1]
    
    # Remove .pdf extension if present
    if filename.endswith('.pdf'):
        filename = filename[:-4]
    
    return filename

def clean_escaped_html(text):
    """Clean both regular HTML and escaped HTML tags"""
    import re
    
    if not text:
        return text
    
    # First unescape the HTML entities that were escaped in JSON
    # Convert <\/div> back to </div>, <\/strong> to </strong>, etc.
    text = text.replace('\\/', '/')
    
    # Now use the original HTML cleaner
    cleaned = extract_text_from_html(text)
    
    return cleaned

def extract_question_from_json_content(json_content):
    """Extract and clean question text from JSON content"""
    try:
        # Parse the JSON string
        data = json.loads(json_content)
        
        # Handle both single object and array
        if isinstance(data, list) and len(data) > 0:
            question_data = data[0]
        else:
            question_data = data
        
        # Extract question text from questionStem.text
        question_text = None
        if 'questionStem' in question_data and 'text' in question_data['questionStem']:
            question_text = question_data['questionStem']['text']
        
        if question_text:
            # Remove outer quotes if present
            if question_text.startswith('"') and question_text.endswith('"'):
                question_text = question_text[1:-1]
            
            # Clean HTML using enhanced cleaner that handles escaped HTML
            cleaned_text = clean_escaped_html(question_text)
            
            return cleaned_text
        
        return None
        
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        return None
    except Exception as e:
        print(f"❌ Error extracting question: {e}")
        return None

def load_questions_for_pdf_with_json_parsing(pdf_name, csv_path):
    """Load questions from CSV, parse JSON content, and clean HTML"""
    try:
        # Load CSV (cache it to avoid reloading for each PDF)
        if not hasattr(load_questions_for_pdf_with_json_parsing, 'cached_df'):
            print(f"📊 Loading CSV: {csv_path}")
            load_questions_for_pdf_with_json_parsing.cached_df = pd.read_csv(csv_path, low_memory=False)
            print(f"✅ CSV loaded: {len(load_questions_for_pdf_with_json_parsing.cached_df)} rows")
            
            # Create extracted PDF names column once
            load_questions_for_pdf_with_json_parsing.cached_df['pdf_name_extracted'] = \
                load_questions_for_pdf_with_json_parsing.cached_df['UPLOADED_ANS'].apply(extract_pdf_name_from_url)
        
        df = load_questions_for_pdf_with_json_parsing.cached_df
        
        # Extract PDF name if needed
        base_pdf_name = remove_prefix(pdf_name.replace('.pdf', ''))
        
        # Filter by PDF name using the extracted names
        pdf_df = df[df['pdf_name_extracted'].str.contains(base_pdf_name, na=False)]
        
        if len(pdf_df) == 0:
            print(f"⚠️  No questions found for PDF: {base_pdf_name}")
            return []
        
        print(f"✅ Found {len(pdf_df)} rows for PDF: {base_pdf_name}")
        
        # Extract and clean questions from JSON content
        questions = []
        for idx, row in pdf_df.iterrows():
            raw_content = None
            if pd.notna(row.get('content')):
                raw_content = str(row['content'])
            elif pd.notna(row.get('textsolutions')):
                raw_content = str(row['textsolutions'])
            
            if raw_content:
                # Extract question from JSON and clean HTML (including escaped HTML)
                cleaned_question = extract_question_from_json_content(raw_content)
                
                if cleaned_question and cleaned_question.strip():
                    questions.append(cleaned_question)
        
        print(f"📝 Extracted {len(questions)} cleaned questions for {pdf_name}")
        return questions
        
    except Exception as e:
        print(f"❌ Error loading questions for {pdf_name}: {e}")
        return []

def process_single_pdf_with_questions(pdf_path, pdf_name):
    """Process a single PDF with cleaned questions from CSV"""
    
    print(f"\n{'='*60}")
    print(f"🔧 Processing PDF: {pdf_name}")
    print(f"📄 Full path: {pdf_path}")
    
    # Load questions from CSV with JSON parsing and enhanced HTML cleaning
    questions = load_questions_for_pdf_with_json_parsing(pdf_name, hw_solution_with_qb_meta_csv)
    
    if not questions:
        print("⚠️  No questions found, falling back to basic OCR")
        result = ocr_pdf(pdf_path, physics_dir, None, "v13")
    else:
        print(f"🎯 Processing with {len(questions)} cleaned questions")
        print(f"📝 Question preview: {questions[0][:80]}..." if questions else "")
            
        # Enhanced OCR with fully cleaned questions
        result = ocr_with_questions(
            questions=questions,
            pdf_file_path=pdf_path,
            output_folder=physics_dir,
            cache_dir=None,
            prompt_version="v13"
        )
    
    if result:
        # Find and copy output file for this specific PDF
        pdf_base_name = pdf_name.replace('.pdf', '')
        for search_pattern in [
            f"{physics_dir}/output/{pdf_base_name}/json/output.json",
            f"{physics_dir}/output/output.json",
            f"{physics_dir}/output/OUTPUT_JSON/output.json"
        ]:
            if os.path.exists(search_pattern):
                # Create PDF-specific output directory
                target_dir = f"{physics_dir}/output/batch_results/{pdf_base_name}"
                os.makedirs(target_dir, exist_ok=True)
                target_path = f"{target_dir}/output.json"
                
                shutil.copy2(search_pattern, target_path)
                print(f"📄 Output saved to: {target_path}")
                
                # Verify the output
                with open(target_path, 'r') as f:
                    data = json.load(f)
                    print(f"✅ Success: {len(data)} questions processed for {pdf_name}")
                    
                    # Quick verification that questions are clean
                    clean_count = 0
                    for item in data:
                        if 'question_text' in item:
                            if not any(tag in item['question_text'] for tag in ['</', '<div', '<strong', '<\/div', '<\/strong']):
                                clean_count += 1
                    
                    print(f"🧹 {clean_count}/{len(data)} questions are fully cleaned")
                    
                return True
        
        print(f"⚠️  Output file not found for {pdf_name}")
        return False
    else:
        print(f"❌ OCR processing failed for {pdf_name}")
        return False

def process_pdf_directory(pdf_directory):
    """Process all PDF files in the given directory"""
    
    print(f"\n🎯 STARTING BATCH PDF PROCESSING...")
    print(f"📁 Directory: {pdf_directory}")
    
    # Find all PDF files in the directory
    pdf_pattern = os.path.join(pdf_directory, "*.pdf")
    pdf_files = glob.glob(pdf_pattern)
    
    if not pdf_files:
        print(f"❌ No PDF files found in {pdf_directory}")
        return
    
    print(f"📋 Found {len(pdf_files)} PDF files to process")
    
    # Statistics
    successful = 0
    failed = 0
    total_questions = 0
    
    # Process each PDF file
    for i, pdf_path in enumerate(pdf_files, 1):
        pdf_name = os.path.basename(pdf_path)
        
        print(f"\n🔄 Processing {i}/{len(pdf_files)}: {pdf_name}")
        
        try:
            success = process_single_pdf_with_questions(pdf_path, pdf_name)
            if success:
                successful += 1
                print(f"✅ Successfully processed {pdf_name}")
            else:
                failed += 1
                print(f"❌ Failed to process {pdf_name}")
                
        except Exception as e:
            failed += 1
            print(f"❌ Error processing {pdf_name}: {e}")
    
    # Final summary
    print(f"\n{'='*60}")
    print(f"🏁 BATCH PROCESSING COMPLETE!")
    print(f"📊 Results:")
    print(f"   ✅ Successful: {successful}")
    print(f"   ❌ Failed: {failed}")
    print(f"   📁 Total files: {len(pdf_files)}")
    print(f"   📍 Results saved in: {physics_dir}/output/batch_results/")
    
    # List all generated output files
    batch_results_dir = f"{physics_dir}/output/batch_results"
    if os.path.exists(batch_results_dir):
        result_dirs = [d for d in os.listdir(batch_results_dir) if os.path.isdir(os.path.join(batch_results_dir, d))]
        print(f"\n📄 Generated output files:")
        for result_dir in result_dirs:
            output_file = os.path.join(batch_results_dir, result_dir, "output.json")
            if os.path.exists(output_file):
                print(f"   {result_dir}/output.json")

# === EXECUTION ===
if __name__ == "__main__":
    print(f"📊 CSV: {os.path.basename(hw_solution_with_qb_meta_csv)}")
    
    # Process entire PDF directory
    process_pdf_directory(PDF_DIRECTORY)
    
    print(f"\n🏁 ALL PROCESSING COMPLETE!")

🚀 ENHANCED V13 OCR - BATCH PROCESS PDF DIRECTORY WITH HTML CLEANING!
✅ HTML cleaner imported successfully from html_text_cleaner.py
✅ Gemini modules imported successfully
📄 CSV path: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/hw_df_with_solutions_and_questions.csv
📄 CSV exists: True
📁 PDF directory: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/phy
📁 Directory exists: True
📊 CSV: hw_df_with_solutions_and_questions.csv

🎯 STARTING BATCH PDF PROCESSING...
📁 Directory: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/phy
📋 Found 15 PDF files to process

🔄 Processing 1/15: 12_1002140198994121111692513661.pdf

🔧 Processing PDF: 12_1002140198994121111692513661.pdf
📄 Full path: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution

## to break the json into each quesiton 

In [None]:
import os
import json

def create_solution_jsons(input_file_path, output_directory):
    """
    Create individual solution JSON files for each question in the input JSON.
    """
    try:
        # Read the input JSON file
        with open(input_file_path, 'r') as file:
            data = json.load(file)
        
        # Extract the file name without extension and create a new directory for the solution files
        directory_name = os.path.splitext(os.path.basename(input_file_path))[0]
        
        # Extract the first part of the directory name (before the first '_')
        folder_prefix = directory_name.split('_')[0]  # Gets the first part before '_'
        
        # Define output folder
        output_folder = os.path.join(output_directory, directory_name)
        os.makedirs(output_folder, exist_ok=True)
        
        # Loop through each question and create a new JSON for each
        for entry in data:
            question_number = entry.get('question_number')
            solution = [{
                "question_number": question_number,
                "ocr_text": entry.get('ocr_text'),
                "diagrams": entry.get('diagrams', []),
                "pages": entry.get('pages', [])
            }]
            
            # Define the solution file path with the new naming convention
            solution_file_path = os.path.join(output_folder, f"{folder_prefix}_solution_{question_number}.json")
            
            # Write the solution JSON to the file
            with open(solution_file_path, 'w') as solution_file:
                json.dump(solution, solution_file, indent=2)
            
            print(f"Solution {question_number} saved in {solution_file_path}")
    
    except Exception as e:
        print(f"Error processing file {input_file_path}: {e}")

def process_json_files_in_directory(directory_path, output_directory):
    """
    Process all JSON files in the specified directory and its subdirectories.
    """
    for subdir, _, files in os.walk(directory_path):
        for filename in files:
            if filename.endswith('.json'):
                file_path = os.path.join(subdir, filename)
                create_solution_jsons(file_path, output_directory)

# Set the directory path where the JSON files are located
directory_path = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/phy_json"  # Replace with the actual path

# Set the output directory where you want to save the solutions
output_directory = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/solutions"

# Process all JSON files in the directory and its subdirectories
process_json_files_in_directory(directory_path, output_directory)

ModuleNotFoundError: No module named 'ui'

In [41]:
import os
import json

def create_solution_jsons(input_file_path, output_directory):
    """
    Create individual solution JSON files for each question in the input JSON.
    """
    try:
        # Read the input JSON file
        with open(input_file_path, 'r') as file:
            data = json.load(file)
        
        # Extract the file name without extension and create a new directory for the solution files
        directory_name = os.path.splitext(os.path.basename(input_file_path))[0]
        
        # Extract the first part of the directory name (before the first '_')
        folder_prefix = directory_name.split('_')[0]  # Gets the first part before '_'
        
        # Define output folder
        output_folder = os.path.join(output_directory, directory_name)
        os.makedirs(output_folder, exist_ok=True)
        
        # Loop through each question and create a new JSON for each
        for entry in data:
            question_number = entry.get('question_number')
            solution = [{
                "question_number": question_number,
                "ocr_text": entry.get('ocr_text'),
                "diagrams": entry.get('diagrams', []),
                "pages": entry.get('pages', [])
            }]
            
            # Define the solution file path with the new naming convention
            solution_file_path = os.path.join(output_folder, f"{folder_prefix}_solution_{question_number}.json")
            
            # Write the solution JSON to the file
            with open(solution_file_path, 'w') as solution_file:
                json.dump(solution, solution_file, indent=2)
            
            print(f"Solution {question_number} saved in {solution_file_path}")
    
    except Exception as e:
        print(f"Error processing file {input_file_path}: {e}")

def process_json_files_in_directory(directory_path, output_directory):
    """
    Process all JSON files in the specified directory and its subdirectories.
    """
    for subdir, _, files in os.walk(directory_path):
        for filename in files:
            if filename.endswith('.json'):
                file_path = os.path.join(subdir, filename)
                create_solution_jsons(file_path, output_directory)

# Set the directory path where the JSON files are located
directory_path = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/phy_json"  # Replace with the actual path

# Set the output directory where you want to save the solutions
output_directory = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/solutions"

# Process all JSON files in the directory and its subdirectories
process_json_files_in_directory(directory_path, output_directory)


Solution 1 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/solutions/12_1002140198994121111692513661/12_solution_1.json
Solution 2 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/solutions/12_1002140198994121111692513661/12_solution_2.json
Solution 3 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/solutions/12_1002140198994121111692513661/12_solution_3.json
Solution 4 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/solutions/12_1002140198994121111692513661/12_solution_4.json
Solution 1 saved in /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/s

# to make a json of human and gemini ocr

In [71]:
import os
import json

def compare_ocr_folders(human_ocr_path, gemini_ocr_path, solution_folder, solution_number, file_prefix):
    human_solution_path = os.path.join(
        human_ocr_path, 'Physics/Physics_human/solutions_chapter', solution_folder,
        f"{file_prefix}_solution_{solution_number}.json"
    )
    gemini_solution_path = os.path.join(
        gemini_ocr_path, 'Physics/Physics_Gemini/solutions', solution_folder,
        f"{file_prefix}_solution_{solution_number}.json"
    )
    
    print(f"Processing: {human_solution_path}")
    print(f"Processing: {gemini_solution_path}")
    
    # Initialize simplified data structure (only text and question number)
    combined_data = {
        "question_number": solution_number,
        "human_text": "NA",
        "gemini_text": "NA"
    }
    
    # Human OCR
    if os.path.exists(human_solution_path):
        with open(human_solution_path, 'r') as human_file:
            try:
                human_data = json.load(human_file)
                if isinstance(human_data, list) and len(human_data) > 0:
                    human_item = human_data[0]
                    combined_data["human_text"] = human_item.get('ocr_text', 'NA')
                    combined_data["question_number"] = human_item.get('question_number', solution_number)
                    print(f"Human OCR ocr_text: {repr(combined_data['human_text'])}")
                else:
                    print(f"Warning: {human_solution_path} is not a list or is empty.")
            except Exception as e:
                print(f"Error reading {human_solution_path}: {e}")
    else:
        print(f"File not found: {human_solution_path}")
    
    # Gemini OCR
    if os.path.exists(gemini_solution_path):
        with open(gemini_solution_path, 'r') as gemini_file:
            try:
                gemini_data = json.load(gemini_file)
                if isinstance(gemini_data, list) and len(gemini_data) > 0:
                    gemini_item = gemini_data[0]
                    combined_data["gemini_text"] = gemini_item.get('ocr_text', 'NA')
                    print(f"Gemini OCR ocr_text: {repr(combined_data['gemini_text'])}")
                else:
                    print(f"Warning: {gemini_solution_path} is not a list or is empty.")
            except Exception as e:
                print(f"Error reading {gemini_solution_path}: {e}")
    else:
        print(f"File not found: {gemini_solution_path}")
    
    # Create output directory
    output_dir = f'/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/tables/{solution_folder}'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Save as JSON file
    json_filename = os.path.join(output_dir, f"{file_prefix}_solution_{solution_number}_table.json")
    with open(json_filename, 'w') as json_file:
        json.dump(combined_data, json_file, indent=2)
    print(f"Created {json_filename}")

def process_folders(human_ocr_base_path, gemini_ocr_base_path):
    human_ocr_path = os.path.join(human_ocr_base_path, 'Physics/Physics_human/solutions_chapter')
    gemini_ocr_path = os.path.join(gemini_ocr_base_path, 'Physics/Physics_Gemini/solutions')
    
    for solution_folder in os.listdir(human_ocr_path):
        solution_folder_path_human = os.path.join(human_ocr_path, solution_folder)
        if not os.path.isdir(solution_folder_path_human):
            continue
        for fname in os.listdir(solution_folder_path_human):
            if fname.endswith('.json') and '_solution_' in fname:
                try:
                    file_prefix = fname.split('_solution_')[0]
                    solution_number = int(fname.split('_solution_')[1].split('.')[0])
                except Exception:
                    print(f"Filename parse error: {fname}")
                    continue
                compare_ocr_folders(human_ocr_base_path, gemini_ocr_base_path, solution_folder, solution_number, file_prefix)

# Set paths to your directories
human_ocr_base_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr'
gemini_ocr_base_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr'

# Process the folders
process_folders(human_ocr_base_path, gemini_ocr_base_path)

Processing: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter/12_1002140198994121111692513661/12_solution_1.json
Processing: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/solutions/12_1002140198994121111692513661/12_solution_1.json
Human OCR ocr_text: '1. Convex lens'
Gemini OCR ocr_text: '1) Convex lens'
Created /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/tables/12_1002140198994121111692513661/12_solution_1_table.json
Processing: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter/12_1002140198994121111692513661/12_solution_2.json
Processing: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvemen

## to make cer

In [72]:
import os
import json
from difflib import ndiff

def char_error_rate(s1, s2):
    """
    Calculate the character error rate (CER) between two strings.
    Returns 'na' if either string is 'na'.
    """
    if s1 == "na" or s2 == "na":
        return "na"
    diff = list(ndiff(s1, s2))
    insertions = sum(1 for d in diff if d[0] == '+')
    deletions = sum(1 for d in diff if d[0] == '-')
    ref_len = len(s1)
    if ref_len == 0:
        return 0 if len(s2) == 0 else 1
    cer = (insertions + deletions) / ref_len
    return cer

def highlight_differences(s1, s2):
    """
    Highlight the differences between two strings.
    Returns 'na' if either string is 'na'.
    """
    if s1 == "na" or s2 == "na":
        return "na"
    diff = list(ndiff(s1, s2))
    result = []
    for d in diff:
        if d[0] == ' ':
            result.append(d[2])
        elif d[0] == '-':
            result.append(f"[-{d[2]}-]")
        elif d[0] == '+':
            result.append(f"[+{d[2]}+]")
    return ''.join(result)

def compare_ocr_folders_with_cer(human_ocr_path, gemini_ocr_path, solution_folder, solution_number, file_prefix):
    human_solution_path = os.path.join(
        human_ocr_path, 'Physics/Physics_human/solutions_chapter', solution_folder,
        f"{file_prefix}_solution_{solution_number}.json"
    )
    gemini_solution_path = os.path.join(
        gemini_ocr_path, 'Physics/Physics_Gemini/solutions', solution_folder,
        f"{file_prefix}_solution_{solution_number}.json"
    )
    
    print(f"Processing: {human_solution_path}")
    print(f"Processing: {gemini_solution_path}")
    
    # Initialize content variables
    human_content = "NA"
    gemini_content = "NA"
    cer = "NA"
    highlight_diff = "NA"
    
    # Check if the Human OCR solution file exists and extract the 'ocr_text'
    if os.path.exists(human_solution_path):
        with open(human_solution_path, 'r') as human_file:
            try:
                human_data = json.load(human_file)
                if isinstance(human_data, list) and len(human_data) > 0:
                    human_content = human_data[0].get('ocr_text', 'NA')
                    print(f"Human OCR ocr_text: {repr(human_content)}")
            except Exception as e:
                print(f"Error reading {human_solution_path}: {e}")
    else:
        print(f"File not found: {human_solution_path}")
    
    # Check if the Gemini OCR solution file exists and extract the 'ocr_text'
    if os.path.exists(gemini_solution_path):
        with open(gemini_solution_path, 'r') as gemini_file:
            try:
                gemini_data = json.load(gemini_file)
                if isinstance(gemini_data, list) and len(gemini_data) > 0:
                    gemini_content = gemini_data[0].get('ocr_text', 'NA')
                    print(f"Gemini OCR ocr_text: {repr(gemini_content)}")
            except Exception as e:
                print(f"Error reading {gemini_solution_path}: {e}")
    else:
        print(f"File not found: {gemini_solution_path}")
    
    # Replace line breaks with <br> in both OCR contents (even if it's "NA")
    human_content_formatted = human_content.replace("\n", "<br>") if human_content != "NA" else "NA"
    gemini_content_formatted = gemini_content.replace("\n", "<br>") if gemini_content != "NA" else "NA"
    
    # If either Human OCR or Gemini OCR is "NA", set CER and highlight differences to "NA"
    if human_content == "NA" or gemini_content == "NA":
        cer = "NA"
        highlight_diff = "NA"
    else:
        # Calculate CER and highlight differences using formatted content
        cer = char_error_rate(human_content_formatted, gemini_content_formatted)
        highlight_diff = highlight_differences(human_content_formatted, gemini_content_formatted)
    
    # Create JSON structure with all comparison data
    comparison_data = {
        "question_number": solution_number,
        "human_text": human_content_formatted,
        "gemini_text": gemini_content_formatted,
        "cer": cer,
        "highlight_difference": highlight_diff
    }
    
    # Create output directory
    output_dir = f'/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/tables_cer/{solution_folder}'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Save as JSON file
    json_filename = os.path.join(output_dir, f"{file_prefix}_solution_{solution_number}_table.json")
    with open(json_filename, 'w') as json_file:
        json.dump(comparison_data, json_file, indent=2)
    print(f"Created {json_filename}")

def process_folders_with_cer(human_ocr_base_path, gemini_ocr_base_path):
    human_ocr_path = os.path.join(human_ocr_base_path, 'Physics/Physics_human/solutions_chapter')
    gemini_ocr_path = os.path.join(gemini_ocr_base_path, 'Physics/Physics_Gemini/solutions')
    
    for solution_folder in os.listdir(human_ocr_path):
        solution_folder_path_human = os.path.join(human_ocr_path, solution_folder)
        if not os.path.isdir(solution_folder_path_human):
            continue
        for fname in os.listdir(solution_folder_path_human):
            if fname.endswith('.json') and '_solution_' in fname:
                try:
                    file_prefix = fname.split('_solution_')[0]
                    solution_number = int(fname.split('_solution_')[1].split('.')[0])
                except Exception:
                    print(f"Filename parse error: {fname}")
                    continue
                compare_ocr_folders_with_cer(human_ocr_base_path, gemini_ocr_base_path, solution_folder, solution_number, file_prefix)

# Set paths to your directories
human_ocr_base_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr'
gemini_ocr_base_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr'

# Process the folders
process_folders_with_cer(human_ocr_base_path, gemini_ocr_base_path)

Processing: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter/12_1002140198994121111692513661/12_solution_1.json
Processing: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_Gemini/solutions/12_1002140198994121111692513661/12_solution_1.json
Human OCR ocr_text: '1. Convex lens'
Gemini OCR ocr_text: '1) Convex lens'
Created /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/tables_cer/12_1002140198994121111692513661/12_solution_1_table.json
Processing: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/Physics_human/solutions_chapter/12_1002140198994121111692513661/12_solution_2.json
Processing: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improv

# to send gemini to get analysis on the human vs predictions

In [2]:
import os
import json
import time
from datetime import datetime
from dotenv import load_dotenv
import google.generativeai as genai
import sys
from pathlib import Path

# Handle both Jupyter notebook and standalone script environments
try:
    # This works in standalone Python scripts
    script_dir = Path(__file__).parent
except NameError:
    # This works in Jupyter notebooks
    script_dir = Path.cwd()

# Add the path to access prompt_store.py using relative path
project_root = script_dir.parent.parent.parent
ocr_path = project_root / "ocr"
sys.path.append(str(ocr_path))

from prompt_store import v14

# === Load API Key ===
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_GEMINI_API"))

# Set model
model_name = "gemini-2.5-pro"
model = genai.GenerativeModel(model_name)

class ProcessingTracker:
    def __init__(self):
        self.total_files = 0
        self.processed_files = 0
        self.successful_files = 0
        self.failed_files = 0
        self.json_files = 0
        self.text_files = 0
        self.errors = []
        self.start_time = None
        self.end_time = None
        self.total_json_read_time = 0
        self.gemini_processing_time = 0
        self.file_save_time = 0
    
    def start_processing(self):
        self.start_time = datetime.now()
        print(f"🚀 Started processing at {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 70)
    
    def end_processing(self):
        self.end_time = datetime.now()
        duration = self.end_time - self.start_time
        print("\n" + "=" * 70)
        print("📊 PROCESSING SUMMARY")
        print("=" * 70)
        print(f"Total JSON files found:       {self.total_files}")
        print(f"Successfully processed:       {self.successful_files}")
        print(f"Failed to process:            {self.failed_files}")
        print(f"Valid JSON outputs:           {self.json_files}")
        print(f"Text outputs (invalid JSON):  {self.text_files}")
        print(f"JSON reading time:            {self.total_json_read_time:.2f}s")
        print(f"Gemini processing time:       {self.gemini_processing_time:.2f}s")
        print(f"File saving time:             {self.file_save_time:.2f}s")
        print(f"Total processing time:        {duration}")
        print(f"Completed at:                 {self.end_time.strftime('%Y-%m-%d %H:%M:%S')}")
        
        if self.errors:
            print(f"\n❌ ERRORS ENCOUNTERED ({len(self.errors)}):")
            print("-" * 50)
            for i, error in enumerate(self.errors, 1):
                print(f"{i}. {error}")
        else:
            print(f"\n✅ No errors encountered!")
        print("=" * 70)
    
    def add_error(self, error_msg):
        self.errors.append(error_msg)
        self.failed_files += 1

def send_json_and_prompt(input_json_path, prompt, output_json_dir, tracker, file_index):
    try:
        json_filename = os.path.basename(input_json_path)
        print(f"\n📄 [{file_index}/{tracker.total_files}] Processing: {json_filename}")
        
        # Step 1: Read the JSON file
        print("   📖 Reading JSON file...")
        start_time = time.time()
        with open(input_json_path, 'r', encoding='utf-8') as f:
            json_content = json.load(f)
        end_time = time.time()
        read_time = end_time - start_time
        tracker.total_json_read_time += read_time
        print(f"   ✅ JSON read in {read_time:.3f}s")

        # Step 2: Compose the prompt with JSON content
        print("   🔄 Preparing prompt...")
        full_prompt = f"{prompt}\n\n<JSON Input>\n{json.dumps(json_content, indent=2)}"

        # Step 3: Generate content with the AI model
        print("   🤖 Processing with Gemini...")
        start_time = time.time()
        response = model.generate_content(
            full_prompt,
            generation_config={"temperature": 0.2},
        )
        generated_text = response.text
        end_time = time.time()
        gemini_time = end_time - start_time
        tracker.gemini_processing_time += gemini_time
        print(f"   ⏱️  Gemini processing time: {gemini_time:.2f} seconds")

        # Step 4: Clean the response
        print("   🧹 Cleaning response...")
        if generated_text.strip().startswith('```json'):
            generated_text = generated_text.strip().removeprefix('```json').removesuffix('```').strip()
        elif generated_text.strip().startswith('```'):
            generated_text = generated_text.strip().removeprefix('```').removesuffix('```').strip()

        # Step 5: Extract metadata for output path
        solution_folder_name = os.path.basename(os.path.dirname(input_json_path))
        prefix = solution_folder_name.split('_')[0]  # Extract prefix part (e.g., "12")
        base_name = os.path.splitext(os.path.basename(input_json_path))[0]
        
        # Extract the solution number correctly from filenames like '12_solution_1_table.json'
        try:
            solution_number = base_name.split('_')[2]  # Correctly extract the solution number part
        except IndexError:
            solution_number = "unknown"
            print(f"   ⚠️  Could not extract solution number from {base_name}")

        # Step 6: Create the output folder and file path
        output_solution_dir = os.path.join(output_json_dir, solution_folder_name)
        os.makedirs(output_solution_dir, exist_ok=True)
        output_json_path = os.path.join(output_solution_dir, f"{prefix}_solution_{solution_number}_analysis.json")

        # Step 7: Save the result
        print("   💾 Saving results...")
        start_time = time.time()
        
        try:
            # Try to parse the generated text as JSON
            parsed_json = json.loads(generated_text)
            # Save as properly formatted JSON
            with open(output_json_path, 'w', encoding='utf-8') as out_file:
                json.dump(parsed_json, out_file, indent=2, ensure_ascii=False)
            print(f"   ✅ Valid JSON saved: {output_json_path}")
            tracker.json_files += 1
            tracker.successful_files += 1
            
        except json.JSONDecodeError as json_error:
            # If response is not valid JSON, save as text wrapped in JSON
            output_data = {
                "analysis_result": generated_text,
                "input_file": os.path.basename(input_json_path),
                "status": "raw_text_response",
                "json_error": str(json_error)
            }
            with open(output_json_path, 'w', encoding='utf-8') as out_file:
                json.dump(output_data, out_file, indent=2, ensure_ascii=False)
            print(f"   ⚠️  Invalid JSON, saved as wrapped text: {output_json_path}")
            print(f"   📝 JSON Error: {str(json_error)[:100]}...")
            tracker.text_files += 1
            tracker.successful_files += 1
        
        end_time = time.time()
        save_time = end_time - start_time
        tracker.file_save_time += save_time
        
        tracker.processed_files += 1

    except FileNotFoundError:
        error_msg = f"File: {json_filename} - File not found"
        print(f"   ❌ File not found: {input_json_path}")
        tracker.add_error(error_msg)
    except json.JSONDecodeError as e:
        error_msg = f"File: {json_filename} - Invalid input JSON: {str(e)}"
        print(f"   ❌ Invalid input JSON: {str(e)}")
        tracker.add_error(error_msg)
    except Exception as e:
        error_msg = f"File: {json_filename} - Processing error: {str(e)}"
        print(f"   ❌ Error processing {json_filename}: {str(e)}")
        tracker.add_error(error_msg)

# Function to process all .json files in a directory
def process_all_json_files(input_dir, output_dir, prompt):
    tracker = ProcessingTracker()
    
    if not os.path.exists(input_dir):
        print(f"❌ Input directory does not exist: {input_dir}")
        return tracker
    
    # First, count all JSON files
    print(f"🔍 Scanning for JSON files in: {input_dir}")
    json_files = []
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith(".json"):
                json_files.append(os.path.join(root, file))
    
    tracker.total_files = len(json_files)
    print(f"📁 Found {tracker.total_files} JSON files")
    
    if tracker.total_files == 0:
        print("❌ No JSON files found in the specified directory")
        return tracker
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    print(f"📂 Output directory: {output_dir}")
    
    tracker.start_processing()
    
    # Process each JSON file
    for index, json_path in enumerate(json_files, 1):
        send_json_and_prompt(json_path, prompt, output_dir, tracker, index)
        
        # Show progress
        progress = (index / tracker.total_files) * 100
        print(f"   📈 Progress: {progress:.1f}% ({index}/{tracker.total_files})")
    
    tracker.end_processing()
    return tracker

# === Example Usage ===
if __name__ == "__main__":
    print("🎯 JSON OCR Analysis Processor with Gemini AI")
    print("=" * 70)
    
    # Input directory containing .json files (relative to script location)
    input_json_dir = script_dir / "tables"
    
    # Output directory (relative to script location)
    output_json_dir = script_dir / "table_analysis"
    
    # Convert to strings for compatibility
    input_json_dir = str(input_json_dir)
    output_json_dir = str(output_json_dir)
    
    print(f"📂 Input directory:  {input_json_dir}")
    print(f"📂 Output directory: {output_json_dir}")
    print(f"🤖 Using model:      {model_name}")
    print(f"📋 Using prompt:     v14 (OCR Quality Analysis)")
    
    # Start processing all JSON files in the input directory
    result_tracker = process_all_json_files(input_json_dir, output_json_dir, v14)
    
    # Final status
    if result_tracker.total_files > 0:
        success_rate = (result_tracker.successful_files / result_tracker.total_files) * 100
        print(f"\n🎉 Overall success rate: {success_rate:.1f}%")
        
        if result_tracker.failed_files > 0:
            print(f"⚠️  {result_tracker.failed_files} files failed to process")
        else:
            print("🎊 All files processed successfully!")
    else:
        print("❌ No files were processed")

# For Jupyter notebook usage
def run_json_analysis():
    """Function to call from Jupyter notebook"""
    print("🎯 JSON OCR Analysis Processor with Gemini AI")
    print("=" * 70)
    
    # Input directory containing .json files (relative to current working directory)
    input_json_dir = script_dir / "tables"
    
    # Output directory (relative to current working directory)
    output_json_dir = script_dir / "table_analysis"
    
    # Convert to strings for compatibility
    input_json_dir = str(input_json_dir)
    output_json_dir = str(output_json_dir)
    
    print(f"📂 Input directory:  {input_json_dir}")
    print(f"📂 Output directory: {output_json_dir}")
    print(f"🤖 Using model:      {model_name}")
    print(f"📋 Using prompt:     v14 (OCR Quality Analysis)")
    
    # Start processing all JSON files in the input directory
    result_tracker = process_all_json_files(input_json_dir, output_json_dir, v14)
    
    # Final status
    if result_tracker.total_files > 0:
        success_rate = (result_tracker.successful_files / result_tracker.total_files) * 100
        print(f"\n🎉 Overall success rate: {success_rate:.1f}%")
        
        if result_tracker.failed_files > 0:
            print(f"⚠️  {result_tracker.failed_files} files failed to process")
        else:
            print("🎊 All files processed successfully!")
    else:
        print("❌ No files were processed")
    
    return result_tracker

🎯 JSON OCR Analysis Processor with Gemini AI
📂 Input directory:  /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/tables
📂 Output directory: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/table_analysis
🤖 Using model:      gemini-2.5-pro
📋 Using prompt:     v14 (OCR Quality Analysis)
🔍 Scanning for JSON files in: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/tables
📁 Found 111 JSON files
📂 Output directory: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/table_analysis
🚀 Started processing at 2025-07-30 15:02:42

📄 [1/111] Processing: 12_solution_2_table.json
   📖 Reading JSON file...
   ✅ JSON read in 0.000s
   🔄 Preparing prompt...
   🤖 Processing with Gemini...


KeyboardInterrupt: 

## merge the cer and gemini tables

In [81]:
import os
import json

base_dir = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics"
analysis_dir = os.path.join(base_dir, "table_analysis")
cer_dir = os.path.join(base_dir, "tables_cer")  # assuming this has the JSON files with CER data
final_dir = os.path.join(base_dir, "final_tables")

def read_json_file(json_path):
    """Read and return JSON data from file"""
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, FileNotFoundError) as e:
        print(f"Error reading {json_path}: {e}")
        return None

def write_json_file(json_path, data):
    """Write JSON data to file"""
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

def merge_json_data(analysis_data, cer_data):
    """Merge analysis data with CER data"""
    # Start with CER data as base
    merged_data = cer_data.copy() if cer_data else {}
    
    # Add analysis fields from analysis_data
    if analysis_data:
        # Add analysis-specific fields
        merged_data.update({
            "type_of_error": analysis_data.get("type_of_error", "N/A"),
            "discrepancy_analysis": analysis_data.get("discrepancy_analysis", "N/A"),
            "has_errors": analysis_data.get("has_errors", False)
        })
    else:
        # If no analysis data, set defaults
        merged_data.update({
            "type_of_error": "N/A",
            "discrepancy_analysis": "N/A", 
            "has_errors": False
        })
    
    return merged_data

# Loop over all subfolders in table_analysis
for folder_id in os.listdir(analysis_dir):
    analysis_path = os.path.join(analysis_dir, folder_id)
    cer_path = os.path.join(cer_dir, folder_id)
    out_path = os.path.join(final_dir, folder_id)
    
    if not os.path.isdir(analysis_path):
        continue
    
    # Check if corresponding cer folder exists, if not skip
    if not os.path.isdir(cer_path):
        print(f"Skipping {folder_id}: no matching folder in tables directory")
        continue
        
    os.makedirs(out_path, exist_ok=True)

    for fname in os.listdir(analysis_path):
        if not fname.endswith(".json"):
            continue
            
        analysis_file = os.path.join(analysis_path, fname)
        
        # Create corresponding CER file name (adjust naming as needed)
        # Assuming analysis files end with '_analysis.json' and cer files end with '_table.json'
        cer_fname = fname.replace("_analysis.json", "_table.json")
        cer_file = os.path.join(cer_path, cer_fname)
        
        if not os.path.exists(cer_file):
            print(f"Skipping {fname} in {folder_id}: no matching file {cer_fname} in tables")
            continue

        # Read both JSON files
        analysis_data = read_json_file(analysis_file)
        cer_data = read_json_file(cer_file)
        
        if analysis_data is None and cer_data is None:
            print(f"Skipping {fname} in {folder_id}: both files failed to load")
            continue

        # Merge the data
        merged_data = merge_json_data(analysis_data, cer_data)
        
        # Create output filename
        out_fname = fname.replace("analysis", "final").replace("solution", "final")
        out_file = os.path.join(out_path, out_fname)
        
        # Write merged JSON
        write_json_file(out_file, merged_data)
        print(f"Saved combined JSON: {out_file}")

print("JSON merging completed!")

Saved combined JSON: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/final_tables/12_1002140198994121111692513661/12_final_1_final.json
Saved combined JSON: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/final_tables/12_1002140198994121111692513661/12_final_2_final.json
JSON merging completed!


## make a final table 

In [82]:
import os
import json

base_dir = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics"
final_tables_dir = os.path.join(base_dir, "final_tables")
output_file = os.path.join(base_dir, "final_table.json")

all_data = []

def read_json_file(json_path):
    """Read and return JSON data from file"""
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, FileNotFoundError) as e:
        print(f"Error reading {json_path}: {e}")
        return None

for folder_id in os.listdir(final_tables_dir):
    folder_path = os.path.join(final_tables_dir, folder_id)
    if not os.path.isdir(folder_path):
        continue
    
    for fname in os.listdir(folder_path):
        if not fname.endswith(".json"):
            continue
            
        file_path = os.path.join(folder_path, fname)
        json_data = read_json_file(file_path)
        
        if json_data is None:
            print(f"Warning: {file_path} could not be read.")
            continue
        
        # Add filename to the data for tracking
        json_data["file_name"] = fname
        json_data["folder_id"] = folder_id
        
        # Add to all_data list
        all_data.append(json_data)

# Write merged JSON
output_data = {
    "total_files": len(all_data),
    "source_directory": final_tables_dir,
    "merged_data": all_data
}

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"Merged {len(all_data)} JSON files written to {output_file}")

Merged 2 JSON files written to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/final_table.json


## make a average cer and no of errors analysis

In [83]:
import os
import json

input_file = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/final_table.json"

cer_values = []
na_count = 0
row_count = 0

# Read JSON file
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Check if the expected structure exists
if "merged_data" not in data:
    raise Exception("No 'merged_data' found in JSON!")

merged_data = data["merged_data"]

# Process each JSON object in merged_data
for item in merged_data:
    row_count += 1
    
    # Check if 'cer' field exists
    if "cer" not in item:
        print(f"Warning: No 'cer' field in item {row_count}")
        continue
    
    cer_val = item["cer"]
    
    # Handle different types of values
    if cer_val is None or (isinstance(cer_val, str) and cer_val.lower() == "na"):
        na_count += 1
    else:
        try:
            # Convert to float if it's not already
            if isinstance(cer_val, str):
                cer_float = float(cer_val)
            else:
                cer_float = float(cer_val)
            cer_values.append(cer_float)
        except (ValueError, TypeError):
            print(f"Warning: Could not convert CER value '{cer_val}' to float in item {row_count}")
            na_count += 1

# Calculate average
average_cer = sum(cer_values) / len(cer_values) if cer_values else 0

print(f"Total data items: {row_count}")
print(f"Total 'na' in cer field: {na_count}")
print(f"Valid CER values: {len(cer_values)}")
print(f"Average cer (excluding 'na'): {average_cer}")

# Additional statistics
if cer_values:
    min_cer = min(cer_values)
    max_cer = max(cer_values)
    print(f"Minimum CER: {min_cer}")
    print(f"Maximum CER: {max_cer}")

Total data items: 2
Total 'na' in cer field: 0
Valid CER values: 2
Average cer (excluding 'na'): 0.08944658944658944
Minimum CER: 0.036036036036036036
Maximum CER: 0.14285714285714285


In [85]:
import os
import json
from collections import Counter

input_file = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/final_table.json"

# List of error types to count
error_types = [
    "Spelling",
    "Wording", 
    "Extra Content",
    "Punctuation",
    "Numerical Difference",
    "Missing Content",
    "Content Mix-up",
    "Omission"
]

counts = Counter()

# Read JSON file
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Check if the expected structure exists
if "merged_data" not in data:
    raise Exception("No 'merged_data' found in JSON!")

merged_data = data["merged_data"]

# Look for the required fields in the data
required_fields = ['type_of_error', 'gemini_text', 'human_text']
alternative_fields = {
    'type_of_error': ['type_of_error', 'error_type', 'Type of Error'],
    'gemini_text': ['gemini_text', 'gemini_ocr', 'Gemini_OCR'],
    'human_text': ['human_text', 'human_ocr', 'Human_OCR']
}

# Find the correct field names
field_mapping = {}
if merged_data:
    sample_item = merged_data[0]
    for field_type, possible_names in alternative_fields.items():
        found_field = None
        for name in possible_names:
            if name in sample_item:
                found_field = name
                break
        
        if found_field:
            field_mapping[field_type] = found_field
        else:
            print(f"Warning: Could not find field for {field_type}")
            print(f"Available fields: {list(sample_item.keys())}")

print("=== Field Mapping ===")
for field_type, field_name in field_mapping.items():
    print(f"{field_type} -> {field_name}")

print(f"\n=== Error Type Analysis ===")
print(f"Total items to process: {len(merged_data)}")

# Process each JSON object in merged_data
processed_count = 0
skipped_count = 0

for i, item in enumerate(merged_data, 1):
    # Check if all required fields exist
    missing_fields = []
    for field_type in ['type_of_error', 'gemini_text', 'human_text']:
        if field_type not in field_mapping or field_mapping[field_type] not in item:
            missing_fields.append(field_type)
    
    if missing_fields:
        print(f"Warning: Item {i} missing fields: {missing_fields}")
        skipped_count += 1
        continue
    
    # Get values using the mapped field names
    error_val = item[field_mapping['type_of_error']]
    gemini_ocr = item[field_mapping['gemini_text']]
    human_ocr = item[field_mapping['human_text']]
    
    # Convert to string if needed and handle None values
    error_val = str(error_val) if error_val is not None else "N/A"
    gemini_ocr = str(gemini_ocr) if gemini_ocr is not None else "NA"
    human_ocr = str(human_ocr) if human_ocr is not None else "NA"
    
    # Check if either gemini_ocr or human_ocr is "NA"
    if gemini_ocr.lower() == "na" or human_ocr.lower() == "na":
        skipped_count += 1
        continue  # Skip this row if either OCR value is "NA"
    
    processed_count += 1
    
    # Count the error types only if the error type is explicitly mentioned
    if error_val == "NO ERRORS":
        counts[error_val] += 1
    elif error_val in error_types:
        counts[error_val] += 1
    else:
        # Count any other error types that might exist
        if error_val not in ["N/A", "", "null"]:
            counts["Other"] += 1
            print(f"Found unexpected error type: '{error_val}' in item {i}")

print(f"\nProcessed items: {processed_count}")
print(f"Skipped items: {skipped_count}")

# Print results
print(f"\n=== Error Type Counts ===")
for error_type in error_types + ["NO ERRORS"]:
    print(f"{error_type}: {counts[error_type]}")

if counts["Other"] > 0:
    print(f"Other/Unexpected: {counts['Other']}")

# Calculate percentages
total_valid = sum(counts.values())
if total_valid > 0:
    print(f"\n=== Percentages (of {total_valid} valid items) ===")
    for error_type in error_types + ["NO ERRORS"]:
        if counts[error_type] > 0:
            percentage = (counts[error_type] / total_valid) * 100
            print(f"{error_type}: {percentage:.1f}%")

=== Field Mapping ===
type_of_error -> type_of_error
gemini_text -> gemini_text
human_text -> human_text

=== Error Type Analysis ===
Total items to process: 2
Found unexpected error type: 'Punctuation, Wording' in item 2

Processed items: 2
Skipped items: 0

=== Error Type Counts ===
Spelling: 0
Wording: 0
Extra Content: 0
Punctuation: 1
Numerical Difference: 0
Missing Content: 0
Content Mix-up: 0
Omission: 0
NO ERRORS: 0
Other/Unexpected: 1

=== Percentages (of 2 valid items) ===
Punctuation: 50.0%


In [89]:
import os
import json
from collections import defaultdict

input_file = "/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/z2_ocr/Physics/final_table.json"

# Read JSON file
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Check if the expected structure exists
if "merged_data" not in data:
    raise Exception("No 'merged_data' found in JSON!")

merged_data = data["merged_data"]

# Look for folder_id field (with possible variations)
folder_id_fields = ['folder_id', 'folderId', 'FOLDER_ID', 'folder_name']
folder_id_field = None

if merged_data:
    sample_item = merged_data[0]
    for field in folder_id_fields:
        if field in sample_item:
            folder_id_field = field
            break

if folder_id_field is None:
    print("Available fields in the data:")
    if merged_data:
        for key in merged_data[0].keys():
            print(f"  - {key}")
    raise Exception("No folder_id field found in JSON data!")

print(f"Using field '{folder_id_field}' for folder IDs")
print(f"Total items to process: {len(merged_data)}")

# Initialize counter for all possible numbers (1-99 to be safe)
prefix_counts = defaultdict(int)

# Process each JSON object in merged_data
processed_count = 0
unmatched_folders = []

for i, item in enumerate(merged_data, 1):
    if folder_id_field not in item:
        print(f"Warning: Item {i} missing {folder_id_field} field")
        continue
    
    folder_id = item[folder_id_field]
    
    # Convert to string if needed
    folder_id = str(folder_id) if folder_id is not None else ""
    
    if not folder_id:
        print(f"Warning: Empty folder_id in item {i}")
        continue
    
    processed_count += 1
    
    # Extract numeric prefix (part before first underscore)
    if '_' in folder_id:
        prefix = folder_id.split('_')[0]
        
        # Check if prefix is numeric
        if prefix.isdigit():
            # Convert to int to remove leading zeros, then back to string
            numeric_prefix = str(int(prefix))
            prefix_counts[numeric_prefix] += 1
        else:
            unmatched_folders.append(folder_id)
    else:
        unmatched_folders.append(folder_id)

print(f"\nProcessed items: {processed_count}")
print(f"Items in merged_data: {len(merged_data)}")

# Find the range of numbers we actually have
if prefix_counts:
    max_num = max(int(k) for k in prefix_counts.keys())
    min_num = min(int(k) for k in prefix_counts.keys())
else:
    max_num = 15  # default range
    min_num = 1

# Extend range to show a reasonable range (1 to at least 15)
max_num = max(max_num, 15)

print(f"\n=== Folder ID Prefix Counts ===")
total_files = 0
zero_counts = []

for num in range(1, max_num + 1):
    count = prefix_counts[str(num)]
    total_files += count
    print(f"{num}_: {count}")
    
    if count == 0:
        zero_counts.append(str(num))

print(f"\nTotal files counted: {total_files}")

# Show prefixes with zero counts
if zero_counts:
    print(f"Prefixes with 0 files: {', '.join(zero_counts)}")

# Show unmatched folder IDs for debugging
if unmatched_folders:
    print(f"\n=== Unmatched Folder IDs ===")
    unique_unmatched = list(set(unmatched_folders))[:10]  # Show first 10 unique
    for folder in unique_unmatched:
        print(f"  {folder}")
    if len(unmatched_folders) > 10:
        print(f"  ... and {len(unmatched_folders) - 10} more")

# Show some sample folder IDs for debugging
print(f"\n=== Sample Folder IDs ===")
sample_count = min(5, len(merged_data))
for i in range(sample_count):
    if folder_id_field in merged_data[i]:
        folder_id = merged_data[i][folder_id_field]
        prefix = folder_id.split('_')[0] if '_' in str(folder_id) else "no_underscore"
        print(f"  {i+1}: {folder_id} -> prefix: {prefix}")

Using field 'folder_id' for folder IDs
Total items to process: 2

Processed items: 2
Items in merged_data: 2

=== Folder ID Prefix Counts ===
1_: 0
2_: 0
3_: 0
4_: 0
5_: 0
6_: 0
7_: 0
8_: 0
9_: 0
10_: 0
11_: 0
12_: 2
13_: 0
14_: 0
15_: 0

Total files counted: 2
Prefixes with 0 files: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15

=== Sample Folder IDs ===
  1: 12_1002140198994121111692513661 -> prefix: 12
  2: 12_1002140198994121111692513661 -> prefix: 12
