## To break the JSON file into multiple sections

In [1]:
import os
import json
from docx import Document
import shutil

# Define the base paths
# base_input_path: root directory that is searched recursively for *.json files.
# base_output_path: root directory that receives one sub-folder per JSON file.
base_input_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/gemini_2.5_pro_768/phy'
base_output_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/gemini_768_ocr'

# Function to create the necessary folder structure and save .docx files
def process_json_files(input_base_dir, output_base_dir):
    """Split every JSON file under ``input_base_dir`` into per-question .docx files.

    For each ``*.json`` file found (recursively) a folder named after the file
    (without its extension) is created under ``output_base_dir``, and one
    ``section_<n>.docx`` is written per entry of the JSON array, where ``n`` is
    the 1-based position of the entry. Each document holds a single paragraph
    with the entry's ``'question_text'``.

    NOTE: the output folder is derived from the file name only, so JSON files
    with the same name in different sub-directories write to the same folder.

    Args:
        input_base_dir: Root directory to search for JSON files.
        output_base_dir: Root directory that receives the per-file folders.

    Raises:
        KeyError: If an entry has no ``'question_text'`` key.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    for root, _dirs, files in os.walk(input_base_dir):
        for file in files:
            if not file.endswith('.json'):
                continue
            json_path = os.path.join(root, file)

            # Read as UTF-8 explicitly instead of relying on the platform's
            # default encoding, which may not be able to decode the OCR text.
            with open(json_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)

            # Extract the question text from the JSON file
            question_texts = [entry['question_text'] for entry in data]

            # Create the output directory structure (JSON file name without extension)
            folder_name = os.path.splitext(file)[0]
            output_folder_path = os.path.join(output_base_dir, folder_name)
            os.makedirs(output_folder_path, exist_ok=True)

            # One document per question, numbered from 1 in file order
            for idx, question_text in enumerate(question_texts, 1):
                doc = Document()
                doc.add_paragraph(question_text)

                section_file_path = os.path.join(output_folder_path, f'section_{idx}.docx')
                doc.save(section_file_path)
                print(f"Saved {section_file_path}")

            print(f"Processed JSON: {json_path} and saved to {output_folder_path}")

# Run the function
# Converts every JSON file below base_input_path into per-question .docx files below base_output_path.
process_json_files(base_input_path, base_output_path)


Saved /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/gemini_768_ocr/12_1002140198994121111692513661/section_1.docx
Saved /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/gemini_768_ocr/12_1002140198994121111692513661/section_2.docx
Saved /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/gemini_768_ocr/12_1002140198994121111692513661/section_3.docx
Saved /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/gemini_768_ocr/12_1002140198994121111692513661/section_4.docx
Processed JSON: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/gemini_2.5_pro_768/phy/12_1002140198994121111692513661/12_100214

## to make docx split into different docx section files

In [3]:
from docx import Document
from docx.shared import Inches
import zipfile
import os
import re
from PIL import Image  # For image format verification

# Function to remove content between < and >, including space and equal signs
def clean_text(text):
    """Return ``text`` without any ``<...>`` span and without outer whitespace.

    A span runs from a ``<`` up to the next ``>`` and may cross line breaks.
    NOTE: anything between a literal ``<`` and a later ``>`` is removed as
    well, even if it is not a tag (e.g. two inequalities in one sentence).
    """
    without_tags = re.sub(r'<[^>]*>', '', text, flags=re.DOTALL)
    trimmed = without_tags.strip()
    return trimmed

# Helper to get all block items (paragraphs and tables) in order
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph

# Function to extract images from DOCX
def extract_images_from_docx(docx_file_path, output_image_dir):
    """Copy the embedded media of a DOCX file into ``output_image_dir``.

    Every archive member below ``word/media/`` is written to
    ``output_image_dir`` under its base name and then checked with Pillow;
    files Pillow cannot verify are reported and deleted again.

    Args:
        docx_file_path: Path of the .docx (zip) file to read.
        output_image_dir: Existing directory that receives the image files.

    Returns:
        Paths of the images that were written and verified, in archive order;
        an empty list if the file embeds no media.
    """
    image_dir = 'word/media/'
    with zipfile.ZipFile(docx_file_path, 'r') as docx_zip:
        # Skip directory entries (names ending in '/'): their base name is
        # empty, so open() would target the output directory itself and fail.
        image_files = [
            name for name in docx_zip.namelist()
            if name.startswith(image_dir) and not name.endswith('/')
        ]
        if not image_files:
            print(f"No images found in the DOCX file: {docx_file_path}")
            return []
        extracted_images = []
        for image_file in image_files:
            image_filename = os.path.join(output_image_dir, os.path.basename(image_file))
            with open(image_filename, 'wb') as img_file:
                img_file.write(docx_zip.read(image_file))
            # Check if the image is of a supported format
            try:
                with Image.open(image_filename) as img:
                    img.verify()
            except (IOError, SyntaxError):
                print(f"Skipping unsupported or corrupted image: {image_filename}")
                os.remove(image_filename)  # do not leave unusable files behind
            else:
                extracted_images.append(image_filename)
        return extracted_images

# Function to add image to new document
def add_image_to_doc(image_path, new_doc):
    """Append a caption paragraph and the picture at ``image_path`` to ``new_doc``.

    The caption reads ``"Image: <file name>"`` and the picture is inserted with
    a width of 2 inches. Any exception is printed and suppressed.
    """
    try:
        caption = f"Image: {os.path.basename(image_path)}"
        new_doc.add_paragraph(caption)
        picture_width = Inches(2)
        new_doc.add_picture(image_path, width=picture_width)
    except Exception as err:
        print(f"Error adding image {image_path}: {err}")

# Function to copy table to new document
def copy_table(table, new_doc):
    """Append a text-only copy of ``table`` to ``new_doc``.

    A table with the same number of columns is created and one row is added
    per source row; only the ``text`` of each cell is carried over.
    """
    target = new_doc.add_table(rows=0, cols=len(table.columns))
    for source_row in table.rows:
        target_cells = target.add_row().cells
        for position, source_cell in enumerate(source_row.cells):
            target_cells[position].text = source_cell.text

# Helper function to iterate through blocks (paragraphs and tables)
def iter_block_items(parent):
    """Yield the paragraphs and tables of ``parent``'s body in document order.

    ``CT_P`` children are wrapped in ``Paragraph`` and ``CT_Tbl`` children in
    ``Table``; children of any other type are skipped.
    """
    for element in parent.element.body.iterchildren():
        if isinstance(element, CT_Tbl):
            yield Table(element, parent)
            continue
        if isinstance(element, CT_P):
            yield Paragraph(element, parent)

# Function to process sections for each DOCX file in the directory and save as separate files
def process_directory(input_dir, output_dir):
    """Split every DOCX in ``input_dir`` into one file per tagged section.

    A section starts at a paragraph containing ``<sol_start id=N>`` and ends at
    a paragraph containing ``<sol_end>``. The paragraphs, tables and pictures
    in between are written to ``<output_dir>/<docx base name>/section_N.docx``.
    The marker paragraphs themselves are not copied, and all ``<...>`` tags are
    stripped from copied text. The document's embedded images are extracted to
    an ``images`` sub-folder of the per-document folder.

    Args:
        input_dir: Directory whose ``*.docx`` files are processed (not recursive).
        output_dir: Directory that receives one sub-folder per input file.
    """
    # Accept optional whitespace and straight/curly quotes around the id so a
    # slightly different marker spelling does not abort the whole run.
    start_tag = re.compile(r'<sol_start\s+id\s*=\s*["\'\u201c\u201d]?(\d+)["\'\u201c\u201d]?\s*>')

    # "~$name.docx" files are Word's lock files, not documents
    docx_files = [
        f for f in os.listdir(input_dir)
        if f.endswith('.docx') and not f.startswith('~$')
    ]

    for docx_file in docx_files:
        input_file_path = os.path.join(input_dir, docx_file)
        print(f"Processing {input_file_path}")

        # Create a folder for each DOCX file using its base name
        base_name = os.path.splitext(docx_file)[0]
        output_section_dir = os.path.join(output_dir, base_name)
        os.makedirs(output_section_dir, exist_ok=True)

        # Create an images folder inside this directory
        output_image_dir = os.path.join(output_section_dir, "images")
        os.makedirs(output_image_dir, exist_ok=True)

        doc = Document(input_file_path)
        extracted_images = extract_images_from_docx(input_file_path, output_image_dir)
        # NOTE(review): images are paired with picture runs purely by position.
        # The extracted list is in archive order and omits files Pillow rejects,
        # so the pairing may drift from document order - confirm on real data.
        image_index = 0
        inside_sol = False
        section_id = None
        new_doc = None

        for block in iter_block_items(doc):
            if isinstance(block, Table):
                if inside_sol:
                    copy_table(block, new_doc)
                continue
            if not isinstance(block, Paragraph):
                continue

            text = block.text

            if not inside_sol:
                # Outside a section only a start marker is of interest
                if '<sol_start' not in text:
                    continue
                match = start_tag.search(text)
                if match is None:
                    print(f"Warning: could not read a section id from {text!r}; marker ignored")
                    continue
                section_id = match.group(1)
                new_doc = Document()  # New document for each section
                inside_sol = True
                continue

            # End of section: save what was collected
            if '<sol_end>' in text:
                output_file_path = os.path.join(output_section_dir, f"section_{section_id}.docx")
                new_doc.save(output_file_path)
                print(f"Extracted content for section {section_id} saved to: {output_file_path}")
                inside_sol = False
                continue

            # Content inside a section: copy text and pictures in run order
            image_added = False
            pending_text = ""
            for run in block.runs:
                # Look for a real DrawingML element; a substring test on the
                # serialized XML also fires on runs whose text contains "graphic".
                if run._r.xpath('.//a:graphic'):
                    # Flush the text that precedes the picture, even when no
                    # extracted image is left, so it is never lost.
                    if pending_text:
                        new_doc.add_paragraph(clean_text(pending_text))
                        pending_text = ""
                    if image_index < len(extracted_images):
                        add_image_to_doc(extracted_images[image_index], new_doc)
                        image_index += 1
                    image_added = True
                else:
                    pending_text += run.text

            if pending_text:
                new_doc.add_paragraph(clean_text(pending_text))
            elif not image_added:
                # No run carried text (e.g. an empty paragraph): fall back to
                # the paragraph text. Skipped when a picture was handled, as
                # the text around it has already been written above.
                new_doc.add_paragraph(clean_text(text))

        if inside_sol:
            print(f"Warning: section {section_id} in {input_file_path} has no <sol_end>; it was not saved")


# Paths
# input_dir: folder holding the hand-transcribed Word files that carry the section markers.
# output_dir: folder that receives one sub-folder of section_<id>.docx files per input file.
input_dir = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/Converting Handwriting PDF to Word File/Physics Word File'
output_dir = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/human_ocr'

# Process the directory containing DOCX files
process_directory(input_dir, output_dir)

Processing /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/Converting Handwriting PDF to Word File/Physics Word File/10_10021138351083421111694954514.docx
Extracted content for section 1 saved to: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/human_ocr/10_10021138351083421111694954514/section_1.docx
Extracted content for section 2 saved to: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/human_ocr/10_10021138351083421111694954514/section_2.docx
Extracted content for section 3 saved to: /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evalutation_phy/human_ocr/10_10021138351083421111694954514/section_3.docx
Extracted content for section 4 saved to: /Users/simrannaik/Desktop/solution_im

## To make a final comparison table docx for each question

In [None]:

from docx import Document
from docx.shared import Inches
import os
import re
import zipfile
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
from PIL import Image

# input_base: root of the human OCR sections; each sub-folder is processed in turn.
# gemini_base: root of the Gemini OCR sections, looked up by the same folder and file names.
# output_base: root that receives the generated comparison tables, one sub-folder per input folder.
input_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/humans_ocr'
gemini_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/gemini_768_ocr'
output_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables'


def _edit_distance(ref, hyp):
    """Return the Levenshtein distance between ``ref`` and ``hyp`` (pure Python).

    Uses the two-row dynamic programme, i.e. O(len(ref) * len(hyp)) time.
    """
    if len(ref) < len(hyp):
        # The distance is symmetric; keep the shorter string on the inner loop
        ref, hyp = hyp, ref
    previous = list(range(len(hyp) + 1))
    for i, ref_char in enumerate(ref, 1):
        current = [i]
        for j, hyp_char in enumerate(hyp, 1):
            current.append(min(
                previous[j] + 1,
                current[j - 1] + 1,
                previous[j - 1] + (ref_char != hyp_char),
            ))
        previous = current
    return previous[-1]


try:
    import Levenshtein  # pip install python-Levenshtein
    _distance = Levenshtein.distance
except ImportError:
    # Fall back to the same metric (only slower) so that the reported numbers
    # do not depend on whether the optional package is installed.
    _distance = _edit_distance


def cer(ref, hyp):
    """Return the character error rate of ``hyp`` measured against ``ref``.

    Computed as ``edit_distance(ref, hyp) / len(ref)``, so the value can
    exceed 1.0 when ``hyp`` is much longer than ``ref``. An empty reference
    yields 1.0.
    """
    if not ref:
        return 1.0
    return _distance(ref, hyp) / len(ref)

def extract_text_from_docx(docx_path):
    """Return the non-blank paragraph texts of a DOCX joined by newlines.

    Only ``Document.paragraphs`` is read. A path that does not exist yields
    an empty string.
    """
    if os.path.exists(docx_path):
        lines = []
        for paragraph in Document(docx_path).paragraphs:
            if paragraph.text.strip():
                lines.append(paragraph.text)
        return "\n".join(lines)
    return ""

def normalize_text(text):
    """Lower-case ``text`` and delete every whitespace character from it."""
    return re.sub(r'\s+', '', text.lower())

def extract_images_from_docx(docx_file_path, output_image_dir):
    """Copy the embedded media of a DOCX file into ``output_image_dir``.

    Every archive member below ``word/media/`` is written to
    ``output_image_dir`` under its base name and then checked with Pillow;
    files that fail the check are deleted again without a message.

    Args:
        docx_file_path: Path of the .docx (zip) file to read.
        output_image_dir: Existing directory that receives the image files.

    Returns:
        Paths of the images that were written and verified, in archive order;
        an empty list if the file embeds no media.
    """
    image_dir = 'word/media/'
    with zipfile.ZipFile(docx_file_path, 'r') as docx_zip:
        # Skip directory entries (names ending in '/'): their base name is
        # empty, so open() would target the output directory itself and fail.
        image_files = [
            name for name in docx_zip.namelist()
            if name.startswith(image_dir) and not name.endswith('/')
        ]
        if not image_files:
            return []
        extracted_images = []
        for image_file in image_files:
            image_filename = os.path.join(output_image_dir, os.path.basename(image_file))
            with open(image_filename, 'wb') as img_file:
                img_file.write(docx_zip.read(image_file))
            try:
                with Image.open(image_filename) as img:
                    img.verify()
            except Exception:
                # Best effort: anything Pillow cannot verify is dropped
                os.remove(image_filename)
            else:
                extracted_images.append(image_filename)
        return extracted_images

def copy_table(table, cell_doc):
    """Append a text-only copy of ``table`` to ``cell_doc``.

    A table with the same number of columns is created and one row is added
    per source row; only the ``text`` of each cell is carried over.
    """
    target = cell_doc.add_table(rows=0, cols=len(table.columns))
    for source_row in table.rows:
        target_cells = target.add_row().cells
        for position, source_cell in enumerate(source_row.cells):
            target_cells[position].text = source_cell.text

def iter_block_items(parent):
    """Yield the paragraphs and tables of ``parent``'s body in document order.

    ``CT_P`` children are wrapped in ``Paragraph`` and ``CT_Tbl`` children in
    ``Table``; children of any other type are skipped.
    """
    for element in parent.element.body.iterchildren():
        if isinstance(element, CT_Tbl):
            yield Table(element, parent)
            continue
        if isinstance(element, CT_P):
            yield Paragraph(element, parent)

def add_content_to_cell(doc_path, cell, image_dir):
    """Copy the paragraphs, tables and pictures of a DOCX into a table cell.

    Each paragraph's text is added as a new paragraph of ``cell``; every
    picture found in that paragraph follows in a paragraph of its own,
    2 inches wide. Tables are copied as text. If ``doc_path`` does not exist
    the cell receives the single paragraph ``"na"``.

    Args:
        doc_path: Path of the section .docx to copy from.
        cell: Target python-docx table cell.
        image_dir: Existing directory used to extract the section's images.
    """
    if not os.path.exists(doc_path):
        cell.add_paragraph("na")
        return
    doc = Document(doc_path)
    images = extract_images_from_docx(doc_path, image_dir)
    image_idx = 0
    for block in iter_block_items(doc):
        if isinstance(block, Paragraph):
            cell.add_paragraph(block.text)
            for run in block.runs:
                # Look for a real DrawingML element; a substring test on the
                # serialized XML also fires on runs whose text contains "graphic".
                if not run._r.xpath('.//a:graphic') or image_idx >= len(images):
                    continue
                picture_path = images[image_idx]
                # Consume the image even if inserting it fails, otherwise every
                # later picture would retry the same broken file.
                image_idx += 1
                try:
                    cell.add_paragraph().add_run().add_picture(picture_path, width=Inches(2))
                except Exception as e:
                    print(f"Error adding image: {e}")
        elif isinstance(block, Table):
            copy_table(block, cell)

# Loop through all subfolders in humans_ocr
# For every human section a 2x3 table document is written:
# header row, then human content | Gemini content | CER.
for folder_name in os.listdir(input_base):
    input_dir = os.path.join(input_base, folder_name)
    if not os.path.isdir(input_dir):
        continue
    output_dir = os.path.join(output_base, folder_name)
    os.makedirs(output_dir, exist_ok=True)
    # Sort numerically on the section number so section_10 follows section_9
    section_files = sorted(
        [f for f in os.listdir(input_dir) if f.startswith('section_') and f.endswith('.docx') and not f.endswith('_table.docx')],
        key=lambda x: int(x.split('_')[1].split('.')[0])
    )
    for section_file in section_files:
        section_path = os.path.join(input_dir, section_file)
        gemini_section_path = os.path.join(gemini_base, folder_name, section_file)
        # Prepare image extraction dirs
        image_dir_human = os.path.join(output_dir, "images_human")
        image_dir_gemini = os.path.join(output_dir, "images_gemini")
        os.makedirs(image_dir_human, exist_ok=True)
        os.makedirs(image_dir_gemini, exist_ok=True)
        # Create a new document for this table
        table_doc = Document()
        table = table_doc.add_table(rows=2, cols=3)
        table.style = 'Table Grid'
        table.cell(0, 0).text = 'Human OCR'
        table.cell(0, 1).text = 'Gemini OCR'
        table.cell(0, 2).text = 'CER (Character Error Rate)'
        # Add all content to the first cell of the second row (Human OCR)
        cell_human = table.cell(1, 0)
        cell_human._element.clear_content()
        add_content_to_cell(section_path, cell_human, image_dir_human)
        # Add all content to the second cell of the second row (Gemini OCR)
        cell_gemini = table.cell(1, 1)
        cell_gemini._element.clear_content()
        add_content_to_cell(gemini_section_path, cell_gemini, image_dir_gemini)
        # CER calculation on lower-cased, whitespace-free text
        human_text = normalize_text(extract_text_from_docx(section_path))
        gemini_text = normalize_text(extract_text_from_docx(gemini_section_path))
        if not human_text or not gemini_text or human_text == 'na' or gemini_text == 'na':
            cer_value = 'na'
            cer_display = 'na'
        else:
            cer_value = cer(human_text, gemini_text)
            cer_display = f"{cer_value*100:.1f}%"
        # Always fill the CER cell, so a missing or empty side shows "na"
        # instead of leaving the cell blank.
        table.cell(1, 2).text = cer_display
        # Save the table docx in the output subfolder with _table suffix
        table_filename = section_file.replace('.docx', '_table.docx')
        table_path = os.path.join(output_dir, table_filename)
        table_doc.save(table_path)
        print(f"Saved table for {section_file} to {table_path}")

Saved table for section_1.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_1_table.docx
Saved table for section_2.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_2_table.docx
Saved table for section_3.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_3_table.docx
Saved table for section_4.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_4_table.docx
Saved table for section_1.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_gr

In [26]:
from docx import Document
from docx.shared import Inches
import os
import re
import zipfile
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
from PIL import Image

# input_base: root of the human OCR sections; each sub-folder is processed in turn.
# gemini_base: root of the Gemini OCR sections, looked up by the same folder name.
# output_base: root that receives the generated comparison tables, one sub-folder per input folder.
input_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/humans_ocr'
gemini_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/gemini_768_ocr'
output_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables'

def _edit_distance(ref, hyp):
    """Return the Levenshtein distance between ``ref`` and ``hyp`` (pure Python).

    Uses the two-row dynamic programme, i.e. O(len(ref) * len(hyp)) time.
    """
    if len(ref) < len(hyp):
        # The distance is symmetric; keep the shorter string on the inner loop
        ref, hyp = hyp, ref
    previous = list(range(len(hyp) + 1))
    for i, ref_char in enumerate(ref, 1):
        current = [i]
        for j, hyp_char in enumerate(hyp, 1):
            current.append(min(
                previous[j] + 1,
                current[j - 1] + 1,
                previous[j - 1] + (ref_char != hyp_char),
            ))
        previous = current
    return previous[-1]


try:
    import Levenshtein
    _distance = Levenshtein.distance
except ImportError:
    # Fall back to the same metric (only slower) so that the reported numbers
    # do not depend on whether the optional package is installed.
    _distance = _edit_distance


def cer(ref, hyp):
    """Return the character error rate of ``hyp`` measured against ``ref``.

    Computed as ``edit_distance(ref, hyp) / len(ref)``, so the value can
    exceed 1.0 when ``hyp`` is much longer than ``ref``. An empty reference
    yields 1.0.
    """
    if not ref:
        return 1.0
    return _distance(ref, hyp) / len(ref)

def extract_text_from_docx(docx_path):
    """Return the non-blank paragraph texts of a DOCX joined by newlines.

    Only ``Document.paragraphs`` is read. A path that does not exist yields
    an empty string.
    """
    if os.path.exists(docx_path):
        lines = []
        for paragraph in Document(docx_path).paragraphs:
            if paragraph.text.strip():
                lines.append(paragraph.text)
        return "\n".join(lines)
    return ""

def normalize_text(text):
    """Lower-case ``text`` and delete every whitespace character from it."""
    return re.sub(r'\s+', '', text.lower())

def extract_images_from_docx(docx_file_path, output_image_dir):
    """Copy the embedded media of a DOCX file into ``output_image_dir``.

    Every archive member below ``word/media/`` is written to
    ``output_image_dir`` under its base name and then checked with Pillow;
    files that fail the check are deleted again without a message.

    Args:
        docx_file_path: Path of the .docx (zip) file to read.
        output_image_dir: Existing directory that receives the image files.

    Returns:
        Paths of the images that were written and verified, in archive order;
        an empty list if the file embeds no media.
    """
    image_dir = 'word/media/'
    with zipfile.ZipFile(docx_file_path, 'r') as docx_zip:
        # Skip directory entries (names ending in '/'): their base name is
        # empty, so open() would target the output directory itself and fail.
        image_files = [
            name for name in docx_zip.namelist()
            if name.startswith(image_dir) and not name.endswith('/')
        ]
        if not image_files:
            return []
        extracted_images = []
        for image_file in image_files:
            image_filename = os.path.join(output_image_dir, os.path.basename(image_file))
            with open(image_filename, 'wb') as img_file:
                img_file.write(docx_zip.read(image_file))
            try:
                with Image.open(image_filename) as img:
                    img.verify()
            except Exception:
                # Best effort: anything Pillow cannot verify is dropped
                os.remove(image_filename)
            else:
                extracted_images.append(image_filename)
        return extracted_images

def copy_table(table, cell_doc):
    """Append a text-only copy of ``table`` to ``cell_doc``.

    A table with the same number of columns is created and one row is added
    per source row; only the ``text`` of each cell is carried over.
    """
    target = cell_doc.add_table(rows=0, cols=len(table.columns))
    for source_row in table.rows:
        target_cells = target.add_row().cells
        for position, source_cell in enumerate(source_row.cells):
            target_cells[position].text = source_cell.text

def iter_block_items(parent):
    """Yield the paragraphs and tables of ``parent``'s body in document order.

    ``CT_P`` children are wrapped in ``Paragraph`` and ``CT_Tbl`` children in
    ``Table``; children of any other type are skipped.
    """
    for element in parent.element.body.iterchildren():
        if isinstance(element, CT_Tbl):
            yield Table(element, parent)
            continue
        if isinstance(element, CT_P):
            yield Paragraph(element, parent)

def add_content_to_cell(doc_path, cell, image_dir):
    """Copy the paragraphs, tables and pictures of a DOCX into a table cell.

    Each paragraph's text is added as a new paragraph of ``cell``; every
    picture found in that paragraph follows in a paragraph of its own,
    2 inches wide. Tables are copied as text. If ``doc_path`` does not exist
    the cell receives the single paragraph ``"na"``.

    Args:
        doc_path: Path of the section .docx to copy from.
        cell: Target python-docx table cell.
        image_dir: Existing directory used to extract the section's images.
    """
    if not os.path.exists(doc_path):
        cell.add_paragraph("na")
        return
    doc = Document(doc_path)
    images = extract_images_from_docx(doc_path, image_dir)
    image_idx = 0
    for block in iter_block_items(doc):
        if isinstance(block, Paragraph):
            cell.add_paragraph(block.text)
            for run in block.runs:
                # Look for a real DrawingML element; a substring test on the
                # serialized XML also fires on runs whose text contains "graphic".
                if not run._r.xpath('.//a:graphic') or image_idx >= len(images):
                    continue
                picture_path = images[image_idx]
                # Consume the image even if inserting it fails, otherwise every
                # later picture would retry the same broken file.
                image_idx += 1
                try:
                    cell.add_paragraph().add_run().add_picture(picture_path, width=Inches(2))
                except Exception as e:
                    print(f"Error adding image: {e}")
        elif isinstance(block, Table):
            copy_table(block, cell)

# Logging variables
total_folders = 0            # sub-folders of input_base that were visited
total_sections_human = 0     # section files found on the human side
total_sections_gemini = 0    # section files found on the Gemini side
total_sections_both = 0      # section files present on both sides
total_processed = 0          # tables written successfully
missing_in_human = []        # "<folder>/<file>" present only on the Gemini side
missing_in_gemini = []       # "<folder>/<file>" present only on the human side
processed_sections = []      # "<folder>/<file>" of every table written
skipped_sections = []        # "<folder>/<file>" of every section that raised

# Loop through all subfolders in humans_ocr
# Only sections that exist on BOTH sides get a 2x3 table document:
# header row, then human content | Gemini content | CER.
for folder_name in os.listdir(input_base):
    input_dir = os.path.join(input_base, folder_name)
    gemini_dir = os.path.join(gemini_base, folder_name)
    if not os.path.isdir(input_dir):
        continue
    total_folders += 1
    output_dir = os.path.join(output_base, folder_name)
    os.makedirs(output_dir, exist_ok=True)

    # Get all section files in both folders
    human_sections = set(
        f for f in os.listdir(input_dir)
        if f.startswith('section_') and f.endswith('.docx') and not f.endswith('_table.docx')
    )
    # isdir (not exists): a stray file with the folder's name must not reach listdir
    gemini_sections = set(
        f for f in os.listdir(gemini_dir)
        if f.startswith('section_') and f.endswith('.docx') and not f.endswith('_table.docx')
    ) if os.path.isdir(gemini_dir) else set()

    total_sections_human += len(human_sections)
    total_sections_gemini += len(gemini_sections)

    # Sections present in both, sorted numerically on the section number
    common_sections = sorted(human_sections & gemini_sections, key=lambda x: int(x.split('_')[1].split('.')[0]))
    total_sections_both += len(common_sections)

    # Log missing
    missing_in_human.extend([f"{folder_name}/{s}" for s in gemini_sections - human_sections])
    missing_in_gemini.extend([f"{folder_name}/{s}" for s in human_sections - gemini_sections])

    for section_file in common_sections:
        section_path = os.path.join(input_dir, section_file)
        gemini_section_path = os.path.join(gemini_dir, section_file)
        image_dir_human = os.path.join(output_dir, "images_human")
        image_dir_gemini = os.path.join(output_dir, "images_gemini")
        os.makedirs(image_dir_human, exist_ok=True)
        os.makedirs(image_dir_gemini, exist_ok=True)
        try:
            table_doc = Document()
            table = table_doc.add_table(rows=2, cols=3)
            table.style = 'Table Grid'
            table.cell(0, 0).text = 'Human OCR'
            table.cell(0, 1).text = 'Gemini OCR'
            table.cell(0, 2).text = 'CER (Character Error Rate)'
            cell_human = table.cell(1, 0)
            cell_human._element.clear_content()
            add_content_to_cell(section_path, cell_human, image_dir_human)
            cell_gemini = table.cell(1, 1)
            cell_gemini._element.clear_content()
            add_content_to_cell(gemini_section_path, cell_gemini, image_dir_gemini)
            # CER on lower-cased, whitespace-free text
            human_text = normalize_text(extract_text_from_docx(section_path))
            gemini_text = normalize_text(extract_text_from_docx(gemini_section_path))
            if not human_text or not gemini_text or human_text == 'na' or gemini_text == 'na':
                cer_value = 'na'
                cer_display = 'na'
            else:
                cer_value = cer(human_text, gemini_text)
                cer_display = f"{cer_value*100:.1f}%"
            # Always fill the CER cell, so an empty side shows "na" instead
            # of leaving the cell blank.
            table.cell(1, 2).text = cer_display
            table_filename = section_file.replace('.docx', '_table.docx')
            table_path = os.path.join(output_dir, table_filename)
            table_doc.save(table_path)
            print(f"Saved table for {folder_name}/{section_file} to {table_path}")
            total_processed += 1
            processed_sections.append(f"{folder_name}/{section_file}")
        except Exception as e:
            # Keep going: one broken section must not stop the whole batch
            print(f"Error processing {folder_name}/{section_file}: {e}")
            skipped_sections.append(f"{folder_name}/{section_file}")

# Print summary
print("\n==== SUMMARY ====")
print(f"Total folders processed: {total_folders}")
print(f"Total section files in Human OCR: {total_sections_human}")
print(f"Total section files in Gemini OCR: {total_sections_gemini}")
print(f"Total section files present in BOTH: {total_sections_both}")
print(f"Total section files processed (tables created): {total_processed}")
if missing_in_human:
    print("\nSection files missing in Human OCR (present in Gemini but not Human):")
    for s in missing_in_human:
        print("  -", s)
if missing_in_gemini:
    print("\nSection files missing in Gemini OCR (present in Human but not Gemini):")
    for s in missing_in_gemini:
        print("  -", s)
if skipped_sections:
    print("\nSection files that failed to process (error):")
    for s in skipped_sections:
        print("  -", s)
print("=================\n")

Saved table for 01_1002115268961841141690701450/section_1.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_1_table.docx
Saved table for 01_1002115268961841141690701450/section_2.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_2_table.docx
Saved table for 01_1002115268961841141690701450/section_3.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_3_table.docx
Saved table for 01_1002115268961841141690701450/section_4.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/s

In [27]:
from docx import Document
from docx.shared import Inches
import os
import re
import zipfile
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
from PIL import Image

# input_base: root of the human OCR sections; each sub-folder is processed in turn.
# gemini_base: root of the Gemini OCR sections, looked up by the same folder name.
# output_base: root under which one output sub-folder per input folder is created.
input_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/humans_ocr'
gemini_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/gemini_768_ocr'
output_base = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables'

def _edit_distance(ref, hyp):
    """Return the Levenshtein distance between ``ref`` and ``hyp`` (pure Python).

    Uses the two-row dynamic programme, i.e. O(len(ref) * len(hyp)) time.
    """
    if len(ref) < len(hyp):
        # The distance is symmetric; keep the shorter string on the inner loop
        ref, hyp = hyp, ref
    previous = list(range(len(hyp) + 1))
    for i, ref_char in enumerate(ref, 1):
        current = [i]
        for j, hyp_char in enumerate(hyp, 1):
            current.append(min(
                previous[j] + 1,
                current[j - 1] + 1,
                previous[j - 1] + (ref_char != hyp_char),
            ))
        previous = current
    return previous[-1]


try:
    import Levenshtein
    _distance = Levenshtein.distance
except ImportError:
    # Fall back to the same metric (only slower) so that the reported numbers
    # do not depend on whether the optional package is installed.
    _distance = _edit_distance


def cer(ref, hyp):
    """Return the character error rate of ``hyp`` measured against ``ref``.

    Computed as ``edit_distance(ref, hyp) / len(ref)``, so the value can
    exceed 1.0 when ``hyp`` is much longer than ``ref``. An empty reference
    yields 1.0.
    """
    if not ref:
        return 1.0
    return _distance(ref, hyp) / len(ref)

def extract_text_from_docx(docx_path):
    """Return the non-blank paragraph texts of a DOCX joined by newlines.

    Only ``Document.paragraphs`` is read. A path that does not exist yields
    an empty string.
    """
    if os.path.exists(docx_path):
        lines = []
        for paragraph in Document(docx_path).paragraphs:
            if paragraph.text.strip():
                lines.append(paragraph.text)
        return "\n".join(lines)
    return ""

def normalize_text(text):
    """Lower-case ``text`` and delete every whitespace character from it."""
    return re.sub(r'\s+', '', text.lower())

def extract_images_from_docx(docx_file_path, output_image_dir):
    """Copy the embedded media of a DOCX file into ``output_image_dir``.

    Every archive member below ``word/media/`` is written to
    ``output_image_dir`` under its base name and then checked with Pillow;
    files that fail the check are deleted again without a message.

    Args:
        docx_file_path: Path of the .docx (zip) file to read.
        output_image_dir: Existing directory that receives the image files.

    Returns:
        Paths of the images that were written and verified, in archive order;
        an empty list if the file embeds no media.
    """
    image_dir = 'word/media/'
    with zipfile.ZipFile(docx_file_path, 'r') as docx_zip:
        # Skip directory entries (names ending in '/'): their base name is
        # empty, so open() would target the output directory itself and fail.
        image_files = [
            name for name in docx_zip.namelist()
            if name.startswith(image_dir) and not name.endswith('/')
        ]
        if not image_files:
            return []
        extracted_images = []
        for image_file in image_files:
            image_filename = os.path.join(output_image_dir, os.path.basename(image_file))
            with open(image_filename, 'wb') as img_file:
                img_file.write(docx_zip.read(image_file))
            try:
                with Image.open(image_filename) as img:
                    img.verify()
            except Exception:
                # Best effort: anything Pillow cannot verify is dropped
                os.remove(image_filename)
            else:
                extracted_images.append(image_filename)
        return extracted_images

def copy_table(table, cell_doc):
    """Append a text-only copy of ``table`` to ``cell_doc``.

    A table with the same number of columns is created and one row is added
    per source row; only the ``text`` of each cell is carried over.
    """
    target = cell_doc.add_table(rows=0, cols=len(table.columns))
    for source_row in table.rows:
        target_cells = target.add_row().cells
        for position, source_cell in enumerate(source_row.cells):
            target_cells[position].text = source_cell.text

def iter_block_items(parent):
    """Yield the paragraphs and tables of ``parent``'s body in document order.

    ``CT_P`` children are wrapped in ``Paragraph`` and ``CT_Tbl`` children in
    ``Table``; children of any other type are skipped.
    """
    for element in parent.element.body.iterchildren():
        if isinstance(element, CT_Tbl):
            yield Table(element, parent)
            continue
        if isinstance(element, CT_P):
            yield Paragraph(element, parent)

def add_content_to_cell(doc_path, cell, image_dir):
    """Copy the paragraphs, pictures and tables of a .docx file into *cell*.

    When *doc_path* does not exist the cell only receives the text "na".
    Otherwise the document's images are extracted into *image_dir* and its
    body is replayed in order: every paragraph's text is appended, each run
    whose XML mentions 'graphic' consumes the next extracted image (inserted
    2 inches wide), and every table is copied as plain text.
    """
    if not os.path.exists(doc_path):
        cell.add_paragraph("na")
        return

    source = Document(doc_path)
    pictures = extract_images_from_docx(doc_path, image_dir)
    next_picture = 0

    for item in iter_block_items(source):
        if isinstance(item, Table):
            copy_table(item, cell)
            continue
        if not isinstance(item, Paragraph):
            continue
        cell.add_paragraph(item.text)
        for run in item.runs:
            has_graphic = 'graphic' in run._r.xml
            if not (has_graphic and next_picture < len(pictures)):
                continue
            try:
                picture_run = cell.add_paragraph().add_run()
                picture_run.add_picture(pictures[next_picture], width=Inches(2))
                next_picture += 1
            except Exception as e:
                # Best effort: report the problem and keep building the cell.
                print(f"Error adding image: {e}")

def _section_sort_key(filename):
    """Sort key for 'section_<n>...' names: numeric ones by <n>, all others last."""
    token = filename.split('_')[1].split('.')[0]
    try:
        return (0, int(token), filename)
    except ValueError:
        # e.g. 'section_final.docx': keep the file instead of aborting the whole run
        return (1, 0, filename)


def _list_section_files(directory):
    """Return the 'section_*.docx' names in *directory*, excluding '*_table.docx' outputs."""
    return {
        f for f in os.listdir(directory)
        if f.startswith('section_') and f.endswith('.docx') and not f.endswith('_table.docx')
    }


# Run-wide counters and bookkeeping lists reported in the summary below.
folders_processed = 0
sections_human = 0
sections_gemini = 0
sections_union = 0
processed = 0
missing_in_human = []
missing_in_gemini = []
processed_sections = []
skipped_sections = []

for folder_name in os.listdir(input_base):
    input_dir = os.path.join(input_base, folder_name)
    gemini_dir = os.path.join(gemini_base, folder_name)
    if not os.path.isdir(input_dir):
        continue
    folders_processed += 1
    output_dir = os.path.join(output_base, folder_name)
    os.makedirs(output_dir, exist_ok=True)

    human_sections = _list_section_files(input_dir)
    # isdir (not exists): a stray file with the folder's name must not reach os.listdir
    gemini_sections = _list_section_files(gemini_dir) if os.path.isdir(gemini_dir) else set()

    sections_human += len(human_sections)
    sections_gemini += len(gemini_sections)

    # Union of all section files, so a section present on only one side still gets a table
    all_sections = sorted(human_sections | gemini_sections, key=_section_sort_key)
    sections_union += len(all_sections)
    if not all_sections:
        continue

    # The image folders are shared by every section of this folder: create them once.
    image_dir_human = os.path.join(output_dir, "images_human")
    image_dir_gemini = os.path.join(output_dir, "images_gemini")
    os.makedirs(image_dir_human, exist_ok=True)
    os.makedirs(image_dir_gemini, exist_ok=True)

    for section_file in all_sections:
        section_path = os.path.join(input_dir, section_file)
        gemini_section_path = os.path.join(gemini_dir, section_file)
        try:
            human_exists = os.path.exists(section_path)
            gemini_exists = os.path.exists(gemini_section_path)

            # One header row plus one content row: Human | Gemini | CER
            table_doc = Document()
            table = table_doc.add_table(rows=2, cols=3)
            table.style = 'Table Grid'
            table.cell(0, 0).text = 'Human OCR'
            table.cell(0, 1).text = 'Gemini OCR'
            table.cell(0, 2).text = 'CER (Character Error Rate)'

            cell_human = table.cell(1, 0)
            cell_human._element.clear_content()
            if human_exists:
                add_content_to_cell(section_path, cell_human, image_dir_human)
            else:
                cell_human.add_paragraph('na')
                missing_in_human.append(f"{folder_name}/{section_file}")

            cell_gemini = table.cell(1, 1)
            cell_gemini._element.clear_content()
            if gemini_exists:
                add_content_to_cell(gemini_section_path, cell_gemini, image_dir_gemini)
            else:
                cell_gemini.add_paragraph('na')
                missing_in_gemini.append(f"{folder_name}/{section_file}")

            # CER calculation: only when both sides exist and carry real text
            cer_value = 'na'
            if human_exists and gemini_exists:
                human_text = normalize_text(extract_text_from_docx(section_path))
                gemini_text = normalize_text(extract_text_from_docx(gemini_section_path))
                if human_text and gemini_text and human_text != 'na' and gemini_text != 'na':
                    cer_value = cer(human_text, gemini_text)
            table.cell(1, 2).text = 'na' if cer_value == 'na' else f"{cer_value*100:.1f}%"

            table_filename = section_file.replace('.docx', '_table.docx')
            table_path = os.path.join(output_dir, table_filename)
            table_doc.save(table_path)
            print(f"Saved table for {folder_name}/{section_file} to {table_path}")
            processed += 1
            processed_sections.append(f"{folder_name}/{section_file}")
        except Exception as e:
            # One broken section must not stop the batch; it is listed in the summary.
            print(f"Error processing {folder_name}/{section_file}: {e}")
            skipped_sections.append(f"{folder_name}/{section_file}")

# Print summary
print("\n==== SUMMARY ====")
print(f"Total folders processed: {folders_processed}")
print(f"Total section files in Human OCR: {sections_human}")
print(f"Total section files in Gemini OCR: {sections_gemini}")
print(f"Total unique section files (union): {sections_union}")
print(f"Total section files processed (tables created): {processed}")
if missing_in_human:
    print("\nSection files missing in Human OCR:")
    for s in missing_in_human:
        print("  -", s)
if missing_in_gemini:
    print("\nSection files missing in Gemini OCR:")
    for s in missing_in_gemini:
        print("  -", s)
if skipped_sections:
    print("\nSection files that failed to process (error):")
    for s in skipped_sections:
        print("  -", s)
print("=================\n")


Saved table for 01_1002115268961841141690701450/section_1.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_1_table.docx
Saved table for 01_1002115268961841141690701450/section_2.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_2_table.docx
Saved table for 01_1002115268961841141690701450/section_3.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/section_3_table.docx
Saved table for 01_1002115268961841141690701450/section_4.docx to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables/01_1002115268961841141690701450/s

"""
Step	What it does
Normalize text	Lowercase, remove all whitespace
Compute distance	Levenshtein distance between the two strings
Divide	By length of reference (human_text)
Result	CER = (# edits) / (length of reference)
What does this mean?
CER (Character Error Rate) tells you the proportion of characters that are different between the two strings, relative to the reference.
A CER of 0 means the strings are identical.
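A CER of 1 (or 100%) means the number of edits equals the length of the reference (for example, every character is different), or the reference is empty and the hypothesis is not. CER can exceed 1 when the hypothesis is much longer than the reference.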
human_text is the reference (ground truth, e.g., Human OCR output).
gemini_text is the hypothesis (e.g., Gemini OCR output).
Levenshtein distance is a metric for measuring the difference between two sequences (usually strings).
It is defined as the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into the other.
"""

## to make ocr in the table and get the cer

In [37]:
## ALL THE 3 COLUMNS ARE TAKEN FROM THE TABLE 

import os
import csv
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph

def get_all_docx_files(root_dir):
    """Return every .docx path found under *root_dir*, in sorted order.

    The result is sorted so the exported rows come out in the same order on
    every run (``os.walk`` gives no ordering guarantee).  Names starting with
    '~$' are skipped: those are the lock files Word leaves next to an open
    document and they are not readable as documents.
    """
    docx_files = []
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.startswith('~$'):
                continue
            if filename.endswith('.docx'):
                docx_files.append(os.path.join(dirpath, filename))
    docx_files.sort()
    return docx_files

def get_cell_text_with_inner_tables(cell):
    """Return the text of *cell*, flattening any nested tables.

    Paragraph children contribute their text; each row of a nested table
    contributes its cell texts joined by ' | ' (nested cells are handled
    recursively).  All pieces are joined with newlines.
    """
    pieces = []
    for child in cell._element:
        tag = child.tag
        if tag.endswith('tbl'):
            nested = Table(child, cell)
            pieces.extend(
                ' | '.join(get_cell_text_with_inner_tables(inner) for inner in nested_row.cells)
                for nested_row in nested.rows
            )
        elif tag.endswith('p'):
            pieces.append(Paragraph(child, cell).text)
    return '\n'.join(pieces)

def extract_second_rows_to_csv(docx_files, csv_path, root_dir):
    """Write the second row of every table in *docx_files* to one CSV file.

    The CSV header is 'docx_file', 'table_index' plus the first-row cell
    texts of the first table that has at least two rows.  Every later row
    holds the document path relative to *root_dir*, the table index, and
    the second-row cell texts padded or truncated to the header width.
    """
    column_names = None  # set from the first usable table
    with open(csv_path, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        for path in docx_files:
            document = Document(path)
            for table_number, tbl in enumerate(document.tables):
                table_rows = tbl.rows
                if len(table_rows) < 2:
                    continue  # nothing to export without a second row
                if column_names is None:
                    column_names = [c.text.strip() for c in table_rows[0].cells]
                    out.writerow(['docx_file', 'table_index'] + column_names)
                width = len(column_names)
                values = [get_cell_text_with_inner_tables(c).strip() for c in table_rows[1].cells]
                values.extend([''] * width)
                out.writerow([os.path.relpath(path, root_dir), table_number] + values[:width])

# Entry point: gather every .docx under root_dir and export the second row of
# each table (all columns, plus file path and table index) into one CSV.
if __name__ == "__main__":
    # Folder scanned recursively for .docx files
    root_dir = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables'
    # Destination CSV (overwritten on every run)
    csv_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/all_tables_second_row_with_inner_tables.csv'

    all_docx_files = get_all_docx_files(root_dir)
    extract_second_rows_to_csv(all_docx_files, csv_path, root_dir)
    print(f"CSV saved to {csv_path}")

CSV saved to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/all_tables_second_row_with_inner_tables.csv


In [38]:
## ONLY 2 COLUMNS ARE TAKEN FROM THE TABLE 
import os
import csv
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph

def get_all_docx_files(root_dir):
    """Return every .docx path found under *root_dir*, in sorted order.

    Sorting makes the CSV row order reproducible, because ``os.walk`` does
    not guarantee any ordering.  Files whose name starts with '~$' (the lock
    files Word creates beside an open document) are left out since they
    cannot be opened as documents.
    """
    found = []
    for dirpath, _, filenames in os.walk(root_dir):
        found.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if filename.endswith('.docx') and not filename.startswith('~$')
        )
    return sorted(found)

def get_cell_text_with_inner_tables(cell):
    """Collect the text of *cell*, including text held in tables nested inside it.

    Each paragraph child adds its text; each nested-table row adds its cell
    texts (resolved recursively) joined by ' | '.  The collected pieces are
    returned joined by newlines.
    """
    lines = []
    for node in cell._element:
        if node.tag.endswith('tbl'):
            for inner_row in Table(node, cell).rows:
                columns = []
                for inner_cell in inner_row.cells:
                    columns.append(get_cell_text_with_inner_tables(inner_cell))
                lines.append(' | '.join(columns))
            continue
        if node.tag.endswith('p'):
            lines.append(Paragraph(node, cell).text)
    return '\n'.join(lines)

def extract_second_rows_to_csv(docx_files, csv_path, root_dir):
    """Write the first two cells of every table's second row to one CSV file.

    Tables with fewer than two rows, or fewer than two cells in the second
    row, are skipped.  The header line is taken from the first two cells of
    the first row of the first table that qualifies.  *root_dir* is accepted
    but not used by this variant.
    """
    wrote_header = False
    with open(csv_path, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        for path in docx_files:
            document = Document(path)
            for tbl in document.tables:
                table_rows = tbl.rows
                if len(table_rows) < 2:
                    continue  # no second row
                if len(table_rows[1].cells) < 2:
                    continue  # second row has fewer than two columns
                if not wrote_header:
                    out.writerow([c.text.strip() for c in table_rows[0].cells[:2]])
                    wrote_header = True
                pair = [get_cell_text_with_inner_tables(c).strip() for c in table_rows[1].cells[:2]]
                pair += [''] * 2  # pad, then cut back to exactly two columns
                out.writerow(pair[:2])

# Entry point: gather every .docx under root_dir and export only the first two
# cells of each table's second row into one CSV.
if __name__ == "__main__":
    # Folder scanned recursively for .docx files
    root_dir = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/tables'
    # Destination CSV (overwritten on every run)
    csv_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/all_tables_second_row_two_columns.csv'

    all_docx_files = get_all_docx_files(root_dir)
    extract_second_rows_to_csv(all_docx_files, csv_path, root_dir)
    print(f"CSV saved to {csv_path}")

CSV saved to /Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/all_tables_second_row_two_columns.csv


"""
Feature	Code 1	Code 2	Code 3
Removes spaces	No	Yes	Yes (and more)
Removes all whitespace	No	No	Yes
Handles 'na' as missing	No	No	Yes
Skips CER for 'na' rows	No	No	Yes
Skips tabs/newlines in CER	No	No	Yes
"""

In [42]:
import pandas as pd
from difflib import ndiff

def char_error_rate(s1, s2):
    """Return (insertions + deletions) / len(s1) from a character-level ndiff.

    *s1* is the reference and *s2* the hypothesis.  With an empty reference
    the result is 0 when *s2* is empty too, otherwise 1.
    """
    reference_length = len(s1)
    if reference_length == 0:
        return 0 if len(s2) == 0 else 1
    changed = 0
    for entry in ndiff(s1, s2):
        if entry[0] in ('+', '-'):
            changed += 1
    return changed / reference_length

def highlight_differences(s1, s2):
    """Render a character-level diff of *s1* against *s2* as one string.

    Unchanged characters are copied as they are, characters only in *s1*
    appear as ``[-c-]`` and characters only in *s2* as ``[+c+]``.
    """
    def render(entry):
        tag, char = entry[0], entry[2]
        if tag == '+':
            return f"[+{char}+]"
        if tag == '-':
            return f"[-{char}-]"
        if tag == ' ':
            return char
        return ''  # any other ndiff line adds nothing

    return ''.join(render(entry) for entry in ndiff(s1, s2))

# Load the two-column CSV: first column is the reference text, second the hypothesis.
csv_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/all_tables_second_row_two_columns.csv'
# dtype=str + keep_default_na=False keep every cell exactly as written.  With the
# pandas defaults an empty cell (or text such as "NA", "null", "None") is read as
# NaN, which str() below would turn into the literal text "nan" and score as if
# it were OCR output.
csv_data = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

# Columns are addressed by position (0 = reference, 1 = hypothesis), so the
# header names do not matter; use print(csv_data.columns) to inspect them.
csv_data['CER'] = csv_data.apply(lambda row: char_error_rate(str(row.iloc[0]), str(row.iloc[1])), axis=1)
csv_data['highlight_diff'] = csv_data.apply(lambda row: highlight_differences(str(row.iloc[0]), str(row.iloc[1])), axis=1)
overall_cer = csv_data['CER'].mean()

print("Overall CER:", overall_cer)
print(csv_data[['CER', 'highlight_diff']].head())

# Optionally, save the results
csv_data.to_csv(csv_path.replace('.csv', '_with_cer.csv'), index=False)

Overall CER: 0.560086063149898
        CER                                     highlight_diff
0  0.232558  [+A+][+n+][+s+][+ +]2.[-\t-][+ +]Largest arter...
1  0.051724  [+A+][+n+][+s+][+ +]3.[-\t-][+ +]The process o...
2  0.100000  [+A+][+n+][+s+][+ +]1.[-\t-][+ +]Left atrium r...
3  0.048673  [+A+][+n+][+s+][+ +]4.[-\t-][+ +]Functions of ...
4  0.857143               2[-.-][-\t-][+-+][+>+][+ +]Ao[+r+]ta


In [43]:
import pandas as pd
from difflib import ndiff

def char_error_rate(s1, s2):
    """Return (insertions + deletions) / len(reference), ignoring space characters.

    Spaces (' ' only) are removed from both strings before a character-level
    ndiff is taken; *s1* is the reference.  With an empty reference the
    result is 0 when the hypothesis is empty too, otherwise 1.
    """
    reference = s1.replace(' ', '')
    hypothesis = s2.replace(' ', '')

    reference_length = len(reference)
    if reference_length == 0:
        return 0 if len(hypothesis) == 0 else 1

    changed = 0
    for entry in ndiff(reference, hypothesis):
        if entry[0] in ('+', '-'):
            changed += 1
    return changed / reference_length

def highlight_differences(s1, s2):
    """Render a character-level diff of *s1* against *s2*, ignoring spaces.

    Space characters (' ' only) are removed from both strings first.
    Unchanged characters are copied, characters only in *s1* appear as
    ``[-c-]`` and characters only in *s2* as ``[+c+]``.
    """
    reference = s1.replace(' ', '')
    hypothesis = s2.replace(' ', '')

    def render(entry):
        tag, char = entry[0], entry[2]
        if tag == '+':
            return f"[+{char}+]"
        if tag == '-':
            return f"[-{char}-]"
        if tag == ' ':
            return char
        return ''  # any other ndiff line adds nothing

    return ''.join(render(entry) for entry in ndiff(reference, hypothesis))

# Load the two-column CSV: first column is the reference text, second the hypothesis.
csv_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/all_tables_second_row_two_columns.csv'
# dtype=str + keep_default_na=False keep every cell exactly as written.  With the
# pandas defaults an empty cell (or text such as "NA", "null", "None") is read as
# NaN, which str() below would turn into the literal text "nan" and score as if
# it were OCR output.
csv_data = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

# Columns are addressed by position (0 = reference, 1 = hypothesis), so the
# header names do not matter; use print(csv_data.columns) to inspect them.
csv_data['CER'] = csv_data.apply(lambda row: char_error_rate(str(row.iloc[0]), str(row.iloc[1])), axis=1)
csv_data['highlight_diff'] = csv_data.apply(lambda row: highlight_differences(str(row.iloc[0]), str(row.iloc[1])), axis=1)
overall_cer = csv_data['CER'].mean()

print("Overall CER:", overall_cer)
print(csv_data[['CER', 'highlight_diff']].head())

# Optionally, save the results
csv_data.to_csv(csv_path.replace('.csv', '_with_cer_space.csv'), index=False)


Overall CER: 0.520081891192067
        CER                                     highlight_diff
0  0.216216  [+A+][+n+][+s+]2.[-\t-]Largestarteryinhumanbod...
1  0.046980  [+A+][+n+][+s+]3.[-\t-]Theprocessoftranspor[+t...
2  0.076923  [+A+][+n+][+s+]1.[-\t-]Leftatriumreceivestheox...
3  0.041885  [+A+][+n+][+s+]4.[-\t-]Functionsoflymphs:-\n(1...
4  0.714286                    2[-.-][-\t-][+-+][+>+]Ao[+r+]ta


In [44]:
import pandas as pd
from difflib import ndiff
import re
import numpy as np

def char_error_rate(s1, s2):
    """Return (insertions + deletions) / len(reference) with all whitespace ignored.

    *s1* is the reference and *s2* the hypothesis.  A side counts as missing,
    and the result is ``np.nan``, when it is ``None``, a float NaN, or the
    text 'na' (any case, surrounding whitespace ignored).  Other non-string
    values are converted with ``str()``.  With an empty reference the result
    is 0 when the hypothesis is empty too, otherwise 1.
    """
    cleaned = []
    for value in (s1, s2):
        # None / NaN would otherwise crash re.sub or be scored as the text "nan"
        if value is None or (isinstance(value, float) and value != value):
            return np.nan
        text = str(value)
        if text.strip().lower() == 'na':
            return np.nan
        cleaned.append(re.sub(r'\s+', '', text))
    reference, hypothesis = cleaned

    if not reference:
        return 0 if not hypothesis else 1
    changed = sum(1 for entry in ndiff(reference, hypothesis) if entry[0] in ('+', '-'))
    return changed / len(reference)

def highlight_differences(s1, s2):
    """Render a character-level diff of *s1* against *s2* with whitespace ignored.

    Returns 'na' when either side is missing: ``None``, a float NaN, or the
    text 'na' (any case, surrounding whitespace ignored).  Other non-string
    values are converted with ``str()``.  Otherwise unchanged characters are
    copied, characters only in *s1* appear as ``[-c-]`` and characters only
    in *s2* as ``[+c+]``.
    """
    cleaned = []
    for value in (s1, s2):
        # None / NaN would otherwise crash re.sub or be diffed as the text "nan"
        if value is None or (isinstance(value, float) and value != value):
            return 'na'
        text = str(value)
        if text.strip().lower() == 'na':
            return 'na'
        cleaned.append(re.sub(r'\s+', '', text))
    reference, hypothesis = cleaned

    result = []
    for entry in ndiff(reference, hypothesis):
        tag, char = entry[0], entry[2]
        if tag == ' ':
            result.append(char)
        elif tag == '-':
            result.append(f"[-{char}-]")
        elif tag == '+':
            result.append(f"[+{char}+]")
    return ''.join(result)

# Load the two-column CSV: first column is the reference text, second the hypothesis.
csv_path = '/Users/simrannaik/Desktop/solution_improvement/ds-prototypes/subjective_grading/solution_improvement/OCR_gd_gem/final_evaluation/all_tables_second_row_two_columns.csv'
# dtype=str + keep_default_na=False keep every cell exactly as written.  With the
# pandas defaults an empty cell (or text such as "NA", "null", "None") is read as
# NaN, which str() below would turn into the literal text "nan" and score as if
# it were OCR output.
csv_data = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

csv_data['CER'] = csv_data.apply(lambda row: char_error_rate(str(row.iloc[0]), str(row.iloc[1])), axis=1)
csv_data['highlight_diff'] = csv_data.apply(lambda row: highlight_differences(str(row.iloc[0]), str(row.iloc[1])), axis=1)
overall_cer = csv_data['CER'].mean(skipna=True)  # Only averages valid CERs

print("Overall CER:", overall_cer)
print(csv_data[['CER', 'highlight_diff']].head())

# Optionally, save the results
csv_data.to_csv(csv_path.replace('.csv', '_with_cer_all_clean.csv'), index=False)

Overall CER: 0.41075143695842703
        CER                                     highlight_diff
0  0.194444  [+A+][+n+][+s+]2.Largestarteryinhumanbodyis[-‘...
1  0.040541  [+A+][+n+][+s+]3.Theprocessoftranspor[+t+][+a+...
2  0.058824  [+A+][+n+][+s+]1.Leftatriumreceivestheoxygenat...
3  0.016393  [+A+][+n+][+s+]4.Functionsoflymphs:-(1)Ithelps...
4  0.666667                          2[-.-][+-+][+>+]Ao[+r+]ta
