In [None]:
import json
import base64
import openai
import os
from PIL import Image
import io

# --- Configuration ---
# Credentials are read from environment variables so that no secret has to be
# stored in the notebook. Set OPENAI_API_KEY before running this cell;
# OPENAI_ORG_ID and OPENAI_PROJECT_ID are optional.
api_key = os.environ.get("OPENAI_API_KEY", "")
organization_id = os.environ.get("OPENAI_ORG_ID", "")  # optional
project_id = os.environ.get("OPENAI_PROJECT_ID", "")  # optional

# Paths to your dataset files
test_jsonl = 'fintabnet/FinTabNet_1.0.0_table_test.jsonl'
# IMPORTANT: image_folder must point to the directory that contains the page
# *images* (one PNG per JSONL `filename`, same relative path with the extension
# swapped for .png). FinTabNet ships PDF pages, so the images have to be
# rendered from those PDFs beforehand.
# NOTE: the default below is 'fintabnet/pdf'; change it if your rendered images
# live elsewhere (e.g. 'fintabnet/images').
image_folder = 'fintabnet/pdf'

# --- OpenAI Client Initialization ---
# Only the `openai` module is imported above, so the client class has to be
# referenced through the module. Empty settings are passed as None so they are
# treated as "not provided" rather than being sent as empty credentials.
client = openai.OpenAI(
    api_key=api_key or None,
    organization=organization_id or None,
    project=project_id or None,
)

# --- Data Loading and Parsing Function ---
def parse_fintabnet_jsonl(jsonl_path, image_base_folder):
    """
    Parse a FinTabNet JSONL annotation file into per-image records.

    Every non-blank line is decoded as one JSON sample. The image path for a
    sample is built by replacing the extension of its ``filename`` with
    ``.png`` and joining it onto ``image_base_folder``; samples whose image
    does not exist on disk are skipped with a warning.

    Args:
        jsonl_path: Path to the JSONL annotation file.
        image_base_folder: Directory containing the page images.

    Returns:
        A dict keyed by the sample ``filename``. Each value is a dict with:
          - ``filepath``: path of the image file on disk
          - ``annotations``: one entry per cell that carries a ``bbox``
            (``category_id`` 1, the ``bbox`` and the cell ``tokens``),
            accumulated over all samples sharing the filename
          - ``html``: the structure tokens joined into one string
          - ``ground_truth``: structure tokens, raw cells and table bbox
        An empty dict is returned when the file is missing or a line is not
        valid JSON.

    NOTE: when several samples share one ``filename``, ``annotations`` is the
    union of their cells, while ``html`` and ``ground_truth`` reflect the last
    such sample only.
    """
    images_data = {}
    try:
        with open(jsonl_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                # A blank line (typically a trailing newline at end of file)
                # is not a sample; without this guard it would raise
                # JSONDecodeError and throw away everything parsed so far.
                if not line.strip():
                    continue

                sample = json.loads(line)
                filename = sample['filename']

                # Adapt this if your extracted images use another naming
                # scheme or format.
                image_filename = os.path.splitext(filename)[0] + '.png'
                image_filepath = os.path.join(image_base_folder, image_filename)

                if not os.path.exists(image_filepath):
                    print(f"Warning: Image file not found for {filename} at {image_filepath}. Skipping.")
                    continue

                entry = images_data.setdefault(filename, {
                    "annotations": [],
                    "html": "",
                    'filepath': image_filepath,
                })

                html_block = sample["html"]
                structure_tokens = html_block["structure"]["tokens"]
                cells = html_block["cells"]

                # Only cells that actually carry a bounding box become annotations.
                entry["annotations"].extend(
                    {
                        "category_id": 1,  # cell
                        "bbox": cell_data["bbox"],
                        "tokens": cell_data.get("tokens", []),
                    }
                    for cell_data in cells
                    if "bbox" in cell_data
                )

                # Store the joined structure markup instead of computing and
                # discarding it (cell content is not merged in).
                entry["html"] = "".join(structure_tokens)

                # Raw material for later evaluation.
                entry["ground_truth"] = {
                    "html_structure_tokens": structure_tokens,
                    "cells": cells,  # raw cell data including tokens and bboxes
                    "table_bbox": sample.get("bbox"),  # whole-table bbox, if present
                }

        return images_data
    except FileNotFoundError:
        print(f"Error: JSONL file not found at {jsonl_path}")
        return {}
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {jsonl_path}")
        return {}


# --- Evaluation Pipeline ---
def _image_to_data_url(image_path):
    """Read the image at ``image_path`` and return it as a base64 ``data:`` URL."""
    import mimetypes  # local import: keeps this block self-contained

    # Label the payload with the MIME type implied by the file extension
    # (the parser above produces .png paths); fall back to the previously
    # hard-coded JPEG label when the extension is not a known image type.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


def evaluate_table_extraction(images_data):
    """
    Send every image to GPT-4o and collect the CSV it extracts.

    Args:
        images_data: Dict keyed by filename; each value must provide
            ``'filepath'`` (image path on disk) and ``'ground_truth'``
            (stored alongside the model output for later scoring).

    Returns:
        Dict keyed by filename. Each value has a ``"status"`` of
        ``"Success"``, ``"Image not found"``, ``"Image encoding failed"``,
        ``"API Error"`` or ``"Error"``. Successful entries also carry
        ``"gpt4o_output_csv"`` and ``"ground_truth"``; API and generic errors
        carry an ``"error"`` message.

    Uses the module-level ``client`` for the API calls.
    """
    results = {}

    for filename, data in images_data.items():
        image_path = data['filepath']
        ground_truth = data['ground_truth']

        print(f"\nProcessing {filename}...")

        # 1. Load and encode image
        try:
            image_data_url = _image_to_data_url(image_path)
        except FileNotFoundError:
            print(f"Error: Image file not found at {image_path}. Skipping {filename}.")
            results[filename] = {"status": "Image not found"}
            continue
        except Exception as e:
            print(f"Error encoding image {image_path}: {e}. Skipping {filename}.")
            results[filename] = {"status": "Image encoding failed"}
            continue

        # 2. Send to ChatGPT-4o for extraction
        try:
            print(f"Sending {filename} to GPT-4o...")
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": (
                                    "Extract the table from this image and output only plain CSV text — "
                                    "no Markdown formatting, no triple backticks, no language tags, no explanations, no extra text.\n"
                                    "Convert checkmarks (✓) and crosses (✗) into '1' and '0'.\n"
                                    "Flatten multi-level headers into a single header row by combining levels, using a space or underscore if needed.\n"
                                    "If there are grouped sections or repeated row headers, repeat the values to ensure each row is complete.\n"
                                    "The result must be pure CSV text, machine-readable, and directly usable for SQL ingestion.\n"
                                    "Return only the CSV content itself, nothing else."
                                )
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": image_data_url}
                            }
                        ]
                    }
                ],
                max_tokens=4000  # room for potentially large tables
            )

            content = response.choices[0].message.content
            if content is None:
                # Report an empty reply explicitly instead of letting
                # `.strip()` fail with an opaque AttributeError.
                message = "GPT-4o returned no text content"
                print(f"An unexpected error occurred processing {filename}: {message}")
                results[filename] = {"status": "Error", "error": message}
                continue

            gpt4o_output_text = content.strip()
            print("Received response from GPT-4o.")

            results[filename] = {
                "status": "Success",
                "gpt4o_output_csv": gpt4o_output_text,
                "ground_truth": ground_truth  # kept for later evaluation
            }

            # 3. Evaluation (placeholder)
            # Compare `gpt4o_output_csv` with `ground_truth` here, e.g. via
            # cell-level F1, structure accuracy or cell-content accuracy.
            # That requires parsing both the predicted CSV and the FinTabNet
            # structure/cells, which is not implemented yet.
            print("Evaluation step placeholder. Implement your evaluation logic here.")
            print("--- Extracted CSV ---")
            print(gpt4o_output_text)
            print("---------------------")

        except openai.APIError as e:
            print(f"OpenAI API error processing {filename}: {e}")
            results[filename] = {"status": "API Error", "error": str(e)}
        except Exception as e:
            print(f"An unexpected error occurred processing {filename}: {e}")
            results[filename] = {"status": "Error", "error": str(e)}

    return results

# --- Main Execution ---
if __name__ == "__main__":
    print(f"Loading test dataset from {test_jsonl}...")
    # The full test split is loaded here. API calls cost time and money, so
    # for a quick trial run consider keeping only a handful of entries, e.g.
    # by slicing `list(images_to_evaluate.items())` and rebuilding the dict.
    images_to_evaluate = parse_fintabnet_jsonl(test_jsonl, image_folder)

    if images_to_evaluate:
        print(f"Loaded data for {len(images_to_evaluate)} images.")
        evaluation_results = evaluate_table_extraction(images_to_evaluate)

        # Everything that did not end with status "Success" counts as failed.
        success_count = [
            outcome.get("status") for outcome in evaluation_results.values()
        ].count("Success")
        error_count = len(evaluation_results) - success_count

        print("\n--- Evaluation Summary ---")
        print(f"Total images processed: {len(images_to_evaluate)}")
        print(f"Successful extractions: {success_count}")
        print(f"Failed extractions: {error_count}")

        # `evaluation_results` keeps `gpt4o_output_csv` and `ground_truth`
        # per image, so metrics can be computed from it afterwards.
    else:
        print("No image data loaded. Please check paths and file existence.")

In [None]:
import os
import json
import base64
from difflib import SequenceMatcher
from openai import OpenAI
from tqdm import tqdm

# Setup OpenAI client
# The API key is taken from the OPENAI_API_KEY environment variable. An empty
# string would be accepted by the constructor and only fail later, at request
# time, with an authentication error.
# The organization can be overridden through OPENAI_ORG_ID; the previously
# hard-coded ID stays as the default so existing runs keep working.
# TODO: consider moving that default out of the source as well.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    organization=os.environ.get("OPENAI_ORG_ID", "org-SSjyvjIWFJ99m2is8K33pv0V"),
)

# Dataset locations (relative to the working directory)
images_dir = "TableBank/Detection/images"
annotations_dir = "TableBank/Detection/annotations"

# Annotation files to walk through, in processing order
annotation_files = [
    "tablebank_latex_train.json", "tablebank_latex_val.json",
    "tablebank_word_test.json", "tablebank_word_train.json", "tablebank_word_val.json",
    "tablebank_latex_test.json",
]

# One dict per processed image is appended here by the evaluation loop
results = list()

# GPT-4o prompt template
def build_prompt():
    """Return the fixed instruction text sent along with every table image."""
    # One tuple entry per line of the prompt; the lines are joined with "\n".
    instruction_lines = (
        "Extract the table from this image and output only plain CSV text — "
        "no Markdown formatting, no triple backticks, no language tags, no explanations, no extra text.",
        "Convert checkmarks (✓) and crosses (✗) into '1' and '0'.",
        "Flatten multi-level headers into a single header row.",
        "Repeat headers for each row if needed.",
        "The result must be pure CSV text, machine-readable, and directly usable for SQL ingestion.",
        "Return only the CSV content itself, nothing else.",
    )
    return "\n".join(instruction_lines)

# Utility: sequence similarity ratio (difflib.SequenceMatcher, not Levenshtein)
def similarity(a, b):
    """Return the ``SequenceMatcher`` ratio of ``a`` and ``b``, a float in [0, 1]."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

# Evaluation loop
# For each annotation file, send the first 25 listed images to GPT-4o and
# append one record per successful extraction to `results`. Missing images and
# per-image failures are reported and skipped so one bad sample cannot abort
# the run.
for ann_file in annotation_files:
    ann_path = os.path.join(annotations_dir, ann_file)

    with open(ann_path, "r", encoding="utf-8") as f:
        annotations = json.load(f)

    # Only the image list is used here; annotations["annotations"] would hold
    # the bounding-box info if it is needed later.
    images = annotations["images"]

    print(f"🔍 Processing {ann_file} with {len(images)} image entries")

    for sample in tqdm(images[:25]):  # capped at 25 images per annotation file
        filename = sample["file_name"]
        # Placeholder: no ground-truth HTML is aligned with these samples yet.
        gt_html = ""

        image_path = os.path.join(images_dir, filename)
        if not os.path.exists(image_path):
            print(f"⚠️ Skipping missing image: {filename}")
            continue

        try:
            # Reading the file sits inside the try block so that an unreadable
            # image is reported like any other per-image failure.
            with open(image_path, "rb") as img_file:
                b64_image = base64.b64encode(img_file.read()).decode("utf-8")

            # GPT-4o Vision call
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": build_prompt()},
                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"}}
                        ]
                    }
                ],
                max_tokens=2000
            )
            content = response.choices[0].message.content
            if content is None:
                # Explicit message instead of an AttributeError from `.strip()`.
                print(f"❌ Error processing {filename}: model returned no text content")
                continue
            gpt_output = content.strip()
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")
            continue

        # No score without ground truth; once `gt_html` is populated the
        # similarity helper defined earlier in this file is applied automatically.
        score = similarity(gpt_output, gt_html) if gt_html else None

        results.append({
            "filename": filename,
            "score": score,
            "gpt_output": gpt_output,
            "ground_truth": gt_html
        })


🔍 Processing tablebank_latex_train.json with 187199 image entries


100%|██████████| 25/25 [05:03<00:00, 12.15s/it]


🔍 Processing tablebank_latex_val.json with 7265 image entries


 20%|██        | 5/25 [00:49<03:19,  9.99s/it]


KeyboardInterrupt: 