In [14]:
import ast
import glob
import json
import mimetypes
import os
import re

import requests

Small test against my local Qwen2-VL-7B server. The results indicate that the model does not perform as well as our previous workflow.

In [16]:
# Local transcription service; each image is POSTed to this endpoint
url = "http://localhost:8000/transcribe/"

# Glob pattern covering every file in the image folder
# (despite its name, image_dir holds a pattern, not a directory path)
image_dir = "../data/pdfs/RG-50.030.0001_trs_en/images/*"
# Lexicographic order keeps the processing sequence stable between runs
image_paths = sorted(glob.glob(image_dir))

# Folder that collects the transcription results; created if missing
output_dir = "qwen2_output"
os.makedirs(output_dir, exist_ok=True)

def clean_json_string(json_string):
    """Strip a surrounding Markdown code fence from a model reply.

    Only a fence at the very start (``` or ```json, in any letter case) and
    a fence at the very end are removed. Triple backticks that occur inside
    the payload itself (for example within a JSON string value) are kept,
    so the transcribed content is never silently altered.

    Args:
        json_string: Raw text returned by the transcription endpoint.

    Returns:
        The text without the enclosing fence and without leading or
        trailing whitespace.
    """
    text = json_string.strip()
    # Opening fence, optionally tagged "json", plus the whitespace after it
    text = re.sub(r"^```(?:json)?\s*", "", text, count=1, flags=re.IGNORECASE)
    # Closing fence plus the whitespace in front of it
    text = re.sub(r"\s*```$", "", text, count=1)
    return text.strip()

# (connect, read) timeouts in seconds. Without a timeout requests waits
# forever on a stalled server; the read limit is generous because model
# inference on one image can be slow. Tune as needed.
REQUEST_TIMEOUT = (10, 600)

# NOTE(review): the [3:] slice skips the first three images in sorted order,
# presumably to resume an earlier run - confirm before a full re-run.
for image_path in image_paths[3:]:
    print(f"Processing: {image_path}")

    # Send the file's real content type (e.g. image/jpeg for .jpg) instead of
    # always claiming PNG; unknown extensions keep the previous "image/png".
    content_type = mimetypes.guess_type(image_path)[0] or "image/png"

    try:
        # Keep the file open only for the duration of the upload
        with open(image_path, "rb") as image_file:
            files = {"file": (os.path.basename(image_path), image_file, content_type)}
            response = requests.post(url, files=files, timeout=REQUEST_TIMEOUT)
    except (OSError, requests.RequestException) as e:
        # An unreadable path or a network failure/timeout should not abort
        # the whole batch: report it and move on to the next image.
        print(f"Error: request failed for {image_path}")
        print(f"Error details: {e}")
        continue

    # Anything other than HTTP 200 is reported verbatim and skipped
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        continue

    try:
        # The endpoint is expected to answer with {"transcription": "<text>"}
        transcription = response.json()["transcription"]
        if not isinstance(transcription, str):
            raise TypeError(f"expected a string, got {type(transcription).__name__}")
    except (ValueError, KeyError, TypeError) as e:
        # Body is not JSON, lacks the key, or carries a non-text value
        print(f"Error: response for {image_path} has no usable 'transcription' field")
        print(f"Error details: {type(e).__name__}: {e}")
        continue

    print("Transcription successful")

    # Remove the Markdown fence the model may wrap around its JSON
    cleaned_transcription = clean_json_string(transcription)

    # Output files reuse the image's name without its extension
    base_filename = os.path.splitext(os.path.basename(image_path))[0]

    try:
        # Parse first so that only valid JSON is written to a .json file
        json_data = json.loads(cleaned_transcription)

        json_output_path = os.path.join(output_dir, f"{base_filename}.json")
        with open(json_output_path, 'w', encoding='utf-8') as json_file:
            json.dump(json_data, json_file, ensure_ascii=False, indent=2)

        print(f"Saved JSON transcription to: {json_output_path}")

    except json.JSONDecodeError as e:
        print(f"Error: Unable to parse output as JSON for {image_path}")
        print(f"Error details: {str(e)}")

        # Keep the unparseable text so the model output is not lost
        txt_output_path = os.path.join(output_dir, f"{base_filename}.txt")
        with open(txt_output_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(cleaned_transcription)

        print(f"Saved raw transcription as text to: {txt_output_path}")

print("Processing complete.")

Processing: ../data/pdfs/RG-50.030.0001_trs_en/images/0004.jpg
Transcription successful
Saved JSON transcription to: qwen2_output/0004.json
Processing: ../data/pdfs/RG-50.030.0001_trs_en/images/0005.jpg


KeyboardInterrupt: 