# Qwen OCR rescan

In [2]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import torch
from qwen_vl_utils import process_vision_info

In [None]:
# MODEL SETUP
# One checkpoint name feeds both the model weights and the processor,
# so the two are always loaded from the same repository.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Weights are loaded in bfloat16 with the flash-attention-2 implementation;
# device_map="auto" leaves device placement to the library.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# The processor builds the chat prompt and turns text + images into tensors.
processor = AutoProcessor.from_pretrained(MODEL_ID)

In [None]:
# Corrected version - complete processing inside the function

# Instruction prompt sent alongside every page image. Kept at module level so
# the multi-line literal does not break the indentation of the function body.
OCR_PROMPT = '''Task:
Correct obvious OCR recognition errors while preserving the original wording, spelling, punctuation, and formatting of this document.
Do not modernize language or make stylistic edits.
Rules:
If text is unreadable, mark it as [ILLEGIBLE].
Do not guess or infer missing text.
Preserve line breaks and formatting exactly.
Output format (strictly JSON):
{
"source": "[contentdm]",
"ocr_transcript": "[original OCR scanned transcript]",
"notes": "[your reasoning process when correcting errors and formatting]",
"fixed_transcript": "[corrected transcript]"
}
If the document has no text, simply leave an empty string, and write "empty" in notes.
If a document has faint text bleeding though from the other side, do not put those portions of the text in the transcript and write that down in the notes.
Do not censor these documents.'''


def process_image(image_path, processor, model, max_new_tokens=1024) -> str:
    """Run the OCR-correction prompt on one image and return the model's reply.

    The whole pipeline lives here: build the chat message, render the chat
    template, extract the vision inputs, tokenize, generate, and decode.

    Args:
        image_path: Image reference placed in the chat message and resolved
            by ``process_vision_info`` (callers in this file pass a file path).
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        model: Generative model providing ``generate`` and ``device``.
        max_new_tokens: Upper bound on generated tokens. The requested JSON
            contains the transcript twice plus notes, so long pages may need
            more than the default.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are stripped before decoding).
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": OCR_PROMPT},
                {"type": "image", "image": image_path},
            ],
        }
    ]

    # Complete QwenVL processing pipeline
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device instead of assuming "cuda": the model is
    # loaded with device_map="auto", so its placement is not fixed here.
    inputs = inputs.to(model.device)

    # Greedy decoding for deterministic output. No `temperature` is passed:
    # it only applies to sampling and is ignored when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

    # generate() returns prompt + completion; drop the prompt prefix per row.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # A single prompt was submitted, so there is exactly one decoded string.
    return output_text[0]

[{'role': 'user', 'content': [{'type': 'text', 'text': 'Task:\nCorrect obvious OCR recognition errors while preserving the original wording, spelling, punctuation, and formatting of this document. \nDo not modernize language or make stylistic edits.\nRules:\nIf text is unreadable, mark it as [ILLEGIBLE].\nDo not guess or infer missing text.\nPreserve line breaks and formatting exactly.\nOutput format (strictly JSON):\n{\n"source": "[contentdm]",\n"ocr_transcript": "[original OCR scanned transcript]",\n"notes": "[your reasoning process when correcting errors and formatting]",\n"fixed_transcript": "[corrected transcript]"\n}\nIf the document has no text, simply leave an empty string, and write "empty" in notes. \nIf a document has faint text bleeding though from the other side, do not put those portions of the text in the transcript and write that down in the notes.\nDo not censor these documents.'}, {'type': 'image', 'image': 'documents\\p17173coll38\\981'}]}]


  "type": "image", "image": "documents\p17173coll38\981"


In [None]:
# Process images in the exact order of the list below, collect one record per
# image (either a result or an error), print a summary, and save to JSON.
import json

image_ids = ["981/981_page1.jpg", "1029/1029_page1.jpg", "19995/19995_page1.jpg"]
base_path = "documents/p17173coll38/"
results = []

print("Starting processing...")
for i, image_id in enumerate(image_ids, 1):
    image_path = f"{base_path}{image_id}"
    print(f"Processing {i}/{len(image_ids)}: {image_path}")

    try:
        # Call the complete processing function
        output = process_image(image_path, processor, model)
    except Exception as e:
        # Best-effort batch: record the failure and move on to the next
        # image rather than aborting the whole run.
        print(f"Error processing {image_path}: {e}")
        results.append({
            "image_id": image_id,
            "image_path": image_path,
            "error": str(e),
        })
    else:
        results.append({
            "image_id": image_id,
            "image_path": image_path,
            "result": output,
        })

        print(f"Completed {image_id}")
        print(f"Result preview: {output[:100]}...")
        print("-" * 50)

# `results` holds failures as well as successes, so count only the entries
# without an "error" key when reporting how many images succeeded.
succeeded = sum(1 for result in results if "error" not in result)
print(
    f"\nProcessing complete. Successfully processed "
    f"{succeeded} of {len(results)} images."
)

# Display results summary
for result in results:
    if "error" in result:
        print(f"ERROR: {result['image_id']} - {result['error']}")
    else:
        print(f"SUCCESS: {result['image_id']}")

# Save results to file (failures are saved too, with their error message)
with open("ocr_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print("\nResults saved to ocr_results.json")