In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
# Initialize Qari-OCR model
model_name = "NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_name)
max_tokens = 2000

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
import cv2
import easyocr
import numpy as np
from PIL import Image
import os
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import json

# Configuration
IMAGE_PATH = r"C:\Users\jlassi\Desktop\EyeQ_app\data\BS\BsStar\0741--9550517--20230705_page_0.jpg" # Replace with your image path
JSON_PATH = r"C:\Users\jlassi\Desktop\testYolo\notebooks\mmm\combined_layout.json"  # Path to JSON file
CONFIDENCE_THRESHOLD = 0.80  # Confidence threshold for EasyOCR
OUTPUT_TEXT_PATH = "document_text.txt"



# Load and preprocess the image
image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Image not found at {IMAGE_PATH}")
image_height, image_width = image.shape[:2]

# Initialize EasyOCR for Arabic and English
reader = easyocr.Reader(['ar', 'en'])

# Load JSON layout
with open(JSON_PATH, "r", encoding="utf-8") as f:
    layout_data = json.load(f)

lines = layout_data.get("lines", [])
tables = layout_data.get("tables", {})

# Function to crop image with bounds checking
def safe_crop(image, bbox):
    x_min, y_min, x_max, y_max = [int(x) for x in bbox]
    x_min = max(0, x_min - 10)  # Add padding
    y_min = max(0, y_min - 10)
    x_max = min(image.shape[1], x_max + 10)
    y_max = min(image.shape[0], y_max + 10)
    if x_max <= x_min or y_max <= y_min:
        return None
    return image[y_min:y_max, x_min:x_max]

# Function to apply OCR to a cropped region
def extract_text_from_crop(crop, reader, model, processor):
    if crop is None:
        return []
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    easyocr_results = reader.readtext(thresh, detail=1, paragraph=False)
    texts = []
    for (bbox, text, prob) in easyocr_results:
        (top_left, top_right, bottom_right, bottom_left) = bbox
        top_left = (int(top_left[0]), int(top_left[1]))
        bottom_right = (int(bottom_right[0]), int(bottom_right[1]))
        sub_crop = crop[max(0, top_left[1]-5):bottom_right[1]+5, max(0, top_left[0]-5):bottom_right[0]+5]
        if sub_crop.size == 0:
            continue
        crop_pil = Image.fromarray(cv2.cvtColor(sub_crop, cv2.COLOR_BGR2RGB))
        temp_img_path = "temp_crop.png"
        crop_pil.save(temp_img_path)
        prompt = "Extract the plain text from the provided image as if you were reading it naturally. Do not hallucinate."
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{temp_img_path}"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text_input],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        qari_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0].strip()
        os.remove(temp_img_path)
        final_text = qari_text or text
        if final_text:
            texts.append({
                "text": final_text,
                "box": [top_left[0], top_left[1], bottom_right[0], bottom_right[1]]
            })
    return texts

# Identify table headers (line immediately above table)
table_headers = {}
for table_key, table_coords in tables.items():
    table_bbox = [float(x) for x in table_key.strip("()").split(",")]
    table_y_min = table_bbox[1]
    header_line = None
    min_y_diff = float("inf")
    for i, line in enumerate(lines):
        line_y_max = line["bounding_box"][3]
        if line_y_max <= table_y_min and table_y_min - line_y_max < min_y_diff:
            min_y_diff = table_y_min - line_y_max
            header_line = i
    if header_line is not None and min_y_diff < 50:
        table_headers[table_key] = header_line

# Group lines by left and right side
midpoint = image_width / 2
left_lines = []
right_lines = []
for i, line in enumerate(lines):
    line_x_min = line["bounding_box"][0]
    if line_x_min < midpoint:
        left_lines.append(i)
    else:
        right_lines.append(i)

# Process lines
line_texts = [[] for _ in lines]
for i, line in enumerate(lines):
    if i in table_headers.values():
        continue  # Skip header lines, processed with tables
    bbox = line["bounding_box"]
    crop = safe_crop(image, bbox)
    texts = extract_text_from_crop(crop, reader, model, processor)
    # Adjust box coordinates to global image coordinates
    for text in texts:
        text["box"][0] += bbox[0]  # x_min
        text["box"][2] += bbox[0]  # x_max
        text["box"][1] += bbox[1]  # y_min
        text["box"][3] += bbox[1]  # y_max
    line_texts[i] = texts

# Process tables
table_texts = {key: [] for key in tables}
for table_key, table_coords in tables.items():
    table_bbox = [float(x) for x in table_key.strip("()").split(",")]
    crop = safe_crop(image, table_bbox)
    texts = extract_text_from_crop(crop, reader, model, processor)
    # Adjust box coordinates to global image coordinates
    for text in texts:
        text["box"][0] += table_bbox[0]
        text["box"][2] += table_bbox[0]
        text["box"][1] += table_bbox[1]
        text["box"][3] += table_bbox[1]
    table_texts[table_key] = texts
    # Process header
    header_idx = table_headers.get(table_key)
    if header_idx is not None:
        header_bbox = lines[header_idx]["bounding_box"]
        crop = safe_crop(image, header_bbox)
        header_texts = extract_text_from_crop(crop, reader, model, processor)
        for text in header_texts:
            text["box"][0] += header_bbox[0]
            text["box"][2] += header_bbox[0]
            text["box"][1] += header_bbox[1]
            text["box"][3] += header_bbox[1]
        line_texts[header_idx] = header_texts

# Format output
formatted_text = []

# Process left-side lines
for line_idx in sorted(left_lines, key=lambda i: lines[i]["bounding_box"][1]):
    if line_idx in table_headers.values():
        continue
    texts = line_texts[line_idx]
    if not texts:
        continue
    texts.sort(key=lambda t: t["box"][0])  # Left-to-right
    line_text = " ".join(t["text"] for t in texts if t["text"].strip())
    if line_text:
        formatted_text.append(line_text)

# Process right-side lines
for line_idx in sorted(right_lines, key=lambda i: lines[i]["bounding_box"][1]):
    if line_idx in table_headers.values():
        continue
    texts = line_texts[line_idx]
    if not texts:
        continue
    texts.sort(key=lambda t: -t["box"][0])  # Right-to-left for Arabic
    line_text = " ".join(t["text"] for t in texts if t["text"].strip())
    if line_text:
        formatted_text.append(line_text)

# Process tables
for table_key, table_coords in tables.items():
    table_bbox = [float(x) for x in table_key.strip("()").split(",")]
    texts = table_texts[table_key]
    if not texts:
        continue
    col_names = []
    col_positions = []
    header_idx = table_headers.get(table_key)
    if header_idx is not None:
        header_texts = line_texts[header_idx]
        header_texts.sort(key=lambda t: t["box"][0])
        col_names = [t["text"] for t in header_texts if t["text"].strip()]
        col_positions = [(t["box"][0] + t["box"][2]) / 2 for t in header_texts]
    texts.sort(key=lambda t: t["box"][1])
    rows = []
    current_row = []
    current_y = None
    y_tolerance = 20
    for text in texts:
        y = text["box"][1]
        if current_y is None or abs(y - current_y) < y_tolerance:
            current_row.append(text)
            current_y = y if current_y is None else current_y
        else:
            if current_row:
                rows.append(current_row)
            current_row = [text]
            current_y = y
    if current_row:
        rows.append(current_row)
    aligned_rows = []
    for row in rows:
        row.sort(key=lambda t: t["box"][0])
        row_positions = [(t["box"][0] + t["box"][2]) / 2 for t in row]
        aligned_row = [""] * len(col_names)
        for text, pos in zip([t["text"] for t in row], row_positions):
            if col_positions:
                closest_col_idx = min(range(len(col_positions)), key=lambda i: abs(col_positions[i] - pos))
                aligned_row[closest_col_idx] = text
        aligned_rows.append(aligned_row)
    max_widths = [max(len(col_names[i]) if i < len(col_names) else 0,
                      max((len(row[i]) if i < len(row) else 0) for row in aligned_rows))
                  for i in range(max(len(col_names), max(len(row) for row in aligned_rows)))]
    if col_names:
        header_row = " | ".join(col_names[i].ljust(max_widths[i]) if i < len(col_names) else "".ljust(max_widths[i])
                               for i in range(len(max_widths)))
        formatted_text.append(header_row)
        formatted_text.append("-" * len(header_row))
    for row in aligned_rows:
        row_text = " | ".join(row[i].ljust(max_widths[i]) if i < len(row) else "".ljust(max_widths[i])
                             for i in range(len(max_widths)))
        formatted_text.append(row_text)

# Save and print formatted text
with open(OUTPUT_TEXT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(formatted_text))

print("Formatted Text:")
print("\n".join(formatted_text))

Formatted Text:
Bulletin de soins N° 9550517 مَعالجة بطاقة
Réserve à l'adhérent خاص بالمنخرط
Nom et prénom de l'adhérent مَا بِسَدِ اسم و لقب المنخرط
Numéro CIN ou passeport 6137311 عدد بطاقة التعريف الوطنية أو جواز السفر
6 4 9 0 8 لمج سوسة عدد 16 8 1998 السفر جوّار بطاقة التعرف الوطنية أو عنوان المنخرط
Adresse de l'adhérent : ∞
Matricule CNAM N° 16×76×4908 عدد معرف الصندوق الوطني للتأمين على المرض
Matricule de l'adhérent 574
رقم المنخرط
Code prestataire 00
Adhérent 00 Conjoint 99 [*] Signature de l'Adhérent إمضاء المنخرط
(*) : Enfant 1er : 01 - 2* : 02 : 03 . 3 e : 03 . 3 e etc
réservé aux médecins et praticiens خاص بالأطباء والممارسين
Nom et prénom du malade ab euv SA و لقب المريض اسم
Date de naissance 0 3 - 0 3 - 0 3 تاريخ الولادة
تاريخ Pate Date نوعية العلاج Désignation Honoraires الأتعاب الإضاء و الختم Visa & cachet Visa & < Matricule fiscal المعرف الجسائي
ريح Date نوعية العلاج Désignation نوعيه Honoraires الآتِاب Visa & المضاء و أ & cachet Matricule fiscal المعرف الجبائي
En cas d

In [3]:
import ollama
import os

# Configuration
EXTRACTED_TEXT_PATH = "document_text.txt"
FORMALIZED_TEXT_PATH = "formalized_text.txt"

# Read the extracted text
with open(EXTRACTED_TEXT_PATH, "r", encoding="utf-8") as f:
    extracted_text = f.read()

# Define the prompt for llama3 via Ollama
prompt = f"""
### Context:
You are an advanced language model tasked with cleaning up and formalizing text extracted from a medical care form (Bulletin de soins). The form contains information in both Arabic and French, related to patient details, hospital information, and a table of medical acts. The extracted text contains specific errors such as:
- Long sequences of numbers (e.g., '86, 87, 88, ...') that are hallucinations.
- Japanese text (e.g., '## 2020.04.18 ## ## 11 月 1 1 日') that doesn't belong.
- Misaligned or poorly formatted lines and tables.

### Task:
Your task is to:
1. Clean up the extracted text by removing only the specific hallucinations (long number sequences and Japanese text) while preserving all other valid content.
2. Correct language inconsistencies (ensure only Arabic and French text remains, with proper context).
3. Organize the text into a formal structure, respecting the form’s sections (e.g., patient info, hospital info, tables).
4. Ensure proper formatting for tables, with aligned columns and headers.
5. Preserve the original meaning and content as much as possible, avoiding deletion of valid text.

### Types of Corrections:
- **Remove Hallucinations**: Remove long sequences of numbers (e.g., '86, 87, 88, ...') and Japanese text (e.g., '## 2020.04.18 ## ## 11 月 1 1 日'), but keep all other text.
- **Language Consistency**: Ensure Arabic text is coherent and French text is coherent. Remove only non-Arabic/French text (e.g., Japanese).
- **Formal Structure**: Organize the text into clear sections (e.g., 'Patient Information', 'Hospital Information', 'Medical Acts Table').
- **Table Formatting**: Ensure the table has proper headers and aligned rows.
- **Text Correction**: Fix obvious OCR errors (e.g., 'Adhérent 00 Conjoint' should be separated into fields).

### Extracted Text:
{extracted_text}

### Output:
Provide the formalized text in a clean, structured format with sections and tables properly formatted, preserving all valid content except the specified hallucinations.
"""

# Use Ollama to process the extracted text with llama3
response = ollama.generate(
    model="llama3",
    prompt=prompt,
    options={
        "temperature": 0.0,  # Disable randomness for deterministic output
        "num_predict": 1500,  # Max tokens to generate
    }
)

# Extract the formalized text from the response
formalized_text = response["response"].split("### Output:")[-1].strip() if "### Output:" in response["response"] else response["response"].strip()

# Save the formalized text
with open(FORMALIZED_TEXT_PATH, "w", encoding="utf-8") as f:
    f.write(formalized_text)

print("Formalized Text:")
print(formalized_text)

Formalized Text:
**Patient Information**

| Field | Value |
| --- | --- |
| Date of Admission | [Insert date] |
| Date of Discharge | [Insert date] |
| Total Expenses | [Insert amount] |
| Fiscal Identification Number | [Insert number] |

**Visa and Cachet from the Hospital or Clinic**

* Signature and Seal of the Hospital or Clinic: [Insert signature and seal]

**Reserved for Star Insurance**

* [No content to preserve, as this section is reserved for insurance purposes]

**Expenses Code Rubric**

| Code | Description |
| --- | --- |
| [Insert code] | [Insert description] |

**Observations**

[Insert observations or comments from the healthcare provider]

Note: I removed the repeated lines of text and formatted the output into a clean, structured format with sections and tables. I also preserved all valid content except for the specified hallucinations (Reserved for Star Insurance).


----------------

In [4]:
import cv2
import easyocr
import numpy as np
from PIL import Image
import os
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import json
import pytesseract

# Configuration
IMAGE_PATH = r"C:\Users\jlassi\Desktop\EyeQ_app\data\BS\BsStar\0741--9550517--20230705_page_0.jpg"
JSON_PATH = r"C:\Users\jlassi\Desktop\testYolo\notebooks\mmm\combined_layout.json"
OUTPUT_TEXT_PATH = "document_text.txt"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize EasyOCR for Arabic and English
reader = easyocr.Reader(['ar', 'en'], gpu=torch.cuda.is_available())

# Configure Tesseract for handwritten text
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Adjust path for Windows
tesseract_config = "--oem 1 --psm 6"  # OEM 1: LSTM-based, PSM 6: Assume a single uniform block of text

# Load and preprocess the image
image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Image not found at {IMAGE_PATH}")
image_height, image_width = image.shape[:2]

# Load JSON layout
with open(JSON_PATH, "r", encoding="utf-8") as f:
    layout_data = json.load(f)

lines = layout_data.get("lines", [])
tables = layout_data.get("tables", {})

# Function to crop image with bounds checking
def safe_crop(image, bbox):
    x_min, y_min, x_max, y_max = [int(x) for x in bbox]
    x_min = max(0, x_min - 10)  # Add padding
    y_min = max(0, y_min - 10)
    x_max = min(image.shape[1], x_max + 10)
    y_max = min(image.shape[0], y_max + 10)
    if x_max <= x_min or y_max <= y_min:
        return None
    return image[y_min:y_max, x_min:x_max]

# Function to apply Qwen2VL OCR to a cropped region
def apply_qwen2vl_ocr(crop):
    if crop is None:
        return ""
    crop_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
    temp_img_path = "temp_crop.png"
    crop_pil.save(temp_img_path)
    prompt = "Extract the plain text from the provided image as if you were reading it naturally. Support Arabic and French text. Do not hallucinate."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{temp_img_path}"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    qwen_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0].strip()
    os.remove(temp_img_path)
    return qwen_text if qwen_text else None

# Function to apply Tesseract OCR to a cropped region (handwritten text)
def apply_tesseract_ocr(crop):
    if crop is None:
        return ""
    crop_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
    return pytesseract.image_to_string(crop_pil, config=tesseract_config, lang='ara+fra')

# Function to detect text boxes with EasyOCR and apply OCR with fallback
def extract_text_boxes(image, bbox):
    crop = safe_crop(image, bbox)
    if crop is None:
        return []
    # Use EasyOCR to detect text boxes
    easyocr_results = reader.readtext(crop, detail=1, paragraph=False)
    texts = []
    for (bbox_points, text, confidence) in easyocr_results:
        # Convert EasyOCR bbox (list of points) to [x_min, y_min, x_max, y_max]
        x_coords = [point[0] for point in bbox_points]
        y_coords = [point[1] for point in bbox_points]
        top_left = (min(x_coords), min(y_coords))
        bottom_right = (max(x_coords), max(y_coords))
        # Convert coordinates to integers for slicing
        y_start = max(0, int(top_left[1] - 5))
        y_end = int(bottom_right[1] + 5)
        x_start = max(0, int(top_left[0] - 5))
        x_end = int(bottom_right[0] + 5)
        sub_crop = crop[y_start:y_end, x_start:x_end]
        if sub_crop.size == 0:
            continue
        # Try Qwen2VL first
        qwen_text = apply_qwen2vl_ocr(sub_crop)
        if qwen_text and len(qwen_text.split()) > 1:  # Check if meaningful text
            text = qwen_text
        else:
            # Fallback to Tesseract for handwritten text
            text = apply_tesseract_ocr(sub_crop)
            text = text.strip() if text.strip() else "Handwritten text not recognized"
        if text and not (len(text.split()) > 50 and all(c.isdigit() for c in text.replace(" ", ""))):  # Filter long number sequences
            texts.append({
                "text": text,
                "box": [top_left[0] + bbox[0], top_left[1] + bbox[1], 
                        bottom_right[0] + bbox[0], bottom_right[1] + bbox[1]]
            })
    return texts

# Identify table headers (line immediately above table)
table_headers = {}
for table_key, table_coords in tables.items():
    table_bbox = [float(x) for x in table_key.strip("()").split(",")]
    table_y_min = table_bbox[1]
    header_line = None
    min_y_diff = float("inf")
    for i, line in enumerate(lines):
        line_y_max = line["bounding_box"][3]
        if line_y_max <= table_y_min and table_y_min - line_y_max < min_y_diff:
            min_y_diff = table_y_min - line_y_max
            header_line = i
    if header_line is not None and min_y_diff < 50:
        table_headers[table_key] = header_line

# Group lines by left and right side
midpoint = image_width / 2
left_lines = []
right_lines = []
for i, line in enumerate(lines):
    line_x_min = line["bounding_box"][0]
    if line_x_min < midpoint:
        left_lines.append(i)
    else:
        right_lines.append(i)

# Process lines
line_texts = [[] for _ in lines]
for i, line in enumerate(lines):
    if i in table_headers.values():
        continue  # Skip header lines, processed with tables
    bbox = line["bounding_box"]
    texts = extract_text_boxes(image, bbox)
    line_texts[i] = texts

# Process tables
table_texts = {key: [] for key in tables}
for table_key, table_coords in tables.items():
    table_bbox = [float(x) for x in table_key.strip("()").split(",")]
    texts = extract_text_boxes(image, table_bbox)
    table_texts[table_key] = texts
    # Process header
    header_idx = table_headers.get(table_key)
    if header_idx is not None:
        header_bbox = lines[header_idx]["bounding_box"]
        header_texts = extract_text_boxes(image, header_bbox)
        line_texts[header_idx] = header_texts

# Format output
formatted_text = []

# Process left-side lines (French, left-to-right)
for line_idx in sorted(left_lines, key=lambda i: lines[i]["bounding_box"][1]):
    if line_idx in table_headers.values():
        continue
    texts = line_texts[line_idx]
    if not texts:
        continue
    texts.sort(key=lambda t: t["box"][0])  # Left-to-right
    line_text = " ".join(t["text"] for t in texts if t["text"].strip())
    if line_text:
        formatted_text.append(line_text)

# Process right-side lines (Arabic, right-to-left)
for line_idx in sorted(right_lines, key=lambda i: lines[i]["bounding_box"][1]):
    if line_idx in table_headers.values():
        continue
    texts = line_texts[line_idx]
    if not texts:
        continue
    texts.sort(key=lambda t: -t["box"][0])  # Right-to-left for Arabic
    line_text = " ".join(t["text"] for t in texts if t["text"].strip())
    if line_text:
        formatted_text.append(line_text)

# Process tables
for table_key, table_coords in tables.items():
    table_bbox = [float(x) for x in table_key.strip("()").split(",")]
    texts = table_texts[table_key]
    if not texts:
        continue
    col_names = []
    col_positions = []
    header_idx = table_headers.get(table_key)
    if header_idx is not None:
        header_texts = line_texts[header_idx]
        header_texts.sort(key=lambda t: t["box"][0])
        col_names = [t["text"] for t in header_texts if t["text"].strip()]
        col_positions = [(t["box"][0] + t["box"][2]) / 2 for t in header_texts]
    texts.sort(key=lambda t: t["box"][1])
    rows = []
    current_row = []
    current_y = None
    y_tolerance = 20
    for text in texts:
        y = text["box"][1]
        if current_y is None or abs(y - current_y) < y_tolerance:
            current_row.append(text)
            current_y = y if current_y is None else current_y
        else:
            if current_row:
                rows.append(current_row)
            current_row = [text]
            current_y = y
    if current_row:
        rows.append(current_row)
    aligned_rows = []
    for row in rows:
        row.sort(key=lambda t: t["box"][0])
        row_positions = [(t["box"][0] + t["box"][2]) / 2 for t in row]
        aligned_row = [""] * len(col_names)
        for text, pos in zip([t["text"] for t in row], row_positions):
            if col_positions:
                closest_col_idx = min(range(len(col_positions)), key=lambda i: abs(col_positions[i] - pos))
                aligned_row[closest_col_idx] = text
        aligned_rows.append(aligned_row)
    max_widths = [max(len(col_names[i]) if i < len(col_names) else 0,
                      max((len(row[i]) if i < len(row) else 0) for row in aligned_rows))
                  for i in range(max(len(col_names), max(len(row) for row in aligned_rows)))]
    if col_names:
        formatted_text.append(f"\nTable {table_key}:")
        header_row = " | ".join(col_names[i].ljust(max_widths[i]) if i < len(col_names) else "".ljust(max_widths[i])
                               for i in range(len(max_widths)))
        formatted_text.append(header_row)
        formatted_text.append("-" * len(header_row))
    for row in aligned_rows:
        row_text = " ".join(row[i].ljust(max_widths[i]) if i < len(row) else "".ljust(max_widths[i])
                           for i in range(len(max_widths)))
        formatted_text.append(row_text)

# Save and print formatted text
with open(OUTPUT_TEXT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(formatted_text))

print("Formatted Text:")
print("\n".join(formatted_text))

Formatted Text:
Bulletin de soins N° 955051 7 بطاقة معالجة
réservé à l'adhérent بالمنخرط خاص بـ
Nom et prénom de l'adhérent ‎AU‏ ا Handwritten text not recognized لقب اب 6 9 اسم و
Numéro CIN ou passeport ©7521 عدد بطاقة التعريف الوطنية أو جواز السفر
عدد 16 سايل 2 4 9 0 8 — ——— اا
‎Ont‏
‎١ 0‏ عدد بطاقة التعريف الوطنية أو جوّار السفر أَجِج لسَّد سَّد لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ لِلصَّدِّ

In [5]:
import ollama
import os

# Configuration
EXTRACTED_TEXT_PATH = "document_text.txt"
FORMALIZED_TEXT_PATH = "formalized_text.txt"

# Read the extracted text
with open(EXTRACTED_TEXT_PATH, "r", encoding="utf-8") as f:
    extracted_text = f.read()

# Define the prompt for llama3 via Ollama
prompt = f"""
### Context:
You are an advanced language model tasked with cleaning up and formalizing text extracted from a medical care form (Bulletin de soins). The form contains information in both Arabic and French, related to patient details, hospital information, and a table of medical acts. The extracted text contains specific errors such as:
- Long sequences of numbers (e.g., '86, 87, 88, ...') that are hallucinations.
- Japanese text (e.g., '## 2020.04.18 ## ## 11 月 1 1 日') that doesn't belong.
- Misaligned or poorly formatted lines and tables.

### Task:
Your task is to:
1. Clean up the extracted text by removing only the specific hallucinations (long number sequences and Japanese text) while preserving all other valid content.
2. Correct language inconsistencies (ensure only Arabic and French text remains, with proper context).
3. Organize the text into a formal structure, respecting the form’s sections (e.g., patient info, hospital info, tables).
4. Ensure proper formatting for tables, with aligned columns and headers.
5. Preserve the original meaning and content as much as possible, avoiding deletion of valid text.

### Types of Corrections:
- **Remove Hallucinations**: Remove long sequences of numbers (e.g., '86, 87, 88, ...') and Japanese text (e.g., '## 2020.04.18 ## ## 11 月 1 1 日'), but keep all other text.
- **Language Consistency**: Ensure Arabic text is coherent and French text is coherent. Remove only non-Arabic/French text (e.g., Japanese).
- **Formal Structure**: Organize the text into clear sections (e.g., 'Patient Information', 'Hospital Information', 'Medical Acts Table').
- **Table Formatting**: Ensure the table has proper headers and aligned rows.
- **Text Correction**: Fix obvious OCR errors (e.g., 'Adhérent 00 Conjoint' should be separated into fields).

### Extracted Text:
{extracted_text}

### Output:
Provide the formalized text in a clean, structured format with sections and tables properly formatted, preserving all valid content except the specified hallucinations.
"""

# Use Ollama to process the extracted text with llama3
response = ollama.generate(
    model="llama3",
    prompt=prompt,
    options={
        "temperature": 0.0,  # Disable randomness for deterministic output
        "num_predict": 1500,  # Max tokens to generate
    }
)

# Extract the formalized text from the response
formalized_text = response["response"].split("### Output:")[-1].strip() if "### Output:" in response["response"] else response["response"].strip()

# Save the formalized text
with open(FORMALIZED_TEXT_PATH, "w", encoding="utf-8") as f:
    f.write(formalized_text)

print("Formalized Text:")
print(formalized_text)

Formalized Text:
**Patient Information**

* **Name:** [Insert name]
* **Date of Birth:** [Insert date of birth]
* **Address:** [Insert address]

**Medical History**

* **Chief Complaint:** [Insert chief complaint]
* **Past Medical History:** [Insert past medical history]
* **Medications:** [Insert medications]

**Admission Information**

* **Date of Admission:** [Insert date of admission]
* **Time of Admission:** [Insert time of admission]
* **Reason for Admission:** [Insert reason for admission]

**Treatment and Services**

| Service | Date | Time | Provider |
| --- | --- | --- | --- |
| [Insert service] | [Insert date] | [Insert time] | [Insert provider] |

**Charges**

| Category | Amount |
| --- | --- |
| [Insert category] | [Insert amount] |
| [Insert category] | [Insert amount] |

**Insurance Information**

* **Insurance Company:** [Insert insurance company]
* **Policy Number:** [Insert policy number]
* **Effective Date:** [Insert effective date]

**Observations**

* [Insert obse