In [1]:
# STEP 1: Preprocess the PDF by running the Step 1 script with the
# interpreter of the project's virtual environment.

import os
import subprocess

# Paths
pdf_file = r"C:\ocr-pipeline\step1_preprocess\cv.pdf"
output_folder = r"C:\ocr-pipeline\step1_preprocess\step1_output"
script_path = r"C:\ocr-pipeline\step1_preprocess\step1_preprocess_pdf.py"

# Python interpreter of the "ocrproject" virtual environment
venv_python = r"C:\ocr-pipeline\ocrproject\Scripts\python.exe"

# The script prints non-ASCII characters (e.g. "→"). With text=True alone the
# captured bytes are decoded with the locale code page, which rendered the
# arrow as "â†’". Ask the child to emit UTF-8 and decode it as UTF-8 here.
child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}

# Run the script and capture output/errors
result = subprocess.run(
    [venv_python, script_path, pdf_file, output_folder],
    capture_output=True,   # Capture stdout and stderr
    text=True,             # Return output as string (not bytes)
    encoding="utf-8",      # Decode the child's output as UTF-8, not the locale code page
    errors="replace",      # Never fail on a stray undecodable byte from native logs
    env=child_env,
)

# Show all normal output from the script
print("=== STDOUT ===")
print(result.stdout)

# Show all error messages
print("=== STDERR ===")
print(result.stderr)

# Report whether the script succeeded (exit code 0) or failed
if result.returncode == 0:
    print("[DEBUG] Script ran successfully")
else:
    print(f"[ERROR] Script failed with return code {result.returncode}")


=== STDOUT ===
[STEP 1] Processed 1 pages → C:\ocr-pipeline\step1_preprocess\step1_output

=== STDERR ===

[DEBUG] Script ran successfully


In [6]:
# =====================================================
# STEP 2 CALLER — Run OCR script using a specific venv
# =====================================================

import os
import subprocess

# -------------------------------
# Paths
# -------------------------------
input_folder = r"C:\ocr-pipeline\step1_preprocess\step1_output"     # Folder with processed_*.png images
output_folder = r"C:\ocr-pipeline\step2_ocr\step2_output"               # Folder to save OCR output
script_path = r"C:\ocr-pipeline\step2_ocr\step2_ocr_paddle.py"      # Path to the OCR script
venv_python = r"C:\ocr-pipeline\ocrproject\Scripts\python.exe" # Python from your venv

# -------------------------------
# Run the OCR script
# -------------------------------
# The script prints non-ASCII characters (e.g. "→"). With text=True alone the
# captured bytes are decoded with the locale code page, which rendered the
# arrow as "â†’". Ask the child to emit UTF-8 and decode it as UTF-8 here.
child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}

result = subprocess.run(
    [venv_python, script_path, input_folder, output_folder],
    capture_output=True,   # Capture stdout and stderr
    text=True,             # Return output as string
    encoding="utf-8",      # Decode the child's output as UTF-8, not the locale code page
    errors="replace",      # Native-library log lines on stderr may not be valid UTF-8
    env=child_env,
)

# -------------------------------
# Show outputs
# -------------------------------
print("=== STDOUT ===")
print(result.stdout)

print("=== STDERR ===")
print(result.stderr)

# -------------------------------
# Check if the script succeeded (exit code 0)
# -------------------------------
if result.returncode == 0:
    print("[DEBUG] OCR script ran successfully")
else:
    print(f"[ERROR] OCR script failed with return code {result.returncode}")


=== STDOUT ===
OCR → processed_1.png
[STEP 2] OCR output saved → C:\ocr-pipeline\step2_ocr\step2_output\cv_ocr_raw.json

=== STDERR ===
[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m
INFO: Could not find files for the given pattern(s).
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
I0102 05:56:35.573367 22464 onednn_context.cc:81] oneDNN v3.6.2
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\u

In [3]:
# =====================================================
# STEP 3 CALLER — Run light OCR cleaning using a specific venv
# =====================================================

import os
import subprocess

# -------------------------------
# Paths
# -------------------------------
input_json = r"C:\ocr-pipeline\step2_ocr\step2_output\cv_ocr_raw.json"  # Raw OCR JSON from Step 2
output_json = r"C:\ocr-pipeline\step3_lightclean\step3_output\cv_ocr_cleaned.json"  # Cleaned OCR output
script_path = r"C:\ocr-pipeline\step3_lightclean\step3_light_clean.py"  # Path to Step 3 script
venv_python = r"C:\ocr-pipeline\ocrproject\Scripts\python.exe"  # Python from your venv

# -------------------------------
# Run the Step 3 cleaning script
# -------------------------------
# The script prints non-ASCII characters (e.g. "→"). With text=True alone the
# captured bytes are decoded with the locale code page, which rendered the
# arrow as "â†’". Ask the child to emit UTF-8 and decode it as UTF-8 here.
child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}

result = subprocess.run(
    [venv_python, script_path, input_json, output_json],
    capture_output=True,   # Capture stdout and stderr
    text=True,             # Return output as string
    encoding="utf-8",      # Decode the child's output as UTF-8, not the locale code page
    errors="replace",      # Never fail on a stray undecodable byte
    env=child_env,
)

# -------------------------------
# Show outputs
# -------------------------------
print("=== STDOUT ===")
print(result.stdout)

print("=== STDERR ===")
print(result.stderr)

# -------------------------------
# Check if the script succeeded (exit code 0)
# -------------------------------
if result.returncode == 0:
    print("[DEBUG] Step 3 OCR cleaning ran successfully")
else:
    print(f"[ERROR] Step 3 OCR cleaning failed with return code {result.returncode}")


=== STDOUT ===
[STEP 3] Cleaned OCR saved → C:\ocr-pipeline\step3_lightclean\step3_output\cv_ocr_cleaned.json

=== STDERR ===

[DEBUG] Step 3 OCR cleaning ran successfully


In [None]:
# =====================================================
# Run all steps for a single CV
# =====================================================
# Steps 1-3 are imported from their own folders and called in-process.
# Every intermediate path is defined exactly once below, so the output of one
# step and the input of the next cannot drift apart.

import sys
from pathlib import Path

# -------------------------------
# Paths
# -------------------------------
step1_folder = r"C:\ocr-pipeline\step1_preprocess"
step2_folder = r"C:\ocr-pipeline\step2_ocr"
step3_folder = r"C:\ocr-pipeline\step3_lightclean"
poppler_path = r"C:\poppler\Library\bin"

cv_pdf = Path(step1_folder) / "cv.pdf"
step1_output = Path(step1_folder) / "step1_output"      # Step 1 output == Step 2 input
step2_output = Path(step2_folder) / "step2_output"
raw_ocr_json = step2_output / "cv_ocr_raw.json"         # Step 2 output == Step 3 input
cleaned_ocr_json = Path(step3_folder) / "step3_output" / "cv_ocr_cleaned.json"

# -------------------------------
# Step 1: Preprocess PDF → PNG images
# -------------------------------
# Guard the append so re-running the cell does not grow sys.path.
if step1_folder not in sys.path:
    sys.path.append(step1_folder)

from step1_preprocess_pdf import main as preprocess_pdf

preprocess_pdf(
    str(cv_pdf),
    str(step1_output),
    poppler_path=poppler_path
)

# -------------------------------
# Step 2: OCR on processed images
# -------------------------------
if step2_folder not in sys.path:
    sys.path.append(step2_folder)

from step2_ocr_paddle import run_ocr  # import the function

run_ocr(
    str(step1_output),
    str(step2_output)
)

# -------------------------------
# Step 3: Light cleaning OCR results
# -------------------------------
if step3_folder not in sys.path:
    sys.path.append(step3_folder)
from step3_light_clean import clean_ocr_json

# Kept as the cell's last expression so the notebook still displays whatever
# clean_ocr_json returns (presumably the cleaned OCR records — confirm in Step 3).
clean_ocr_json(
    str(raw_ocr_json),
    str(cleaned_ocr_json)
)

[STEP 1] Processed 1 pages → C:\ocr-pipeline\step1_preprocess\step1_output


  from .autonotebook import tqdm as notebook_tqdm
[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the direc

OCR → processed_1.png
[STEP 2] OCR output saved → C:\ocr-pipeline\step2_ocr\step2_output\cv_ocr_raw.json
[STEP 3] Cleaned OCR saved → C:\ocr-pipeline\step3_lightclean\step3_output\cv_ocr_cleaned.json


[{'image': 'processed_1.png',
  'lines': [{'text': 'EXPERIENCES / PROJETS',
    'confidence': 0.9983910918235779,
    'bbox': None},
   {'text': 'APPRENTISSAGE EN DATA SCIENCE - AUTOFORMATION /',
    'confidence': 0.9963175058364868,
    'bbox': None},
   {'text': 'PROJETS PERSONNELS',
    'confidence': 0.9998707175254822,
    'bbox': None},
   {'text': '2024 - Present', 'confidence': 0.9953369498252869, 'bbox': None},
   {'text': 'Formation en Data Science et Machine Learning (Python, Pandas)',
    'confidence': 0.9953547716140747,
    'bbox': None},
   {'text': "FORMATEUR / RESPONSABLE PEDAGOGIQUE - SUP'CENTER,",
    'confidence': 0.9918887615203857,
    'bbox': None},
   {'text': 'MARRAKECH', 'confidence': 0.9997824430465698, 'bbox': None},
   {'text': '2019 - 2022', 'confidence': 0.9818785190582275, 'bbox': None},
   {'text': 'NAJI', 'confidence': 0.9994564056396484, 'bbox': None},
   {'text': 'Soutien en eco internationale/developpement et informatique de gestion',
    'confidence

In [1]:
# =====================================================
# BATCH CALLER — Run Steps 1-3 for every CV in a folder
# =====================================================
# For each supported file in CV_INPUT_DIR: preprocess (Step 1), OCR (Step 2)
# and light-clean (Step 3), writing into one sub-folder per CV under each
# BASE_STEP*_OUT directory. A failure in one CV is reported and the batch
# continues with the next file; a summary is printed at the end.

import sys
from collections import Counter
from pathlib import Path

# =====================================================
# CONFIG
# =====================================================
CV_INPUT_DIR = Path(r"C:\ocr-pipeline\caller_batch\cvs_input")

BASE_STEP1_OUT = Path(r"C:\ocr-pipeline\caller_batch\step1_batch_output")
BASE_STEP2_OUT = Path(r"C:\ocr-pipeline\caller_batch\step2_batch_output")
BASE_STEP3_OUT = Path(r"C:\ocr-pipeline\caller_batch\step3_batch_output")

POPPLER_PATH = r"C:\poppler\Library\bin"

# Accept PDFs + PNG + JPG + JPEG
VALID_EXTENSIONS = [".pdf", ".png", ".jpg", ".jpeg"]

# =====================================================
# STEP 1 IMPORT
# =====================================================
# The appends are guarded so re-running the cell does not grow sys.path.
step1_folder = r"C:\ocr-pipeline\step1_preprocess"
if step1_folder not in sys.path:
    sys.path.append(step1_folder)
from step1_preprocess_pdf import main as preprocess_file  # updated: handles PDF + images

# =====================================================
# STEP 2 IMPORT
# =====================================================
step2_folder = r"C:\ocr-pipeline\step2_ocr"
if step2_folder not in sys.path:
    sys.path.append(step2_folder)
from step2_ocr_paddle import run_ocr

# =====================================================
# STEP 3 IMPORT
# =====================================================
step3_folder = r"C:\ocr-pipeline\step3_lightclean"
if step3_folder not in sys.path:
    sys.path.append(step3_folder)
from step3_light_clean import clean_ocr_json

# =====================================================
# BATCH PIPELINE
# =====================================================
# Only regular files with a supported extension, in a deterministic order.
cv_files = sorted(
    candidate
    for candidate in CV_INPUT_DIR.glob("*")
    if candidate.is_file() and candidate.suffix.lower() in VALID_EXTENSIONS
)
if not cv_files:
    print(f"[WARN] No supported CV files found in {CV_INPUT_DIR}")

# Output folders are named after the file stem. Two inputs sharing a stem
# (e.g. cv.pdf and cv.png) would otherwise write into the same folders and
# overwrite each other, so those get the extension appended to their name.
# Stems are compared case-insensitively because the target paths are Windows paths.
stem_counts = Counter(path.stem.lower() for path in cv_files)

failures = []  # (cv_name, reason) for the end-of-batch summary

for file_path in cv_files:
    if stem_counts[file_path.stem.lower()] > 1:
        cv_name = f"{file_path.stem}_{file_path.suffix.lstrip('.').lower()}"
    else:
        cv_name = file_path.stem
    print(f"\n=== Processing CV: {cv_name} ===")

    # Per-CV folders
    step1_out = BASE_STEP1_OUT / cv_name
    step2_out = BASE_STEP2_OUT / cv_name
    step3_out = BASE_STEP3_OUT / cv_name
    step3_out.mkdir(parents=True, exist_ok=True)

    # -------------------------------
    # Step 1: PDF / Image → PNG (preprocessing)
    # -------------------------------
    try:
        preprocess_file(
            str(file_path),
            str(step1_out),
            poppler_path=POPPLER_PATH
        )
    except Exception as e:
        print(f"[ERROR] Step 1 failed for {cv_name}: {e}")
        failures.append((cv_name, f"Step 1: {e}"))
        continue

    # -------------------------------
    # Step 2: OCR
    # -------------------------------
    try:
        run_ocr(
            str(step1_out),
            str(step2_out)
        )
    except Exception as e:
        print(f"[ERROR] Step 2 failed for {cv_name}: {e}")
        failures.append((cv_name, f"Step 2: {e}"))
        continue

    # -------------------------------
    # Step 3: Light Clean
    # -------------------------------
    raw_json = step2_out / "cv_ocr_raw.json"
    clean_json = step3_out / "cv_ocr_cleaned.json"

    if not raw_json.exists():
        print(f"[SKIP] No OCR output for {cv_name}")
        failures.append((cv_name, "Step 2 produced no cv_ocr_raw.json"))
        continue

    try:
        clean_ocr_json(
            str(raw_json),
            str(clean_json)
        )
    except Exception as e:
        print(f"[ERROR] Step 3 failed for {cv_name}: {e}")
        failures.append((cv_name, f"Step 3: {e}"))
        continue

    print(f"[DONE] {cv_name}")

# -------------------------------
# Summary — per-CV errors are easy to miss in the long model-loading log
# -------------------------------
print(f"\n=== Batch finished: {len(cv_files) - len(failures)}/{len(cv_files)} CVs completed ===")
for failed_name, reason in failures:
    print(f"[FAILED] {failed_name} — {reason}")


  from .autonotebook import tqdm as notebook_tqdm
[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m



=== Processing CV: 11943065 ===
[STEP 1] Processed 2 pages → C:\ocr-pipeline\caller_batch\step1_batch_output\11943065


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\latin_PP-OCRv5_mobile_rec`.[0m


OCR → processed_1.png
OCR → processed_2.png
[STEP 2] OCR output saved → C:\ocr-pipeline\caller_batch\step2_batch_output\11943065\cv_ocr_raw.json
[STEP 3] Cleaned OCR saved → C:\ocr-pipeline\caller_batch\step3_batch_output\11943065\cv_ocr_cleaned.json
[DONE] 11943065

=== Processing CV: 12071138 ===


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m


[STEP 1] Processed 3 pages → C:\ocr-pipeline\caller_batch\step1_batch_output\12071138


[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\latin_PP-OCRv5_mobile_rec`.[0m


OCR → processed_1.png
OCR → processed_2.png
OCR → processed_3.png


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m


[STEP 2] OCR output saved → C:\ocr-pipeline\caller_batch\step2_batch_output\12071138\cv_ocr_raw.json
[STEP 3] Cleaned OCR saved → C:\ocr-pipeline\caller_batch\step3_batch_output\12071138\cv_ocr_cleaned.json
[DONE] 12071138

=== Processing CV: exemple-cv-etudiant-1 ===
[STEP 1] Processed 1 pages → C:\ocr-pipeline\caller_batch\step1_batch_output\exemple-cv-etudiant-1


[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\latin_PP-OCRv5_mobile_rec`.[0m


OCR → processed_1.png


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m


[STEP 2] OCR output saved → C:\ocr-pipeline\caller_batch\step2_batch_output\exemple-cv-etudiant-1\cv_ocr_raw.json
[STEP 3] Cleaned OCR saved → C:\ocr-pipeline\caller_batch\step3_batch_output\exemple-cv-etudiant-1\cv_ocr_cleaned.json
[DONE] exemple-cv-etudiant-1

=== Processing CV: exemple-cv-lyceen-1 ===
[STEP 1] Processed 1 pages → C:\ocr-pipeline\caller_batch\step1_batch_output\exemple-cv-lyceen-1


[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\latin_PP-OCRv5_mobile_rec`.[0m


OCR → processed_1.png


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m


[STEP 2] OCR output saved → C:\ocr-pipeline\caller_batch\step2_batch_output\exemple-cv-lyceen-1\cv_ocr_raw.json
[STEP 3] Cleaned OCR saved → C:\ocr-pipeline\caller_batch\step3_batch_output\exemple-cv-lyceen-1\cv_ocr_cleaned.json
[DONE] exemple-cv-lyceen-1

=== Processing CV: Screenshot 2026-01-01 212822 ===
[STEP 1] Processed 1 pages → C:\ocr-pipeline\caller_batch\step1_batch_output\Screenshot 2026-01-01 212822


[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\undop\.paddlex\official_models\latin_PP-OCRv5_mobile_rec`.[0m


OCR → processed_1.png
[STEP 2] OCR output saved → C:\ocr-pipeline\caller_batch\step2_batch_output\Screenshot 2026-01-01 212822\cv_ocr_raw.json
[STEP 3] Cleaned OCR saved → C:\ocr-pipeline\caller_batch\step3_batch_output\Screenshot 2026-01-01 212822\cv_ocr_cleaned.json
[DONE] Screenshot 2026-01-01 212822


In [1]:
# =====================================================
# Step 5: Embeddings
# =====================================================
# Imports the Step 5 module from its own folder and runs its entry point
# in-process.
import sys
from pathlib import Path

# Make the Step 5 folder importable from this notebook
step5_folder = r"C:\ocr-pipeline\step5_embeddings"
sys.path.append(step5_folder)

from step5_embeddings import main as build_embeddings

# First argument: the cv_rag_blocks.json file under step4_output
# (presumably written by a Step 4 run — confirm it exists before running).
rag_blocks_json = r"C:\ocr-pipeline\step4_rag\step4_output\cv_rag_blocks.json"
# Second argument: the Step 5 output folder.
embeddings_output_dir = r"C:\ocr-pipeline\step5_embeddings\step5_output"

build_embeddings(rag_blocks_json, embeddings_output_dir)


  from .autonotebook import tqdm as notebook_tqdm


[STEP 5] Stored 3 embeddings in Chroma


In [2]:
# =====================================================
# Step 6: Retrieval (Chroma)
# =====================================================
# Imports the Step 6 module from its own folder and runs one query against
# the Step 5 output folder.
import sys
from pathlib import Path

# Make the Step 6 folder importable from this notebook
step6_folder = r"C:\ocr-pipeline\step6_retrieval"
sys.path.append(step6_folder)

from step6_retrieval import main as retrieve_blocks

# Same folder that the Step 5 cell passes as its output location
embeddings_store_dir = r"C:\ocr-pipeline\step5_embeddings\step5_output"
retrieval_query = "What experience does the candidate have in data science?"
num_blocks = 3  # presumably the number of blocks to return — confirm in step6_retrieval.main

# Kept as the cell's last expression so the notebook displays the returned value
retrieve_blocks(embeddings_store_dir, retrieval_query, num_blocks)


[STEP 6] Retrieved blocks:

Doc: cv_001 | Page: 1
EXPERIENCES / PROJETS APPRENTISSAGE EN DATA SCIENCE - AUTOFORMATION / PROJETS PERSONNELS 2024 - Present Formation en Data Science et Machine Learning (Python, Pandas) FORMATEUR / RESPONSABLE PEDAGOGIQUE - SUP'CENTER, MARRAKECH 2019 - 2022 NAJI Soutien en eco internationale/developpement et informati 

Doc: cv_001 | Page: 1
Nr o2, Elizdihar, Marrakech 2005 - 2008 PRIMAIRE, SAFI 06.05.32.81.59 2EME A 6EME ANNEE : ECOLE PRIMAIRE IBN BATOUTA, 1ERE ANNEE : ECOLE PRIMAIRE ALLAL BEN ABDELLAH, 1999 - 2005 tps://github.com/TakiEddineNaji Takieddine,naji@gmail.com CERTIFICATIONS & FORMATIONS COMPLEMENTAIRES COMPETENCES IBM Data 

Doc: cv_001 | Page: 1
Economiste de formation. appliquant Developpement des competences en communication et argumentation mes competences analytiques s et mon savoir-faire a des projets concrets avec capacite d'a adaptation selon le poste. EDUCATION LANGUES MASTER DE RECHERCHE EN ECONOMIE INTERNATIONALE, GOUVERNANCE ET 

{'ids': [['block_0', 'block_2', 'block_1']],
 'embeddings': None,
 'documents': [["EXPERIENCES / PROJETS APPRENTISSAGE EN DATA SCIENCE - AUTOFORMATION / PROJETS PERSONNELS 2024 - Present Formation en Data Science et Machine Learning (Python, Pandas) FORMATEUR / RESPONSABLE PEDAGOGIQUE - SUP'CENTER, MARRAKECH 2019 - 2022 NAJI Soutien en eco internationale/developpement et informatique de gestion Enseignement en ligne et presentiel TAKI Gestion des reseaux sociaux et contenus graphiques EDDINE Developpement des competences pedagogiques et communication PROJETS FREELANCE / PERSONNEL 2014 - 2022 (approx.) ECONOMISTE / ANALYSTE Personnalisation et publication d'applications mobiles Technicien en Maintenance Informatique Conception, assemblage et revente de composants PC Design graphique e et projets numeriques personnels (logos,mini-sites web DEBAT / COMMUNICATION - CLC OPEN DEBATE CENTER A PROPOS DE MOI MARRAKECH 2013 -2014 Finaliste et meilleur orateur en anglais",
   'Nr o2, Elizdihar, M