In [1]:
# Standard library
import json
import os
import re

# Third-party
import pytesseract
from PIL import Image
from rank_bm25 import BM25Okapi

# Location of the Tesseract executable.
# Resolution order:
#   1. the TESSERACT_CMD environment variable, when set;
#   2. the Windows install path this notebook was originally written against,
#      but only if that file actually exists on this machine;
#   3. otherwise pytesseract's own default is left untouched, which looks up
#      `tesseract` on PATH.
DEFAULT_WINDOWS_TESSERACT_CMD = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

TESSERACT_CMD = os.environ.get('TESSERACT_CMD')
if not TESSERACT_CMD and os.path.isfile(DEFAULT_WINDOWS_TESSERACT_CMD):
    TESSERACT_CMD = DEFAULT_WINDOWS_TESSERACT_CMD
if TESSERACT_CMD:
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

In [2]:
# Run OCR over every image listed in the dataset JSON, write one passage file
# per image under dataset/ocr_texts, and record that file's path back on the
# image entry in the dataset JSON.

# The dataset file is both read at the start and rewritten after each image,
# so its path is computed once here.
dataset_json_path = os.path.join(os.getcwd(), 'dataset', 'images', 'final_dataset.json')

# Load the original JSON data
with open(dataset_json_path, 'r') as f:
    data = json.load(f)

# Directory to save OCR text files (loop-invariant, so created once up front)
ocr_save_dir = os.path.join(os.getcwd(), 'dataset', 'ocr_texts')
os.makedirs(ocr_save_dir, exist_ok=True)

for image in data['images']:
    image_id = image['id']
    image_path = image['local_path']

    # Load the image and perform OCR. The context manager closes the image
    # file as soon as OCR is done instead of leaving one open handle per image.
    with Image.open(image_path) as img:
        extracted_text = pytesseract.image_to_string(img)

    # Split text by "\n\n" to create passages
    passages = extracted_text.split("\n\n")

    # Prepare OCR JSON file content with passage IDs.
    # passage_id is the 1-based position in the raw split, so IDs are not
    # renumbered when blank passages are dropped (gaps in the IDs are expected).
    ocr_json = {
        "image_id": image_id,
        "passages": [
            {"passage_id": i + 1, "text": passage.strip()}
            for i, passage in enumerate(passages)
            if passage.strip()
        ]
    }

    # Save OCR JSON file
    ocr_file_name = f"{image_id}_ocr.json"
    ocr_file_path = os.path.join(ocr_save_dir, ocr_file_name)
    with open(ocr_file_path, 'w') as ocr_file:
        json.dump(ocr_json, ocr_file, indent=4)

    # Update the main JSON entry with the path to the OCR JSON file
    image['ocr_json_path'] = ocr_file_path

    # Save the updated dataset with new OCR path after each image, so progress
    # made so far is on disk if a later image fails.
    with open(dataset_json_path, 'w') as f:
        json.dump(data, f, indent=4)

    print(f"Processed and saved OCR data for image {image_id} to {ocr_file_path}")

print("All images processed and final dataset updated.")

Processed and saved OCR data for image 1 to c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\ocr_texts\1_ocr.json
Processed and saved OCR data for image 2 to c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\ocr_texts\2_ocr.json
Processed and saved OCR data for image 3 to c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\ocr_texts\3_ocr.json
Processed and saved OCR data for image 4 to c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\ocr_texts\4_ocr.json
Processed and saved OCR data for image 5 to c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\ocr_texts\5_ocr.json
Processed and saved OCR data for image 6 to c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\ocr_texts\6_ocr.json
Processed and saved OCR data for image 7 to c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\ocr_texts\7_ocr.json
Processed and saved OCR data for image 8 

In [11]:
# Build a BM25 index over the OCR passages saved under dataset/ocr_texts.
# Two parallel lists are produced: entry i of `image_info_list` describes the
# passage whose tokens are entry i of `tokenized_extracted_text_list`, and the
# same index i is the position of that passage's score in the BM25 results.

# Path to the directory containing OCR JSON files
ocr_json_dir = os.path.join(os.getcwd(), 'dataset', 'ocr_texts')
ocr_img_dir = os.path.join(os.getcwd(), 'dataset', 'final_dataset')

# Passages with fewer tokens than this are left out of the index.
MIN_PASSAGE_TOKENS = 10

# List to store tokenized passages along with image information
tokenized_extracted_text_list = []
image_info_list = []  # (image_id, passage_id, image_path) per indexed passage

# Iterate through all OCR JSON files in the directory.
# os.listdir returns entries in arbitrary order, so sort them to keep passage
# indices (and therefore tie-breaking between equal scores) stable across runs.
for ocr_file in sorted(os.listdir(ocr_json_dir)):
    ocr_file_path = os.path.join(ocr_json_dir, ocr_file)

    # Skip anything that is not a JSON file (sub-directories, OS metadata
    # files, ...) rather than failing inside json.load.
    if not ocr_file.lower().endswith('.json') or not os.path.isfile(ocr_file_path):
        continue

    # Load OCR JSON file
    with open(ocr_file_path, 'r') as f:
        ocr_data = json.load(f)

    image_id = ocr_data.get('image_id')
    # NOTE(review): the image path is reconstructed as <image_id>.jpg under
    # dataset/final_dataset; the OCR JSON itself does not store the path.
    # Confirm this matches the 'local_path' values in the main dataset.
    image_path = os.path.join(ocr_img_dir, f"{image_id}.jpg")

    # Extract passages from the OCR JSON
    for passage in ocr_data['passages']:
        # Tokenize passage text: lowercase, keep runs of word characters only
        tokenized_passage = re.findall(r'\b\w+\b', passage['text'].lower())
        if len(tokenized_passage) < MIN_PASSAGE_TOKENS:
            continue
        tokenized_extracted_text_list.append(tokenized_passage)
        image_info_list.append((image_id, passage['passage_id'], image_path))

# Fail with an actionable message instead of an opaque error from inside
# BM25Okapi when there is nothing to index.
if not tokenized_extracted_text_list:
    raise ValueError(
        f"No passages with at least {MIN_PASSAGE_TOKENS} tokens found in {ocr_json_dir}; "
        "run the OCR extraction cell first."
    )

bm25 = BM25Okapi(tokenized_extracted_text_list)

In [12]:
# Score every indexed passage against a free-text query and print the best ones.
query = "university of paris"

# Tokenize the query exactly like the indexed passages (lowercase, runs of
# word characters). A plain split(" ") keeps capitals and punctuation, which
# never match the lowercased, punctuation-free tokens stored in the index.
tokenized_query = re.findall(r'\b\w+\b', query.lower())

# One score per indexed passage, in the same order as tokenized_extracted_text_list
scores = bm25.get_scores(tokenized_query)

top_n = 5
# Indices of the top_n highest-scoring passages, best first
top_passages_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]

print(f"Top {top_n} Passages (based on BM25 scores):")
for idx in top_passages_idx:
    passage = ' '.join(tokenized_extracted_text_list[idx])  # Join tokens back into a string
    image_id, passage_id, image_path = image_info_list[idx]  # Metadata stored alongside the passage
    print(f"Passage: {passage}")
    print(f'Passage ID: {passage_id}')
    print(f"Source Image ID: {image_id}")
    print(f"Image Path: {image_path}\n")

Top 5 Passages (based on BM25 scores):
Passage: university of paris and fs the author of remarkable studies on the social aspects of finglish literature especially on the novels of the middle of the nineteenth century he will speak on the unity of france the france of today and tomorrow and the personality of france
Passage ID: 25
Source Image ID: 6
Image Path: c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\final_dataset\6.jpg

Passage: university of paris now of columbia univer sity he is a world wide traveler he has traveled all over europe america asia and contributed several enlightening studies of comparative literature the subjects of lectures are human ea in french literature and modern french poetry
Passage ID: 23
Source Image ID: 6
Image Path: c:\Users\sidda\OneDrive\Documents\DL_project\Document_Retrieval\dataset\final_dataset\6.jpg

Passage: emanuel de martonne of the i tniversity of paris exchange professor at 3 é columbian 1916 of him the bulletin 