In [None]:
import os
import random
import shutil
import time
from itertools import zip_longest

import cv2
import pandas as pd
import pytesseract
from PIL import Image as PILImage
from pytesseract import Output
from tqdm import tqdm
from wand.image import Image as WandImage

In [None]:
def generate_image_sample(input_image_folder, output_image_folder, num_samples=20):
    """Move a random sample of files into a new folder, renamed 1.png, 2.png, ...

    Args:
        input_image_folder: Folder to sample from. The chosen files are moved
            (not copied), so they are gone from this folder afterwards.
        output_image_folder: Destination folder; created if it does not exist.
        num_samples: How many entries to pick (default 20).

    Raises:
        ValueError: Raised by ``random.sample`` when ``num_samples`` is larger
            than the number of entries in ``input_image_folder``.
    """
    os.makedirs(output_image_folder, exist_ok=True)

    candidates = os.listdir(input_image_folder)
    chosen = random.sample(candidates, num_samples)

    for index, image_name in enumerate(chosen, start=1):
        source_path = os.path.join(input_image_folder, image_name)
        # The target always gets a ".png" suffix, whatever the source file was called.
        target_path = os.path.join(output_image_folder, f"{index}.png")
        shutil.move(source_path, target_path)

In [None]:
def generate_character_bounding_boxes(input_image_folder, result_image_folder):
    """Draw Tesseract character boxes on every image in a folder and save the copies.

    Args:
        input_image_folder: Folder whose entries are read with ``cv2.imread``.
        result_image_folder: Folder that receives the annotated images under
            the same file names; created if it does not exist.

    Entries that OpenCV cannot decode are skipped, as are images for which
    Tesseract returns no "char" key (nothing is written for those).
    """
    os.makedirs(result_image_folder, exist_ok=True)

    for image_name in os.listdir(input_image_folder):
        image_path = os.path.join(input_image_folder, image_name)
        bb_image_path = os.path.join(result_image_folder, image_name)

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising
            # (sub-directories, hidden files, non-image files).
            continue
        image_height = image.shape[0]

        image_boxes = pytesseract.image_to_boxes(image, output_type=Output.DICT)
        if "char" not in image_boxes:
            continue

        corners = zip(
            image_boxes["left"],
            image_boxes["bottom"],
            image_boxes["right"],
            image_boxes["top"],
        )
        for left, bottom, right, top in corners:
            # Box y-values are measured from the bottom edge, OpenCV's from the
            # top edge, hence the flip against the image height.
            cv2.rectangle(
                image,
                (left, image_height - bottom),
                (right, image_height - top),
                (0, 255, 0),
                0,
            )

        cv2.imwrite(bb_image_path, image)

In [None]:
def generate_word_bounding_boxes(input_image_folder, result_image_folder):
    """Draw Tesseract word boxes on every image in a folder and save the copies.

    Args:
        input_image_folder: Folder whose entries are read with ``cv2.imread``.
        result_image_folder: Folder that receives the annotated images under
            the same file names; created if it does not exist.

    Entries that OpenCV cannot decode are skipped, as are images for which
    Tesseract returns no "text" key. Rows whose confidence is -1 get no box.
    """
    os.makedirs(result_image_folder, exist_ok=True)

    for image_name in os.listdir(input_image_folder):
        image_path = os.path.join(input_image_folder, image_name)
        bb_image_path = os.path.join(result_image_folder, image_name)

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising
            # (sub-directories, hidden files, non-image files).
            continue

        image_boxes = pytesseract.image_to_data(image, output_type=Output.DICT)
        if "text" not in image_boxes:
            continue

        for i in range(len(image_boxes["text"])):
            # Depending on the pytesseract version, conf may be an int, a float
            # or a string such as '-1'; normalise before comparing so that
            # non-word rows are reliably skipped.
            if float(image_boxes["conf"][i]) == -1:
                continue

            x, y = image_boxes["left"][i], image_boxes["top"][i]
            w, h = image_boxes["width"][i], image_boxes["height"][i]
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 1)

        cv2.imwrite(bb_image_path, image)

In [None]:
def add_noise(input_image_folder, result_image_folder):
    """Write a Gaussian-blurred copy of every file in a folder.

    Despite the name, the degradation applied is a Gaussian blur (sigma 1.5)
    performed with Wand.

    Args:
        input_image_folder: Folder containing the images to degrade.
        result_image_folder: Folder that receives the blurred images under the
            same file names; created if it does not exist.
    """
    # The input listing is taken before the output folder is created.
    file_names = os.listdir(input_image_folder)
    os.makedirs(result_image_folder, exist_ok=True)

    path_pairs = [
        (
            os.path.join(input_image_folder, file_name),
            os.path.join(result_image_folder, file_name),
        )
        for file_name in file_names
    ]

    for source_path, blurred_path in path_pairs:
        with WandImage(filename=source_path) as wand_image:
            wand_image.gaussian_blur(sigma=1.5)
            wand_image.save(filename=blurred_path)

In [None]:
def get_text(folder, name):
    """Run Tesseract on one image and return the detected text entries.

    Args:
        folder: Folder that contains the image.
        name: File name of the image inside ``folder``.

    Returns:
        The "text" list from ``pytesseract.image_to_data`` (one entry per row
        of Tesseract's output).

    Raises:
        FileNotFoundError: If OpenCV cannot read the image (file missing or
            not decodable).
    """
    image_path = os.path.join(folder, name)

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None on failure; fail here with the offending path
        # instead of letting pytesseract reject the None later.
        raise FileNotFoundError(f"Could not read image (missing or not decodable): {image_path}")

    image_boxes = pytesseract.image_to_data(image, output_type=Output.DICT)
    return image_boxes["text"]

def calculate_edit_distance(word1, word2):
    """Return the Levenshtein distance between two sequences.

    Insertions, deletions and substitutions each cost 1. Only two rows of the
    dynamic-programming table are kept at any time.

    Args:
        word1: First sequence (indexed along the rows).
        word2: Second sequence (indexed along the columns).

    Returns:
        The minimum number of single-element edits turning ``word1`` into ``word2``.
    """
    width = len(word2)

    # Row 0: turning the empty prefix of word1 into word2[:j] takes j insertions.
    previous_row = list(range(width + 1))

    for row in range(1, len(word1) + 1):
        # Column 0: turning word1[:row] into the empty prefix takes `row` deletions.
        current_row = [row] + [0] * width
        for col in range(1, width + 1):
            if word1[row - 1] == word2[col - 1]:
                current_row[col] = previous_row[col - 1]
            else:
                current_row[col] = 1 + min(
                    previous_row[col],        # delete from word1
                    current_row[col - 1],     # insert into word1
                    previous_row[col - 1],    # substitute
                )
        previous_row = current_row

    return previous_row[width]

In [None]:
def compare_results(png_folder, jpeg_org_folder, jpeg_noise_folder, jpeg_org_sr_folder, jpeg_noise_sr_folder, csv_folder, num_images=None):
    """Compare OCR output of PNG images against four JPEG variants and write CSVs.

    For every index ``i`` in ``1..num_images`` the text entries of ``{i}.png``
    (treated as ground truth) are paired position by position with those of
    ``{i}.jpeg`` in each JPEG folder, and the edit distance of every pair is
    recorded. Pairing is purely positional; when the OCR outputs differ in
    length, the shorter ones are padded with empty strings so no entry is
    silently dropped.

    Args:
        png_folder: Folder with the ground-truth ``{i}.png`` images.
        jpeg_org_folder: Folder with the original ``{i}.jpeg`` images.
        jpeg_noise_folder: Folder with the noise-added ``{i}.jpeg`` images.
        jpeg_org_sr_folder: Folder with super-resolved original ``{i}.jpeg`` images.
        jpeg_noise_sr_folder: Folder with super-resolved noise-added ``{i}.jpeg`` images.
        csv_folder: Output folder. ``{i}.csv`` holds all rows;
            ``wrong_detection/{i}.csv`` holds only rows where at least one
            edit distance is non-zero.
        num_images: Number of images to process. Defaults to the number of
            ``.png`` files found in ``png_folder``.
    """
    os.makedirs(csv_folder, exist_ok=True)
    wrong_detections_folder = os.path.join(csv_folder, "wrong_detection")
    os.makedirs(wrong_detections_folder, exist_ok=True)

    if num_images is None:
        # Count PNG files only, so stray entries (hidden files, checkpoints,
        # sub-folders) do not make the loop ask for images that do not exist.
        num_images = sum(
            1 for entry in os.listdir(png_folder) if entry.lower().endswith(".png")
        )

    # (text column, edit-distance column, source folder), in CSV column order.
    variants = [
        ("jpeg_org", "edit_dist_org", jpeg_org_folder),
        ("jpeg_noise", "edit_dist_noise", jpeg_noise_folder),
        ("jpeg_sr_org", "edit_dist_sr_org", jpeg_org_sr_folder),
        ("jpeg_sr_noise", "edit_dist_sr_noise", jpeg_noise_sr_folder),
    ]
    distance_columns = [dist_col for _, dist_col, _ in variants]

    for i in tqdm(range(1, num_images + 1)):
        png_text = get_text(png_folder, f"{i}.png")
        variant_texts = [get_text(folder, f"{i}.jpeg") for _, _, folder in variants]

        cmp_dict = {"png(gt)": []}
        for text_col, dist_col, _ in variants:
            cmp_dict[text_col] = []
            cmp_dict[dist_col] = []

        # zip_longest (not zip) so trailing entries of longer outputs are kept
        # and show up as errors against the padded empty strings.
        for png_word, *variant_words in zip_longest(png_text, *variant_texts, fillvalue=""):
            cmp_dict["png(gt)"].append(png_word)
            for (text_col, dist_col, _), word in zip(variants, variant_words):
                cmp_dict[text_col].append(word)
                cmp_dict[dist_col].append(calculate_edit_distance(png_word, word))

        cmp_df = pd.DataFrame.from_dict(cmp_dict)
        cmp_df.to_csv(os.path.join(csv_folder, f"{i}.csv"), index=False)

        # Keep only the rows where any variant disagrees with the ground truth.
        error_df = cmp_df.loc[(cmp_df[distance_columns] != 0).any(axis=1)]
        error_df.to_csv(os.path.join(wrong_detections_folder, f"{i}.csv"), index=False)

# Processing PNG images

In [None]:
# Build the degraded PNG set: add_noise writes a Gaussian-blurred copy of every
# file in lr_images_original into lr_images_noise, keeping the file names.
# `current_dir` defined here is reused by later cells.

current_dir = os.getcwd()
original_image_folder = os.path.join(current_dir, "screenshots/png/images/lr_images_original")
noisy_image_folder = os.path.join(current_dir, "screenshots/png/images/lr_images_noise")

add_noise(original_image_folder, noisy_image_folder)

# Generating character-level bounding boxes

In [None]:
# Run Tesseract on the original PNG images and save copies annotated with
# character-level bounding boxes.
# Relies on `current_dir` being defined by an earlier cell.

lr_image_folder = os.path.join(current_dir, "screenshots/png/images/lr_images_original")
lr_bb_image_folder = os.path.join(current_dir, "screenshots/png/images_char_bb/lr_images_original_bb")

generate_character_bounding_boxes(lr_image_folder, lr_bb_image_folder)

In [None]:
# Run Tesseract on the noise-added PNG images and save copies annotated with
# character-level bounding boxes.
# Relies on `current_dir` being defined by an earlier cell.

lr_image_folder = os.path.join(current_dir, "screenshots/png/images/lr_images_noise")
lr_bb_image_folder = os.path.join(current_dir, "screenshots/png/images_char_bb/lr_images_noise_bb")

generate_character_bounding_boxes(lr_image_folder, lr_bb_image_folder)

In [None]:
# Run Tesseract on the super-resolution PNG images (sr_images_original) and
# save copies annotated with character-level bounding boxes.

current_dir = os.getcwd()
sr_image_folder = os.path.join(current_dir, "screenshots/png/images/sr_images_original")
sr_bb_image_folder = os.path.join(current_dir, "screenshots/png/images_char_bb/sr_images_original_bb")

generate_character_bounding_boxes(sr_image_folder, sr_bb_image_folder)

In [None]:
# Run Tesseract on the super-resolution noise-added PNG images (sr_images_noise)
# and save copies annotated with character-level bounding boxes.

current_dir = os.getcwd()
sr_image_folder = os.path.join(current_dir, "screenshots/png/images/sr_images_noise")
sr_bb_image_folder = os.path.join(current_dir, "screenshots/png/images_char_bb/sr_images_noise_bb")

generate_character_bounding_boxes(sr_image_folder, sr_bb_image_folder)

# Generating word-level bounding boxes

In [None]:
# Run Tesseract on the original PNG images and save copies annotated with
# word-level bounding boxes.
# Relies on `current_dir` being defined by an earlier cell.

lr_image_folder = os.path.join(current_dir, "screenshots/png/images/lr_images_original")
lr_bb_image_folder = os.path.join(current_dir, "screenshots/png/images_word_bb/lr_images_original_bb")

generate_word_bounding_boxes(lr_image_folder, lr_bb_image_folder)

In [None]:
# Run Tesseract on the noise-added PNG images and save copies annotated with
# word-level bounding boxes.
# Relies on `current_dir` being defined by an earlier cell.

lr_image_folder = os.path.join(current_dir, "screenshots/png/images/lr_images_noise")
lr_bb_image_folder = os.path.join(current_dir, "screenshots/png/images_word_bb/lr_images_noise_bb")

generate_word_bounding_boxes(lr_image_folder, lr_bb_image_folder)

In [None]:
# Run Tesseract on the super-resolution PNG images (sr_images_original) and
# save copies annotated with word-level bounding boxes.

current_dir = os.getcwd()
sr_image_folder = os.path.join(current_dir, "screenshots/png/images/sr_images_original")
sr_bb_image_folder = os.path.join(current_dir, "screenshots/png/images_word_bb/sr_images_original_bb")

generate_word_bounding_boxes(sr_image_folder, sr_bb_image_folder)

In [None]:
# Run Tesseract on the super-resolution noise-added PNG images (sr_images_noise)
# and save copies annotated with word-level bounding boxes.

current_dir = os.getcwd()
sr_image_folder = os.path.join(current_dir, "screenshots/png/images/sr_images_noise")
sr_bb_image_folder = os.path.join(current_dir, "screenshots/png/images_word_bb/sr_images_noise_bb")

generate_word_bounding_boxes(sr_image_folder, sr_bb_image_folder)

# Processing JPEG images

In [None]:
# Build the degraded JPEG set: add_noise writes a Gaussian-blurred copy of every
# file in lr_images_original into lr_images_noise, keeping the file names.
# `current_dir` defined here is reused by later cells.

current_dir = os.getcwd()
original_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/lr_images_original")
noisy_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/lr_images_noise")

add_noise(original_image_folder, noisy_image_folder)

# Generate character-level bounding boxes

In [None]:
# Run Tesseract on the original JPEG images and save copies annotated with
# character-level bounding boxes.
# Relies on `current_dir` being defined by an earlier cell.

lr_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/lr_images_original")
lr_bb_image_folder = os.path.join(current_dir, "screenshots/jpeg/images_bb/lr_images_original_bb")

# The notebook defines no `generate_bounding_boxes`; the character-level helper is the intended one.
generate_character_bounding_boxes(lr_image_folder, lr_bb_image_folder)

In [None]:
# Run Tesseract on the noise-added JPEG images and save copies annotated with
# character-level bounding boxes.
# Relies on `current_dir` being defined by an earlier cell.

lr_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/lr_images_noise")
lr_bb_image_folder = os.path.join(current_dir, "screenshots/jpeg/images_bb/lr_images_noise_bb")

# The notebook defines no `generate_bounding_boxes`; the character-level helper is the intended one.
generate_character_bounding_boxes(lr_image_folder, lr_bb_image_folder)

In [None]:
# Run Tesseract on the super-resolution JPEG images (sr_images_original) and
# save copies annotated with character-level bounding boxes.

current_dir = os.getcwd()
sr_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/sr_images_original")
sr_bb_image_folder = os.path.join(current_dir, "screenshots/jpeg/images_bb/sr_images_original_bb")

# The notebook defines no `generate_bounding_boxes`; the character-level helper is the intended one.
generate_character_bounding_boxes(sr_image_folder, sr_bb_image_folder)

In [None]:
# Run Tesseract on the super-resolution noise-added JPEG images (sr_images_noise)
# and save copies annotated with character-level bounding boxes.

current_dir = os.getcwd()
sr_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/sr_images_noise")
sr_bb_image_folder = os.path.join(current_dir, "screenshots/jpeg/images_bb/sr_images_noise_bb")

# The notebook defines no `generate_bounding_boxes`; the character-level helper is the intended one.
generate_character_bounding_boxes(sr_image_folder, sr_bb_image_folder)

# Generate word-level bounding boxes

In [None]:
# Run Tesseract on the original JPEG images and save copies annotated with
# word-level bounding boxes.
# Relies on `current_dir` being defined by an earlier cell.

lr_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/lr_images_original")
lr_bb_image_folder = os.path.join(current_dir, "screenshots/jpeg/images_word_bb/lr_images_original_bb")

generate_word_bounding_boxes(lr_image_folder, lr_bb_image_folder)

In [None]:
# Run Tesseract on the noise-added JPEG images and save copies annotated with
# word-level bounding boxes.
# Relies on `current_dir` being defined by an earlier cell.

lr_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/lr_images_noise")
lr_bb_image_folder = os.path.join(current_dir, "screenshots/jpeg/images_word_bb/lr_images_noise_bb")

generate_word_bounding_boxes(lr_image_folder, lr_bb_image_folder)

In [None]:
# Run Tesseract on the super-resolution JPEG images (sr_images_original) and
# save copies annotated with word-level bounding boxes.

current_dir = os.getcwd()
sr_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/sr_images_original")
sr_bb_image_folder = os.path.join(current_dir, "screenshots/jpeg/images_word_bb/sr_images_original_bb")

generate_word_bounding_boxes(sr_image_folder, sr_bb_image_folder)

In [None]:
# Run Tesseract on the super-resolution noise-added JPEG images (sr_images_noise)
# and save copies annotated with word-level bounding boxes.

current_dir = os.getcwd()
sr_image_folder = os.path.join(current_dir, "screenshots/jpeg/images/sr_images_noise")
sr_bb_image_folder = os.path.join(current_dir, "screenshots/jpeg/images_word_bb/sr_images_noise_bb")

generate_word_bounding_boxes(sr_image_folder, sr_bb_image_folder)

In [None]:
# Compare the OCR text of each original PNG (the "png(gt)" ground-truth column)
# against the four JPEG variants and write per-image CSVs, plus a
# "wrong_detection" subset, under screenshots/comparison_csv.
current_dir = os.getcwd()
png_folder = os.path.join(current_dir, "screenshots/png/images/lr_images_original")
jpeg_org_folder = os.path.join(current_dir, "screenshots/jpeg/images/lr_images_original")
jpeg_noise_folder = os.path.join(current_dir, "screenshots/jpeg/images/lr_images_noise")
jpeg_org_sr_folder = os.path.join(current_dir, "screenshots/jpeg/images/sr_images_original")
jpeg_noise_sr_folder = os.path.join(current_dir, "screenshots/jpeg/images/sr_images_noise")
csv_folder = os.path.join(current_dir, "screenshots/comparison_csv")

# num_images is left at its default, so compare_results derives it from png_folder.
compare_results(png_folder, jpeg_org_folder, jpeg_noise_folder, jpeg_org_sr_folder, jpeg_noise_sr_folder, csv_folder)