In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

import pytesseract
from PIL import Image, ImageDraw

pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/Cellar/tesseract/5.3.3/bin/tesseract'

def mask_text_cv2(cv2_image):
    # Convert the cv2 image (BGR) to PIL Image (RGB)
    rgb_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_image)

    # Use pytesseract to do OCR on the image
    text_data = pytesseract.image_to_data(pil_image)

    # Create a drawing context
    draw = ImageDraw.Draw(pil_image)
    print(text_data.split('\n')[0])

    # Process the OCR data
    for line in text_data.split('\n')[1:]:
        if line.strip() == '':
            continue

        parts = line.split()
        print(parts)
        if len(parts) >= 12:
            x, y, width, height = map(int, parts[6:10])
            # Draw a white rectangle over the detected text
            draw.rectangle([x, y, x + width, y + height], fill="white")

    # Convert PIL Image back to cv2 format (BGR)
    masked_cv2_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    return masked_cv2_image


def check_match_images(src_img, web_img, visualize=False):
    # Read the images
    image_b = cv2.imread(web_img)
    image_b = mask_text_cv2(image_b)
    image_a = cv2.imread(src_img)

    # SIFT detector
    sift = cv2.SIFT_create()

    # Find keypoints and descriptors
    keypoints_a, descriptors_a = sift.detectAndCompute(image_a, None)
    keypoints_b, descriptors_b = sift.detectAndCompute(image_b, None)

    # FLANN based matcher
    index_params = dict(algorithm=1, trees=5)
    search_params = dict()
    flann = cv2.FlannBasedMatcher(index_params, search_params)

    matches = flann.knnMatch(descriptors_a, descriptors_b, k=2)

    # Keep good matches: Lowe's ratio test
    good_matches = []
    for m, n in matches:
        if m.distance < 0.7 * n.distance:
            good_matches.append(m)

    if len(good_matches) > 10: # adjust this threshold

        image_matches = cv2.drawMatches(image_a, keypoints_a, image_b, keypoints_b, good_matches, None, flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS)
    
        src_pts = np.float32([keypoints_a[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
        dst_pts = np.float32([keypoints_b[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)

        # Find homography
        M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)

        # Use the homography matrix M to transform the corners of Image A to Image B's plane
        h, w = image_a.shape[:2]
        pts = np.float32([[0, 0], [0, h - 1], [w - 1, h - 1], [w - 1, 0]]).reshape(-1, 1, 2)
        dst = cv2.perspectiveTransform(pts, M)

        # Draw the transformed image on Image B
        image_b_with_a = cv2.polylines(image_b, [np.int32(dst)], True, 255, 3, cv2.LINE_AA)

        # gray = cv2.cvtColor(image_b_with_a, cv2.COLOR_BGR2GRAY)
        if visualize:
            fig, ax = plt.subplots(figsize=(10, 10))
            ax.axis('off')
            plt.imshow(image_matches)
            plt.show()

        hb, wb = image_b.shape[:2]
        print(hb, wb)
        print(dst)

        # Extract scale and translation (approximate)
        scale_x = np.linalg.norm(dst[1] - dst[0]) / hb
        scale_y = np.linalg.norm(dst[2] - dst[1]) / wb
        translation = dst[0][0] / np.array([hb, wb])

        print(f"Relative height: {scale_x}, Relative width: {scale_y}")
        print(f"Top-Left Corner Coordinate: {translation}")
        return scale_x, scale_y, translation.tolist()
    else:
        print("Image not found!")
        return None, None, [None, None]
    

# check_match_images('../trial_dataset/rick.jpg', './diyi.png')
check_match_images('../trial_dataset/rick.jpg', './diyi_gpt4.png', True)

In [40]:
from PIL import Image
import numpy as np

file_path = "./diyi_gpt4.png"
template = file_path[:-4] + "{color}" + file_path[-4:]

image = Image.open(template.format(color="")).convert("RGB")
image_array = np.array(image)

image_red = Image.open(template.format(color="_red")).convert("RGB")
image_array_red = np.array(image_red)

image_blue = Image.open(template.format(color="_blue")).convert("RGB")
image_array_blue = np.array(image_blue)

./diyi_gpt4{color}.png


In [60]:
is_image = (image_array_red[:, :, 0] >= 250) & (image_array_red[:, :, 1] <= 5) & (image_array_red[:, :, 2] <= 5) & (image_array_blue[:, :, 0] <= 5) & (image_array_blue[:, :, 1] <= 5) & (image_array_blue[:, :, 2] >= 250)
is_image_coordinates = np.column_stack(np.where(is_image))

In [62]:
print(np.min(is_image_coordinates[:, 0]), np.max(is_image_coordinates[:, 0]), np.min(is_image_coordinates[:, 1]), np.max(is_image_coordinates[:, 1]))

20 119 1160 1259


In [2]:
from paddleocr import PaddleOCR, draw_ocr
import easyocr

reader = easyocr.Reader(['en'])

In [3]:
result = reader.readtext('./diyi.png', paragraph=True)
for item in result:
    print(item[0], item[1])

[[716, 29], [979, 29], [979, 206], [716, 206]] Diyi Yang diviv@cs stanford edu Computer Science Department Natural Lanquaae rocess Group- Stanford Universiti Gates 342
[[703, 256], [796, 256], [796, 274], [703, 274]] Publications
[[944, 256], [1012, 256], [1012, 274], [944, 274]] Teaching
[[188, 313], [375, 313], [375, 341], [188, 341]] Recent Preprints
[[224, 359], [801, 359], [801, 506], [224, 506]] Rehearsal: Simulating Conflict to Teach Conflict Resolution Omar Shaikh; Valentino Chai; Michele J. Gelfand; Yang Michael Bernstein arXiv.2309.12309 . [pdf] Can Large Language Models Transform Computational Social Science? Caleb Ziems William Held  Oma Shaikh Chen Zhehao ) Zhang Diyi Yang v2305,0351
[[226, 525], [998, 525], [998, 589], [226, 589]] Helping Helper: Supporting Peer Counselors via -Empowered Practice Feedbback Shana-Lina Saniav Shah Prathik Senthil Zahra Ashktorab Casev Duqan Werner Gever Div Yang arXiv:2305.08982. [pdfl
[[226, 608], [802, 608], [802, 672], [226, 672]] DvVal:

In [None]:
result = reader.readtext('./diyi_gpt4.png', paragraph=True)
for item in result:
    print(item[0], item[1])

[[17, 53], [134, 53], [134, 92], [17, 92]] Diyi Yang
[[16, 134], [284, 134], [284, 296], [16, 296]] diyiy@stanford.edu Computer Science Department Natural Language Processing Group Stanford University Google S2, 342
[[19, 313], [55, 313], [55, 327], [19, 327]] Home
[[74, 312], [111, 312], [111, 329], [74, 329]] Group
[[131, 313], [199, 313], [199, 327], [131, 327]] Publications
[[218, 311], [271, 311], [271, 329], [218, 329]] Teaching
[[18, 356], [168, 356], [168, 380], [18, 380]] Recent Preprints
[[19, 393], [337, 393], [337, 409], [19, 409]] Rehearsal: Simulating Chat to Facilitate Conflict Resolution
[[647, 411], [893, 411], [893, 429], [647, 429]] Using Large Language Models in Psychology
[[17, 441], [129, 441], [129, 461], [17, 461]] Publications


In [10]:
result = reader.readtext('./diyi.png', paragraph=True)
for item in result:
    print(item[0], item[1])

[[716, 29], [979, 29], [979, 206], [716, 206]] Diyi Yang diviv@cs stanford edu Computer Science Department Natural Lanquaae rocess Group- Stanford Universiti Gates 342
[[703, 256], [796, 256], [796, 274], [703, 274]] Publications
[[944, 256], [1012, 256], [1012, 274], [944, 274]] Teaching
[[188, 313], [375, 313], [375, 341], [188, 341]] Recent Preprints
[[224, 359], [801, 359], [801, 506], [224, 506]] Rehearsal: Simulating Conflict to Teach Conflict Resolution Omar Shaikh; Valentino Chai; Michele J. Gelfand; Yang Michael Bernstein arXiv.2309.12309 . [pdf] Can Large Language Models Transform Computational Social Science? Caleb Ziems William Held  Oma Shaikh Chen Zhehao ) Zhang Diyi Yang v2305,0351
[[226, 525], [998, 525], [998, 589], [226, 589]] Helping Helper: Supporting Peer Counselors via -Empowered Practice Feedbback Shana-Lina Saniav Shah Prathik Senthil Zahra Ashktorab Casev Duqan Werner Gever Div Yang arXiv:2305.08982. [pdfl
[[226, 608], [802, 608], [802, 672], [226, 672]] DvVal:

In [20]:
import cv2
import pytesseract
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher

pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/Cellar/tesseract/5.3.3/bin/tesseract'

def get_ocr_blocks(image_path):
    # This function will use OCR to extract text blocks and their bounding boxes from an image
    image = cv2.imread(image_path)
    img_h, img_w, _ = image.shape
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    blocks = []
    for i in range(len(data['text'])):
        if int(data['conf'][i]) > 50:  # Consider blocks with confidence > 60%
            (x, y, w, h) = (data['left'][i], data['top'][i], data['width'][i], data['height'][i])
            text = data['text'][i].strip()
            blocks.append({'text': text, 'bbox': (x / img_w, y / img_h, w / img_w, h / img_h)})
    return blocks

def match_blocks(blocks1, blocks2, v_scale=0.1):
    # This function will match blocks between two sets based on text similarity, spatial location, and size similarity
    matched_blocks = []
    max_distance = (1 + v_scale**2)**0.5
    
    for block1 in blocks1:
        best_match = None
        highest_score = 0
        
        for block2 in blocks2:
            # Text similarity
            text_similarity = SequenceMatcher(None, block1['text'], block2['text']).ratio()
            
            if text_similarity > 0.8:  # Text must be similar above a threshold
                
                # Spatial proximity (normalized by image dimensions for example)
                spatial_proximity = 1 - ((block1['bbox'][0] - block2['bbox'][0])**2 + (block1['bbox'][1] * v_scale - block2['bbox'][1] * v_scale)**2)**0.5 / max_distance
                
                # Size similarity
                # size_similarity = 1 - abs(block1['bbox'][2]*block1['bbox'][3] - block2['bbox'][2]*block2['bbox'][3]) / max(block1['bbox'][2]*block1['bbox'][3], block2['bbox'][2]*block2['bbox'][3])

                # Combine the scores with weights as needed
                # combined_score = (text_similarity * 0.6) + (spatial_proximity * 0.2) + (size_similarity * 0.2)
                combined_score = (text_similarity * 0.6) + (spatial_proximity * 0.4)

                print(block2)
                print(combined_score)

                if combined_score > highest_score:
                    highest_score = combined_score
                    best_match = block2

        if best_match:
            matched_blocks.append((block1, best_match))
        
        break
    
    return matched_blocks


def calculate_positional_score(bbox1, bbox2, v_scale=0.1):
    max_distance = (1 + v_scale**2)**0.5

    # Calculate the Euclidean distance between the center points of two bounding boxes
    center1 = (bbox1[0] + bbox1[2] / 2, bbox1[1] + bbox1[3] / 2)
    center2 = (bbox2[0] + bbox2[2] / 2, bbox2[1] + bbox2[3] / 2)
    distance = ((center1[0] - center2[0]) ** 2 + (center1[1] * v_scale - center2[1] * v_scale) ** 2) ** 0.5
    
    # Normalize distance based on a predefined max distance, this value could be tuned
    normalized_distance = min(distance / max_distance, 1)
    
    # Calculate score using exponential decay
    score = 1 - normalized_distance
    
    return score

In [13]:
def layout_consistency(image_path1, image_path2, v_scale=0.1):
    blocks1 = get_ocr_blocks(image_path1)
    blocks2 = get_ocr_blocks(image_path2)
    
    # matched_blocks = match_blocks(blocks1, blocks2, max_distance)
    matched_blocks = match_blocks(blocks1, blocks2, v_scale)
    print(matched_blocks[0])
    return
    
    positional_scores = [calculate_positional_score(block1['bbox'], block2['bbox'], v_scale) for block1, block2 in matched_blocks]
    
    if not positional_scores:
        return 0, []  # No matching blocks, so no consistency
    
    # Average score can be used as a consistency metric
    average_score = sum(positional_scores) / len(positional_scores)
    
    return average_score, matched_blocks

# Usage
# average_iou, matched_blocks = layout_consistency('./diyi_gpt4.png', './diyi.png', max_distance = 1.42)
# print(f"Layout consistency (Average IoU): {average_iou}")
layout_consistency('./diyi_gpt4.png', './diyi.png', v_scale=100)

[{'text': 'Diyi', 'bbox': (0.0171875, 0.08472222222222223, 0.03125, 0.03194444444444444)}, {'text': 'Yang', 'bbox': (0.0546875, 0.08472222222222223, 0.04375, 0.03194444444444444)}, {'text': 'diyiy@stanford.edu', 'bbox': (0.015625, 0.19305555555555556, 0.10625, 0.020833333333333332)}, {'text': 'Computer', 'bbox': (0.01640625, 0.24027777777777778, 0.05390625, 0.020833333333333332)}, {'text': 'Science', 'bbox': (0.07421875, 0.24027777777777778, 0.04375, 0.016666666666666666)}, {'text': 'Department', 'bbox': (0.12265625, 0.24027777777777778, 0.06484375, 0.020833333333333332)}, {'text': 'Natural', 'bbox': (0.01640625, 0.28888888888888886, 0.0390625, 0.016666666666666666)}, {'text': 'Language', 'bbox': (0.06015625, 0.28888888888888886, 0.0546875, 0.020833333333333332)}, {'text': 'Processing', 'bbox': (0.11953125, 0.28888888888888886, 0.06015625, 0.020833333333333332)}, {'text': 'Group', 'bbox': (0.184375, 0.28888888888888886, 0.03359375, 0.020833333333333332)}, {'text': 'Stanford', 'bbox': (

In [21]:
blocks1 = get_ocr_blocks('./diyi_gpt4.png')
blocks2 = get_ocr_blocks('./diyi.png')

In [23]:
def group_blocks_by_row(blocks, line_overlap_threshold=0.5):
    """
    Group blocks into rows based on their bounding box y-coordinates.
    Blocks that have y-overlapping bounding boxes within a threshold are considered to be on the same row.

    :param blocks: List of block dictionaries with 'bbox' as one of the keys.
    :param line_overlap_threshold: Threshold for considering blocks to be on the same line (relative to image height).
    :return: A list of lists of blocks, with each inner list representing a row.
    """
    # Sort blocks by the top y-coordinate
    sorted_blocks = sorted(blocks, key=lambda b: b['bbox'][1])
    
    rows = []
    current_row = []
    
    for block in sorted_blocks:
        # If current_row is empty, start a new row with the current block
        if not current_row:
            current_row.append(block)
        else:
            # Compare the current block with the last block in the current row
            last_block_in_row = current_row[-1]
            # Calculate the vertical overlap between the two blocks
            top_y_current = block['bbox'][1]
            bottom_y_last = last_block_in_row['bbox'][1] + last_block_in_row['bbox'][3]
            vertical_overlap = max(0, bottom_y_last - top_y_current)
            
            # If there is enough overlap, add the block to the current row
            if vertical_overlap > line_overlap_threshold * min(last_block_in_row['bbox'][3], block['bbox'][3]):
                current_row.append(block)
            else:
                # Otherwise, the current block starts a new row
                rows.append(current_row)
                current_row = [block]
    
    # Add the last row if it's not empty
    if current_row:
        rows.append(current_row)
    
    return rows


def rank_based_match_blocks(blocks1, blocks2):
    # This function will match blocks between two sets based on text similarity, spatial location, and size similarity
    matched_blocks = []

    grouped_blocks1 = group_blocks_by_row(blocks1)
    grouped_blocks2 = group_blocks_by_row(blocks2)


matched_blocks = rank_based_match_blocks(blocks1, blocks2)

[{'text': 'Diyi', 'bbox': (0.0171875, 0.08472222222222223, 0.03125, 0.03194444444444444)}, {'text': 'Yang', 'bbox': (0.0546875, 0.08472222222222223, 0.04375, 0.03194444444444444)}]
[{'text': 'diyiy@stanford.edu', 'bbox': (0.015625, 0.19305555555555556, 0.10625, 0.020833333333333332)}]
[{'text': 'Computer', 'bbox': (0.01640625, 0.24027777777777778, 0.05390625, 0.020833333333333332)}, {'text': 'Science', 'bbox': (0.07421875, 0.24027777777777778, 0.04375, 0.016666666666666666)}, {'text': 'Department', 'bbox': (0.12265625, 0.24027777777777778, 0.06484375, 0.020833333333333332)}]
[{'text': 'Natural', 'bbox': (0.01640625, 0.28888888888888886, 0.0390625, 0.016666666666666666)}, {'text': 'Language', 'bbox': (0.06015625, 0.28888888888888886, 0.0546875, 0.020833333333333332)}, {'text': 'Processing', 'bbox': (0.11953125, 0.28888888888888886, 0.06015625, 0.020833333333333332)}, {'text': 'Group', 'bbox': (0.184375, 0.28888888888888886, 0.03359375, 0.020833333333333332)}]
[{'text': 'Stanford', 'bbox