In [None]:
import layoutparser as lp
import pymupdf
from PIL import Image
import cv2
import numpy as np
import json
from tqdm import tqdm
import pickle

import faiss
import torch
from transformers import CLIPProcessor, CLIPModel, CLIPConfig, CLIPTokenizer, AutoTokenizer, AutoModelForSequenceClassification
from langchain_text_splitters import RecursiveCharacterTextSplitter
import voyageai

import os
# Allow duplicate OpenMP runtimes to coexist in one process; without this the
# kernel can abort when several loaded libraries each ship their own copy.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# PyTorch names the NVIDIA backend 'cuda'; 'gpu' is not a valid device type and
# would raise as soon as it is handed to torch.device() / .to().
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Model experimentation

In [None]:
# Experiment section: Q-A over 3 documents
from document_analysis import DocumentAnalysis
import numpy as np
import json

# Helper functions
# Formatting Q-A pairs with COCO annotations
# Formula for IoU
def calculate_iou(box1, box2):
    """Intersection-over-Union of two axis-aligned boxes.

    Both boxes are indexed as (x1, y1, x2, y2) and are converted to float32
    before any arithmetic.

    Returns:
        The IoU as a NumPy scalar clipped to the range [0.0, 1.0].
    """
    a = np.array(box1, dtype=np.float32)
    b = np.array(box2, dtype=np.float32)

    # Overlap extent along each axis, floored at zero for disjoint boxes
    overlap_w = np.maximum(0, np.minimum(a[2], b[2]) - np.maximum(a[0], b[0]))
    overlap_h = np.maximum(0, np.minimum(a[3], b[3]) - np.maximum(a[1], b[1]))
    intersection = overlap_w * overlap_h

    # Individual box areas
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])

    # Floor the union at a tiny positive value so the division is always defined
    union = np.maximum(area_a + area_b - intersection, 1e-10)

    return np.clip(intersection / union, 0.0, 1.0)

# Convert (x, y, width, height) to (x1, y1, x2, y2)
def coco_to_coordinates(bbox):
    """Turn a COCO-style (x, y, width, height) box into [x1, y1, x2, y2] corners."""
    left, top, box_w, box_h = bbox
    right = left + box_w
    bottom = top + box_h
    return [left, top, right, bottom]


# Filename init
data_dir = "../data/"
data_annotations_dir = "../data/annotations/"
doc_list = ["1706.03762.pdf", "imagenet-classification.pdf", "2010.11929v2.pdf"]
annotations_list = ["annotations-attention", "annotations-imagenet", "annotations-vit"]
qa_pairs_list = ["qa-pairs-attention", "qa-pairs-imagenet", "qa-pairs-vit"]

# A retrieved region counts as correct when its IoU with the ground truth exceeds this
iou_threshold = 0.5
# Print every question, its retrieved chunks and the per-answer IoU values
verbose = True


def _load_json(path):
    """Parse a JSON file, closing the file handle as soon as parsing finishes."""
    with open(path, 'r') as handle:
        return json.load(handle)


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Metrics, aggregated over every document.
# The denominator is counted from the loaded Q-A files rather than assumed.
total_questions = 0
correct_pages_top1 = 0
correct_regions_top1 = 0
correct_regions_iou_top1 = 0.0
correct_pages_top5 = 0
correct_regions_top5 = 0
correct_regions_iou_top5 = 0.0

for doc_name, annotations_name, qa_pairs_name in zip(doc_list, annotations_list, qa_pairs_list):
    print(f"\n\nCurrent doc: {doc_name}")
    # `annotations` is loaded for inspection; the metrics below only read `qa_pairs`
    annotations = _load_json(data_annotations_dir + annotations_name + '.json')
    qa_pairs = _load_json(data_annotations_dir + qa_pairs_name + '.json')

    # New pipeline, read and process
    pipeline = DocumentAnalysis(vector_dir = '../data/.vectorstore/')
    # First run only (one-time, update schema): build and persist the index
    # doc = pipeline.read_from_path(data_dir + doc_name)
    # pipeline.process_document(doc)
    # pipeline.faiss_persist(subdir = doc_name + '/')
    pipeline.faiss_read(subdir = doc_name + '/') # If document has been processed and stored prior

    # Metrics for current paper
    cpt1_trial = 0
    crt1_trial = 0
    crtiou1_trial = 0.0
    cpt5_trial = 0
    crt5_trial = 0
    crtiou5_trial = 0.0

    # Q-A assessment
    for qa in qa_pairs:
        # NOTE(review): positional unpack relies on the key order of qa['answer']
        # being (value, page, bbox) in the JSON file - confirm against the schema.
        qvalue, qpage, qbbox = qa['answer'].values()
        qbbox = tuple(coco_to_coordinates(qbbox)) # Standardize to LayoutParser bbox system

        answers = pipeline.search_faiss(qa['question'])
        atext = [a['content'] for a in answers]
        apages = [a['page']+1 for a in answers] # Add 1 to convert from index to numbering
        abboxes = [a['bbox'] for a in answers]

        # Verbose illustration
        if verbose:
            # Outer double quotes: reusing the same quote type inside an f-string
            # is a SyntaxError on Python versions before 3.12.
            print(f"\nQuestion: {qa['question']}")
            for i in zip(atext, apages, abboxes):
                print(i)

        # Top-1 metric
        # Answers are ranked best-first; guard against an empty result list
        if apages and apages[0] == qpage:
            cpt1_trial += 1
            iou = calculate_iou(qbbox, abboxes[0])
            if verbose: print(f'iou_top1: {iou}')
            if iou > iou_threshold:
                crt1_trial += 1
                crtiou1_trial += iou

        # Top-5 metric
        if qpage in apages:
            if verbose: print(qpage, apages)
            cpt5_trial += 1
            for apage, abbox in zip(apages, abboxes):
                if apage == qpage:
                    iou = calculate_iou(qbbox, abbox)
                    if verbose: print(f'iou_top5: {iou}')
                    if iou > iou_threshold:
                        crt5_trial += 1
                        crtiou5_trial += iou
                        break # If correct found, skip remaining chunks

    # Print document-specific metrics
    # Mean IoU is averaged over matched regions only, so it is NaN when none matched
    qa_length = len(qa_pairs)
    print("Top-1")
    print(f'Correct pages: {_ratio(cpt1_trial, qa_length)}')
    print(f'Correct regions: {_ratio(crt1_trial, qa_length)}, IoU: {_ratio(crtiou1_trial, crt1_trial)}')
    print("Top-5")
    print(f"Correct pages: {_ratio(cpt5_trial, qa_length)}")
    print(f'Correct regions: {_ratio(crt5_trial, qa_length)}, IoU: {_ratio(crtiou5_trial, crt5_trial)}')
    print('\n\n')

    # Aggregate
    total_questions += qa_length
    correct_pages_top1 += cpt1_trial
    correct_regions_top1 += crt1_trial
    correct_regions_iou_top1 += crtiou1_trial
    correct_pages_top5 += cpt5_trial
    correct_regions_top5 += crt5_trial
    correct_regions_iou_top5 += crtiou5_trial

# Print overall metrics
print("Top-1")
print(f'Correct pages: {_ratio(correct_pages_top1, total_questions)}')
print(f'Correct regions: {_ratio(correct_regions_top1, total_questions)}, IoU: {_ratio(correct_regions_iou_top1, correct_regions_top1)}')
print("Top-5")
print(f"Correct pages: {_ratio(correct_pages_top5, total_questions)}")
print(f'Correct regions: {_ratio(correct_regions_top5, total_questions)}, IoU: {_ratio(correct_regions_iou_top5, correct_regions_top5)}')


In [None]:
# Counting model sizes
# Import libraries in first cell

# Cross encoder size
crossencoder = AutoModelForSequenceClassification.from_pretrained("cross-encoder/ms-marco-MiniLM-L6-v2")
print(f"Total number of parameters (Cross Encoder): {crossencoder.num_parameters()}")


# Detectron2 Model Size
# Build the cache path from the current user's home directory instead of
# hard-coding one machine's absolute path with mixed separators.
detectron2_path = os.path.join(
    os.path.expanduser("~"),
    ".torch", "iopath_cache", "s", "d9fc9tahfzyl6df", "model_final.pth",
)
# NOTE(review): torch.load unpickles the file - only point this at a checkpoint
# obtained from a trusted source.
weights = torch.load(detectron2_path, map_location=torch.device("cpu"))
# If the file contains a Detectron2 model checkpoint, extract the state_dict
if "model" in weights:
    weights = weights["model"]
# Count total parameters
total_params = sum(p.numel() for p in weights.values())
print(f"Total number of parameters (Detectron2): {total_params}")

Total number of parameters (Detectron2): 44051248
