In [50]:
import en_core_web_sm
import fitz
import spacy

import re

In [40]:
# Load the spaCy English pipeline from the installed `en_core_web_sm` package.
# `highlight_and_annotate_spacy_entities` reads this module-level `nlp` object.
nlp = en_core_web_sm.load()

In [41]:
# Highlight colour per spaCy entity label.  Each value is a dict with a
# "stroke" entry holding an RGB triple with components in the range 0..1.
# Every label has its own colour so that two entity types never look the same.
color_map = {
    "PERSON": {"stroke": [1, 0, 0]},             # red
    "NORP": {"stroke": [0, 1, 0]},               # green
    "FAC": {"stroke": [0, 0, 1]},                # blue
    "ORG": {"stroke": [1, 0.5, 0]},              # orange
    "GPE": {"stroke": [0.5, 0, 1]},              # purple
    "LOC": {"stroke": [0, 1, 1]},                # cyan
    "PRODUCT": {"stroke": [1, 0, 1]},            # magenta
    "EVENT": {"stroke": [0.5, 0.5, 0]},          # olive
    "WORK_OF_ART": {"stroke": [0.3, 0.7, 0.3]},  # soft green
    "LAW": {"stroke": [0.7, 0.3, 0.3]},          # soft red
    "LANGUAGE": {"stroke": [0.3, 0.3, 0.7]},     # soft blue
    "DATE": {"stroke": [0.7, 0.7, 0.3]},         # soft yellow
    "TIME": {"stroke": [0.3, 0.7, 0.7]},         # soft cyan
    "PERCENT": {"stroke": [0.7, 0.3, 0.7]},      # soft magenta
    "MONEY": {"stroke": [0, 0.5, 0.5]},          # teal
    "QUANTITY": {"stroke": [0.5, 0.5, 0.5]},     # gray
    "ORDINAL": {"stroke": [0.6, 0.3, 0]},        # brown (was olive, identical to EVENT)
    "CARDINAL": {"stroke": [0, 0.5, 0]},         # dark green
}

In [61]:
def highlight_and_annotate_spacy_entities(page, color_map, nlp_model=None):
    """Highlight the spaCy named entities found on a PDF page.

    The page's plain text is run through a spaCy pipeline.  Each distinct
    entity string is then searched on the page exactly once, and every hit
    receives one highlight annotation coloured by the entity's label.

    Args:
        page: PyMuPDF page to annotate; annotations are added to it in place.
        color_map: Mapping from entity label to the colour dict passed to
            ``annot.set_colors``, e.g. ``{"PERSON": {"stroke": [1, 0, 0]}}``.
            Labels missing from the mapping are drawn in mid-gray.
        nlp_model: Callable that turns the page text into an object exposing
            ``.ents``.  Defaults to the module-level ``nlp`` pipeline.

    Returns:
        None.

    Note:
        ``page.search_for`` ignores upper/lower case and is not restricted to
        whole words, so a short entity may also be highlighted where it occurs
        inside a longer word.  When the same string is tagged with different
        labels, the colour of its first occurrence in ``doc.ents`` is used.
    """
    if nlp_model is None:
        # Fall back to the pipeline loaded at notebook level.
        nlp_model = nlp

    text = page.get_text("text")
    doc_spacy = nlp_model(text)

    fallback_color = {"stroke": [0.5, 0.5, 0.5]}
    searched = set()

    for entity in doc_spacy.ents:
        # Collapse runs of whitespace (including line breaks) so an entity
        # wrapped across lines is searched, and de-duplicated, as one phrase.
        needle = " ".join(entity.text.split())
        key = needle.casefold()

        # search_for already returns *every* occurrence on the page, so a
        # string must only be searched once; repeating the search for each
        # mention would stack identical highlights on top of each other.
        if not needle or key in searched:
            continue
        searched.add(key)

        color = color_map.get(entity.label_, fallback_color)

        # quads=True yields Quad objects, which also follow non-horizontal text.
        for quad in page.search_for(needle, quads=True):
            annot = page.add_highlight_annot(quad)
            annot.set_colors(color)
            annot.update()

In [62]:
# Highlight the named entities on one page of the sample PDF and save an
# annotated copy.  The spaCy pipeline `nlp` is the one created by the earlier
# `en_core_web_sm.load()` cell, so the same model is not loaded a second time.
# The `with` block closes the document even if highlighting or saving fails.
with fitz.open("Data/timemach.pdf") as doc:
    page = doc[4]  # zero-based index: the fifth page of the document

    highlight_and_annotate_spacy_entities(page, color_map)

    doc.save("highlighted.pdf")