In [1]:
import fitz
import pats
from unidecode import unidecode
import sys, os, re
from pprint import pprint


In [2]:

def parse(pdf: fitz.Document) -> (str, dict):
    """Flatten a PDF into transliterated text plus a per-character position index.

    Every glyph reported by the 'rawdict' text extraction is passed through
    unidecode and appended to the output text. Each visual line is terminated
    by a newline, except lines whose last glyph is '-': there the hyphen is
    dropped so the split word joins its continuation on the next line, and the
    nearest preceding space (within the last 49 collected chunks) is turned
    into the line break instead.

    Args:
        pdf: Document to read. Only pdf.pages(), page.get_text() and
            page.number are used.

    Returns:
        A (text, index) pair. `text` is the joined string. `index` maps an
        offset into `text` to (page_number, x0, y0, x1, y1, span): the bbox of
        the source glyph and the rawdict span it belongs to. The newline that
        terminates a line carries the entry of the last glyph on that line.
    """
    s, idx, d = [], 0, {}
    for page in pdf.pages():
        tdict = page.get_text(
            'rawdict',
            flags=fitz.TEXTFLAGS_RAWDICT & ~fitz.TEXT_PRESERVE_IMAGES
        )
        for block in tdict['blocks']:
            for line in block['lines']:
                prev_span = prev_char = None
                for span in line['spans']:
                    for char in span['chars']:
                        info = (page.number, *char['bbox'], span)
                        uc = unidecode(char['c'])
                        # One glyph may transliterate to several characters
                        # (or to none). Index every produced offset so that
                        # callers can look up any position of the text.
                        for k in range(len(uc)):
                            d[idx + k] = info
                        s.append(uc)
                        idx += len(uc)
                        prev_span, prev_char = span, char
                if prev_char is None:
                    # Line without any glyph: nothing to terminate or index.
                    continue
                if prev_char['c'] == '-':
                    # Look back over at most 49 chunks, but never past the
                    # beginning of what has been collected so far.
                    for i in range(-1, -min(50, len(s) + 1), -1):
                        if s[i] == ' ':
                            s[i] = '\n'
                            break
                    # Drop the trailing hyphen together with its index entry.
                    s.pop()
                    idx -= 1
                    d.pop(idx, None)
                else:
                    d[idx] = (page.number, *prev_char['bbox'], prev_span)
                    s.append('\n')
                    idx += 1
    return ''.join(s), d


In [4]:
# Open the document under test and flatten it into text plus the
# per-offset position index returned by parse().
#pdf = fitz.open('apa-nolinks.pdf')
pdf = fitz.open('apa-std-1.pdf')
pdf_text, char_info = parse(pdf)

# Keep a copy of the extracted text on disk for manual inspection.
# Anything UTF-8 cannot encode is silently dropped.
encoded_text = pdf_text.encode('utf-8', errors='ignore')
with open('out.txt', 'wb') as f:
    f.write(encoded_text)

In [4]:
# NOTE(review): the input opened earlier is named 'apa-std-1.pdf' -- confirm
# that 'ieee' is the intended pattern set for it.
pat = pats.patterns['ieee']


def _line_segments(char_info, start, end):
    """Group the characters of text[start:end] into one box per visual line.

    Returns a list of [page, x0, y0, x1, y1] lists. Consecutive characters
    that share the same page and the same top coordinate (y0) extend the
    current box to the right; any other character starts a new box. Offsets
    that have no entry in char_info are skipped.
    """
    segments = []
    current_line = None
    for i in range(start, end):
        info = char_info.get(i)
        if info is None:
            continue
        pg, x0, y0, x1, y1 = info[0:5]
        if (pg, y0) == current_line:
            segments[-1][3] = x1
        else:
            segments.append([pg, x0, y0, x1, y1])
            current_line = (pg, y0)
    return segments


def _mark_citation(pdf, segments, target):
    """Highlight every segment and link it to target when one is known.

    target is a (page, x, y) tuple, or None for a citation whose entry was not
    found in the reference list; such citations get a red highlight instead of
    a link. The test is `is None` on purpose: page number 0 is a valid target.
    """
    for seg in segments:
        page = pdf[seg[0]]
        h = page.add_highlight_annot(seg[1:5])
        if target is None:
            h.set_colors({'stroke': (1,0,0), 'fill': None})
            h.update()
        else:
            d_pg, d_x, d_y = target
            page.insert_link({
                'kind': fitz.LINK_GOTO,
                'from': fitz.Rect(*seg[1:5]),
                'page': d_pg,
                'to': fitz.Point(d_x, d_y),
            })


# Find reference list starting point
ref_list_matches = list(re.finditer(pats.start_of_ref_list, pdf_text, re.M))
assert len(ref_list_matches) == 1, (
    f'expected exactly one reference-list heading, found {len(ref_list_matches)}'
)
m_ref_list = ref_list_matches[0]

# Find all reference destinations. Only the text after the heading is
# searched, so match offsets are shifted back into pdf_text coordinates.
ref_list_end = m_ref_list.end()
dst = {}
for m_dst in re.finditer(pat['dst'], pdf_text[ref_list_end:], re.M | re.S):
    pg, x0, y0, *_ = char_info[m_dst.start() + ref_list_end]
    key = m_dst.groups()
    if key in dst:
        print(f'Warning: duplicate reference key {key}')
    else:
        dst[key] = (pg, x0, y0)

# Create links for the alternative citation form, when the pattern set has one
if pat['alt-ref']:
    for m in re.finditer(pat['alt-ref'], pdf_text, re.M | re.S):
        start, end = m.span()
        _mark_citation(pdf, _line_segments(char_info, start, end), dst.get(m.groups()))

# Create links for references. 'refs' matches a whole citation group and 'ref'
# each key inside it, so inner offsets are relative to the group start.
for m_refs in re.finditer(pat['refs'], pdf_text, re.M | re.S):
    group_start = m_refs.start()
    for m in re.finditer(pat['ref'], m_refs.group()):
        start, end = m.start() + group_start, m.end() + group_start
        _mark_citation(pdf, _line_segments(char_info, start, end), dst.get(m.groups()))

# NOTE: annotations are added to the in-memory `pdf`; running this cell again
# without reopening the document adds them a second time.
pdf.save('out.pdf')

In [5]:
# Debug aid: pretty-print the 'dict' text extraction of the page at index 1
# (the second page) so its block/line/span layout can be inspected by hand.
pg_num = pdf[1]  # despite the name, this holds the page object, not a number
tdict = pg_num.get_text('dict', flags=fitz.TEXTFLAGS_DICT & ~fitz.TEXT_PRESERVE_IMAGES)
# The dump contains the page's raw (non-transliterated) text, so write it as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open('out.py', 'w', encoding='utf-8') as f:
    pprint(tdict, f)

In [2]:
# Smoke test for the APA in-text citation pattern: print the captured groups
# of every citation found in a short sample of extracted text.
s = """
For example, Li et al. (2022b) provide a de?nition by decomposing
the adaptation cost into sample-ef?ciency and parameter-ef?ciency.
"""

alt_ref_re = re.compile(pats.apa['alt-ref'], re.MULTILINE | re.DOTALL)
for m in alt_ref_re.finditer(s):
    print(m.groups())

('Li', None, '2022b')


In [3]:
# Smoke test for the APA reference-list entry pattern: print the captured
# groups of every entry found in a sample reference list. The sample includes
# entries that wrap across lines and one that is hyphenated at a line break.
s = """
L?uddecke, T. and Ecker, A. (2022). Image segmentation using text and image prompts. In CVPR.
van den Oord, A., Vinyals, O., and Kavukcuoglu, K. (2017). Neural discrete representation learning.
In NeurIPS.
Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., and Bengio, Y. (2015).
Show, attend and tell: Neural image caption generation with visual attention. In ICML.
Chen, T. and Luo, J. (2020). Expressing objects just like words: Recurrent visual embedding for
image-text matching. In AAAI.
Chen, Y.-C., Li, L., Yu, L., El Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., and Liu, J. (2020d).
UNITER: Universal image-text representation learning. In ECCV.
Lin, C.-Y. (2004). Rouge: A package for automatic evaluation of summaries. In Text summarization
branches out.
Abu-El-Haija, S., Kothari, N., Lee, J., Natsev, P., Toderici, G., Varadarajan, B., and Vijaya-
narasimhan, S. (2016). Youtube-8m: A large-scale video classiﬁcation benchmark. arXiv preprint
arXiv:1609.08675.
Agarwal, V., Shetty, R., and Fritz, M. (2020). Towards causal vqa: Revealing and reducing spurious
correlations by invariant and covariant semantic editing. In CVPR.
Karpathy, A. and Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descrip-
tions. In CVPR.
"""

#print(pats.apa['dst'])

dst_re = re.compile(pats.apa['dst'], re.MULTILINE | re.DOTALL)
for m in dst_re.finditer(s):
    print(m.groups())


('L?uddecke', 'Ecker', '2022')
('van den Oord', None, '2017')
('Xu', None, '2015')
('Chen', 'Luo', '2020')
('Chen', None, '2020d')
('Lin', None, '2004')
('Abu-El-Haija', None, '2016')
('Agarwal', None, '2020')
('Karpathy', 'Fei-Fei', '2015')
