# Citation Extractor

Given a PDF file and an extract of text from that file, fetch all the papers cited in the extract.

In [1]:
import fitz  # PyMuPDF
import re
from rapidfuzz import fuzz
from tqdm import tqdm
from time import sleep
import json
import requests

In [2]:
# Location of the input PDF on the local machine.
datadir = "/home/surya/NEU/CS5100 FAI/Project/pdfreader/"
file = f"{datadir}test.pdf"
doc = fitz.open(file)

In [3]:
# Sample extract with numeric citations.
# NOTE: the next cell assigns a different text to `extract` before it is used.
extract = """
This is the same objective optimized in prior works [49, 38, 1, 26] using the DPO-equivalent reward
for the reward class of rφ . In this setting, we can interpret the normalization term in f (rφ, πref , β)
as the soft value function of the reference policy πref . While this term does not affect the optimal
solution, without it, the policy gradient of the objective could have high variance, making learning
unstable. We can accommodate for the normalization term using a learned value function, but that
can also be difficult to optimize. Alternatively, prior works have normalized rewards using a human
completion baseline, essentially a single sample Monte-Carlo estimate of the normalizing term. In
contrast the DPO reparameterization yields a reward function that does not require any baselines.
""".strip()

In [4]:
# Extract used by the rest of the notebook; surrounding blank lines are stripped.
extract = """
Visual object tracking studies the joint spatial-temporal localization of objects in videos. From a
video and a predefined taxonomy, multiple object tracking (MOT) models simultaneously detect,
recognize, and track multiple objects. For example, MOT [54] tracks humans, KITTI [25, 50] tracks
pedestrians and cars, and TAO [15] tracks a large taxonomy of 833 categories. In contrast, single
object tracking (SOT) follows a single object via a provided initial template of the object, without
any detection or recognition. Thus, SOT is often taxonomy-free and operates on generic objects. The
community has constructed multiple popular benchmarks to study this problem, including OTB [76],
UAV [56], NfS [36], TC-128 [45], NUS-PRO [41], GOT-10k [32], VOT [38], and TrackingNet [57].
""".strip()

## Find the extract text in the document

In [5]:
# Minimum fuzz.partial_ratio score a text block must reach to count as a match for the extract.
THRESHOLD = 92

In [6]:
matches = []

for page_num in tqdm(range(len(doc)), leave=False):
    page = doc.load_page(page_num)  # load the current page
    # Text blocks of the page; the block text is stored at index 4 of each tuple.
    text_blocks = page.get_text("blocks")
    for block in text_blocks:
        text = block[4]

        # Fuzzy score of the extract against this block's text.
        match_score = fuzz.partial_ratio(extract, text)

        if match_score >= THRESHOLD:
            matches.append((block, page_num, match_score))

# Best-scoring blocks first.
matches.sort(key=lambda m: m[2], reverse=True)

                                                                                                                                                         

Remove matches that are too small

In [7]:
# Matched blocks whose text is not longer than this many characters are discarded.
MINTEXTLEN = 10

In [8]:
# Drop matches whose block text (index 4 of the block tuple) is MINTEXTLEN characters or shorter.
matches = [m for m in matches if len(m[0][4]) > MINTEXTLEN]

In [9]:
matches

[((107.64099884033203,
   335.8719482421875,
   505.5771789550781,
   438.9045104980469,
   '2.1\nVisual object tracking datasets\nVisual object tracking studies the joint spatial-temporal localization of objects in videos. From a\nvideo and a predefined taxonomy, multiple object tracking (MOT) models simultaneously detect,\nrecognize, and track multiple objects. For example, MOT [54] tracks humans, KITTI [25, 50] tracks\npedestrians and cars, and TAO [15] tracks a large taxonomy of 833 categories. In contrast, single\nobject tracking (SOT) follows a single object via a provided initial template of the object, without\nany detection or recognition. Thus, SOT is often taxonomy-free and operates on generic objects. The\ncommunity has constructed multiple popular benchmarks to study this problem, including OTB [76],\nUAV [56], NfS [36], TC-128 [45], NUS-PRO [41], GOT-10k [32], VOT [38], and TrackingNet [57].\n',
   7,
   0),
  2,
  100.0)]

## Get all citation numbers in the text and the corresponding links

In [10]:
def get_matching_links(matched_block, matched_page, document=None):
    """Collect the named-destination links that overlap a matched text block.

    Parameters
    ----------
    matched_block : sequence
        Text block whose first four items are its bounding-box coordinates.
    matched_page : int
        Index of the page on which the block was found.
    document : optional
        Document to read the links from. Defaults to the notebook-level ``doc``,
        which keeps existing two-argument calls working unchanged.

    Returns
    -------
    list of dict
        The link dictionaries returned by ``get_links()`` whose kind is
        ``fitz.LINK_NAMED`` and whose ``'from'`` rectangle intersects the block.
        Each returned dictionary gets an extra ``'from_page'`` key holding
        ``matched_page``.
    """
    if document is None:
        document = doc

    # get the matched region
    matched_bbox = fitz.Rect(matched_block[:4])

    # get the citation links
    matched_links = []

    for link in document[matched_page].get_links():
        # fitz.LINK_NAMED has the value 4: a link that points at a named destination.
        if link['kind'] != fitz.LINK_NAMED:
            continue

        if matched_bbox.intersects(link['from']):
            # remember where the link was found; later cells read the link text from that page
            link['from_page'] = matched_page
            matched_links.append(link)

    return matched_links

In [11]:
# Gather the links of every matched block, in match order.
matched_links = [
    link
    for match in matches
    for link in get_matching_links(match[0], match[1])
]

In [12]:
matched_links

[{'kind': 4,
  'xref': 71,
  'from': Rect(339.79998779296875, 373.8089904785156, 351.7550048828125, 382.6549987792969),
  'page': 11,
  'to': Point(108.0, 283.496),
  'zoom': 0.0,
  'nameddest': 'cite.milan2016mot16',
  'id': '',
  'from_page': 2},
 {'kind': 4,
  'xref': 72,
  'from': Rect(448.7869873046875, 373.8089904785156, 460.74200439453125, 382.6549987792969),
  'page': 10,
  'to': Point(108.0, 546.128),
  'zoom': 0.0,
  'nameddest': 'cite.Geiger2012CVPR',
  'id': '',
  'from_page': 2},
 {'kind': 4,
  'xref': 73,
  'from': Rect(463.6969909667969, 373.8089904785156, 475.65301513671875, 382.6549987792969),
  'page': 11,
  'to': Point(108.0, 411.277),
  'zoom': 0.0,
  'nameddest': 'cite.Luiten2020IJCV',
  'id': '',
  'from_page': 2},
 {'kind': 4,
  'xref': 74,
  'from': Rect(237.3040008544922, 384.7179870605469, 249.25900268554688, 393.56500244140625),
  'page': 9,
  'to': Point(108.0, 194.336),
  'zoom': 0.0,
  'nameddest': 'cite.dave2020tao',
  'id': '',
  'from_page': 2},
 {'kind

Get the citation number for each link.

Here we also filter out the citation links that are not part of the original extract.

In [13]:
# Citation numbers written inside square brackets in the extract,
# e.g. "[25, 50]" contributes "25" and "50".
extract_citation_nums = {
    num
    for group in re.findall(r'\[([^\[\]]*)\]', extract)
    for num in re.findall(r'\d+', group)
}
if not extract_citation_nums:
    # No bracketed citations found: fall back to every whole number in the extract.
    extract_citation_nums = set(re.findall(r'\d+', extract))

matched_links_filtered = []

for link in matched_links:
    # keep only citations, not equations and figures
    if not (link.get('nameddest') or '').startswith('cite.'):
        continue

    # text under the link rectangle, e.g. "[54]", "[25," or "50]"
    link_text = doc[link['from_page']].get_text('text', clip=link['from'])
    print(link)
    print(link_text)
    found_numbers = re.findall(r'\d+', link_text)

    if not found_numbers:
        continue

    citation_num = found_numbers[0]

    # Compare whole numbers: a substring test on the raw extract would accept
    # "3" merely because the extract contains "833".
    if citation_num not in extract_citation_nums:
        continue

    link['citation_number'] = citation_num
    matched_links_filtered.append(link)

{'kind': 4, 'xref': 71, 'from': Rect(339.79998779296875, 373.8089904785156, 351.7550048828125, 382.6549987792969), 'page': 11, 'to': Point(108.0, 283.496), 'zoom': 0.0, 'nameddest': 'cite.milan2016mot16', 'id': '', 'from_page': 2}
[54]

{'kind': 4, 'xref': 72, 'from': Rect(448.7869873046875, 373.8089904785156, 460.74200439453125, 382.6549987792969), 'page': 10, 'to': Point(108.0, 546.128), 'zoom': 0.0, 'nameddest': 'cite.Geiger2012CVPR', 'id': '', 'from_page': 2}
[25,

{'kind': 4, 'xref': 73, 'from': Rect(463.6969909667969, 373.8089904785156, 475.65301513671875, 382.6549987792969), 'page': 11, 'to': Point(108.0, 411.277), 'zoom': 0.0, 'nameddest': 'cite.Luiten2020IJCV', 'id': '', 'from_page': 2}
50]

{'kind': 4, 'xref': 74, 'from': Rect(237.3040008544922, 384.7179870605469, 249.25900268554688, 393.56500244140625), 'page': 9, 'to': Point(108.0, 194.336), 'zoom': 0.0, 'nameddest': 'cite.dave2020tao', 'id': '', 'from_page': 2}
[15]

{'kind': 4, 'xref': 75, 'from': Rect(488.5539855957031, 

In [14]:
[m['citation_number'] for m in matched_links_filtered]

['54', '25', '50', '15', '76', '56', '36', '45', '41', '32', '38', '57']

## Get the references for these citations

In [15]:
matched_references = []

for link in matched_links_filtered:
    target_blocks = doc.load_page(link['page']).get_text("blocks")

    # Whole-number pattern for this link's citation number.
    number_re = re.compile(r'\b' + link['citation_number'] + r'\b')

    # The citation number is expected in the initial section of the reference,
    # so only the first 15 characters of each block are searched.
    matched_references.extend(
        block[4].strip()
        for block in target_blocks
        if number_re.search(block[4][:15])
    )

In [16]:
# Put every reference on a single line by turning its newlines into spaces.
matched_references = [ref.replace('\n', ' ') for ref in matched_references]
matched_references

['[54] Anton Milan, Laura Leal-Taixé, Ian Reid, Stefan Roth, and Konrad Schindler. Mot16: A benchmark for multi-object tracking. arXiv preprint arXiv:1603.00831, 2016. 3',
 '[25] Andreas Geiger, Philip Lenz, and Raquel Urtasun. Are we ready for autonomous driving? the kitti vision benchmark suite. In Conference on Computer Vision and Pattern Recognition (CVPR), 2012. 3',
 '[50] Jonathon Luiten, Aljosa Osep, Patrick Dendorfer, Philip Torr, Andreas Geiger, Laura Leal-Taixe, and Bastian Leibe. Hota: A higher order metric for evaluating multi-object tracking. International Journal of Computer Vision (IJCV), 2020. 3',
 '[15] Achal Dave, Tarasha Khurana, Pavel Tokmakov, Cordelia Schmid, and Deva Ramanan. Tao: A large-scale benchmark for tracking any object. In European conference on computer vision, pages 436–454. Springer, 2020. 3, 4',
 '[76] Yi Wu, Jongwoo Lim, and Ming-Hsuan Yang. Online object tracking: A benchmark. In Proceedings of the IEEE conference on computer vision and pattern rec

## Format the references

Extract clean attributes from the references. This will make the searches more reliable and accurate.

Some references:

https://anystyle.io/   - Written in ruby, present as cli and web api.

https://pypi.org/project/refextract/

### anystyle.io

To avoid setting up Ruby and its libraries on the host, I set up my own simple Ruby server locally in Docker, with some simple Sinatra code.

The following section would work once the container is running.

In [17]:
def get_title_from_reftext(reftext, min_title_len=15,
                           endpoint='http://localhost:4567/parse', timeout=30):
    """Extract the title from a raw reference string using the local parsing server.

    Parameters
    ----------
    reftext : str
        Full text of one bibliography entry.
    min_title_len : int, optional
        Minimum accepted length of the extracted title, in characters.
    endpoint : str, optional
        URL the reference text is POSTed to as UTF-8 ``text/plain``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    str
        The title of the first parsed reference.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    AssertionError
        If the extracted title is shorter than ``min_title_len``.
    """
    response = requests.post(
        endpoint,
        headers={"Content-Type": "text/plain"},
        data=reftext.encode("utf-8"),
        timeout=timeout,
    )
    # Fail on HTTP errors instead of trying to decode an error page as JSON.
    response.raise_for_status()
    parsed_data = response.json()

    # assumes the server returns 'title' as a list of string fragments — TODO confirm
    title = ' '.join(parsed_data[0]['title'])

    assert len(title) >= min_title_len, (
        f"parsed title {title!r} is shorter than {min_title_len} characters"
    )

    return title

In [18]:
# One parsed title per matched reference, in the same order.
matched_references_title = [
    get_title_from_reftext(reftext) for reftext in tqdm(matched_references)
]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:03<00:00,  3.26it/s]


In [19]:
matched_references_title

['Mot16: A benchmark for multi-object tracking',
 'Are we ready for autonomous driving? the kitti vision benchmark suite',
 'Hota: A higher order metric for evaluating multi-object tracking',
 'Tao: A large-scale benchmark for tracking any object',
 'Online object tracking: A benchmark',
 'A benchmark and simulator for uav tracking',
 'Need for speed: A benchmark for higher frame rate object tracking',
 'Encoding color information for visual tracking: Algorithms and benchmark',
 'Nus-pro: A new visual tracking challenge',
 'Got-10k: A large high-diversity benchmark for generic object tracking in the wild',
 'A novel performance evaluation methodology for single-target trackers',
 'Trackingnet: A large-scale dataset and benchmark for object tracking in the wild']

## Get the metadata of these references

We will use external services to query for these reference texts and get the relevant metadata.

### Observations

* using scholarly (which uses Google Scholar) posed a lot of networking challenges but worked well, particularly in directly getting the PDF.
* using habanero works well for a lot of cases, but fails for a lot of arxiv papers
* In a lot of the services, using the wrong year (seems to be common with arxiv - conference mismatches) completely messes up the results
* semantic scholar works well, but sometimes can't show pdfs, especially when there is an arxiv paper. I guess pre-prints are not exactly the open-access version of the published paper. But for our purposes it should be good enough.

### Semantic scholar

I've requested the API key

In [20]:
import os

from semanticscholar import SemanticScholar

# Read the API key from the S2_API_KEY environment variable so that the secret
# is not stored in the notebook. If the variable is unset, None is passed as the key.
s2_api_key = os.environ.get('S2_API_KEY')
sch = SemanticScholar(api_key=s2_api_key, timeout=3)

In [21]:
# For each parsed title, keep the raw metadata of the top search hit.
matched_references_meta = [
    sch.search_paper(
        title,
        limit=1,
        fields=['title', 'paperId', 'externalIds', 'openAccessPdf'],
    )[0].raw_data
    for title in tqdm(matched_references_title)
]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:13<00:00,  6.11s/it]


In [22]:
[m['title'] for m in matched_references_meta]

['MOT16: A Benchmark for Multi-Object Tracking',
 'Are we ready for autonomous driving? The KITTI vision benchmark suite',
 'HOTA: A Higher Order Metric for Evaluating Multi-object Tracking',
 'TAO: A Large-Scale Benchmark for Tracking Any Object',
 'Online Object Tracking: A Benchmark',
 'A Benchmark and Simulator for UAV Tracking',
 'Need for Speed: A Benchmark for Higher Frame Rate Object Tracking',
 'Encoding color information for visual tracking: Algorithms and benchmark',
 'NUS-PRO: A New Visual Tracking Challenge',
 'GOT-10k: A Large High-Diversity Benchmark for Generic Object Tracking in the Wild',
 'A Novel Performance Evaluation Methodology for Single-Target Trackers',
 'TrackingNet: A Large-Scale Dataset and Benchmark for Object Tracking in the Wild']

In [23]:
# Parsed titles again — presumably shown here for comparison with the Semantic Scholar titles above.
matched_references_title

['Mot16: A benchmark for multi-object tracking',
 'Are we ready for autonomous driving? the kitti vision benchmark suite',
 'Hota: A higher order metric for evaluating multi-object tracking',
 'Tao: A large-scale benchmark for tracking any object',
 'Online object tracking: A benchmark',
 'A benchmark and simulator for uav tracking',
 'Need for speed: A benchmark for higher frame rate object tracking',
 'Encoding color information for visual tracking: Algorithms and benchmark',
 'Nus-pro: A new visual tracking challenge',
 'Got-10k: A large high-diversity benchmark for generic object tracking in the wild',
 'A novel performance evaluation methodology for single-target trackers',
 'Trackingnet: A large-scale dataset and benchmark for object tracking in the wild']

## Access the PDFs from metadata

### Observations

* When a DOI is present, Open Access Button is a good API for getting the PDF URL from the DOI. However, it is not perfect.
* Open access pdf search is integrated directly into semantic scholar. This sometimes gets the pdf. If it is an arxiv paper, we can use the arxiv id to get the pdfs directly.

In [24]:
for meta in matched_references_meta:
    # Either field may be missing or None; treat both cases as "no information".
    open_access = meta.get('openAccessPdf') or {}
    external_ids = meta.get('externalIds') or {}

    if open_access.get('url'):
        # open-access link that came with the metadata (an empty URL counts as absent)
        meta['pdf_url'] = open_access['url']
    elif 'ArXiv' in external_ids:
        # otherwise build the PDF link from the arXiv id
        meta['pdf_url'] = f"https://arxiv.org/pdf/{external_ids['ArXiv']}.pdf"
    else:
        meta['pdf_url'] = None

In [25]:
[m['pdf_url'] for m in matched_references_meta]

['https://arxiv.org/pdf/1603.00831.pdf',
 None,
 'https://link.springer.com/content/pdf/10.1007/s11263-020-01375-2.pdf',
 'https://arxiv.org/pdf/2005.10356',
 'http://faculty.ucmerced.edu/mhyang/papers/cvpr13_benchmark.pdf',
 None,
 'https://arxiv.org/pdf/1703.05884',
 None,
 None,
 'https://arxiv.org/pdf/1810.11981.pdf',
 'http://pure-oai.bham.ac.uk/ws/files/26149101/performance_evaluation_methodology.pdf',
 'https://arxiv.org/pdf/1803.10794']

## Download the PDFs

In [26]:
# Directory into which the PDFs of the cited papers are downloaded (one <paperId>.pdf per paper).
paperdir = '/home/surya/NEU/CS5100 FAI/Project/pdfreader/python/papers'

In [27]:
from pathlib import Path

# Make sure the download directory exists before writing into it.
paper_dir = Path(paperdir)
paper_dir.mkdir(parents=True, exist_ok=True)

for meta in tqdm(matched_references_meta):
    pdf_url = meta['pdf_url']

    # nothing to download for this paper
    if not pdf_url:
        continue

    # Named `target` rather than `file`, so the input PDF path held in `file`
    # is not overwritten by this cell.
    target = paper_dir / f"{meta['paperId']}.pdf"

    # download
    try:
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # best effort: report the failure and carry on with the remaining papers
        print(f"Skipping {pdf_url}: {err}")
        continue

    # A URL may answer with something other than the paper (e.g. an HTML page);
    # only save responses that carry the %PDF marker near the start.
    if b'%PDF' not in response.content[:1024]:
        print(f"Skipping {pdf_url}: response is not a PDF")
        continue

    target.write_bytes(response.content)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:05<00:00,  5.43s/it]


# Problems

Selection across paragraphs, pages.

Paragraphs broken by images and tables.

~~Above problems require using multiple block matches, right now only using the top match.~~

Make it work for name-based (author–year) citations.

Make it work for 2-column references?