# Citation Extractor

Given a pdf file, an extract of text in the file, fetch all the papers cited in the extract.

In [1]:
import fitz  # PyMuPDF
import re
from rapidfuzz import fuzz
from tqdm import tqdm
from time import sleep
import json
import requests

In [2]:
datadir = "/home/surya/NEU/CS5100 FAI/Project/pdfreader/"
file = datadir + "test3.pdf"
doc = fitz.open(file)

In [3]:
extract = """
This is the same objective optimized in prior works [49, 38, 1, 26] using the DPO-equivalent reward
for the reward class of rφ . In this setting, we can interpret the normalization term in f (rφ, πref , β)
as the soft value function of the reference policy πref . While this term does not affect the optimal
solution, without it, the policy gradient of the objective could have high variance, making learning
unstable. We can accommodate for the normalization term using a learned value function, but that
can also be difficult to optimize. Alternatively, prior works have normalized rewards using a human
completion baseline, essentially a single sample Monte-Carlo estimate of the normalizing term. In
contrast the DPO reparameterization yields a reward function that does not require any baselines.
"""

extract = extract.strip()

In [4]:
extract = """
We outperform state-of-the-arts in multiple datasets, including our novel
MPHOI-72 dataset, the single-human HOI CAD-120 [24] dataset, and the two-
Multi-person Human-object Interaction Recognition
 3
hand Bimanual Actions [9] dataset. We also extensively evaluate core compo-
nents of 2G-GCN in ablation studies. Our main contributions are as follows:
– We propose a novel geometry-informed 2G-GCN network for HOI recog-
nition in videos. The network consists of a two-level graph structure that
models geometric features between human and object, together with the
corresponding visual features.
– We present the novel problem of MPHOI in videos with a new MPHOI-72
dataset, showcasing new challenges that cannot be directly resolved by ex-
isting methods. The source code and dataset are made public1.
– We outperform state-of-the-art HOI recognition networks in our MPHOI-72
dataset, the CAD-120 [24] dataset and the Bimanual Actions [9] dataset.
"""

extract = extract.strip()

## Find the extract text in the document

In [5]:
THRESHOLD = 92

In [6]:
matches = []

for page_num in tqdm(range(len(doc)), leave=False):
    page = doc.load_page(page_num)  # load the current page
    text_blocks = page.get_text_blocks()  # get a list of links on the current page
    for block in text_blocks:
        text = block[4]

        match_score = fuzz.partial_ratio(extract, text)

        if match_score >= THRESHOLD:
            matches.append((block, page_num, match_score))

matches = sorted(matches, key=lambda x: x[2], reverse=True)

                                                                                                                                                         

Remove matches that are too small

In [7]:
MINTEXTLEN = 10

In [8]:
matches = filter(lambda x: len(x[0][4]) > MINTEXTLEN, matches)
matches = list(matches)

In [9]:
matches

[((134.7650146484375,
   118.32608795166016,
   480.59674072265625,
   140.2436981201172,
   'hand Bimanual Actions [9] dataset. We also extensively evaluate core compo-\nnents of 2G-GCN in ablation studies. Our main contributions are as follows:\n',
   1,
   0),
  2,
  100.0),
 ((140.9910125732422,
   154.87307739257812,
   480.6162414550781,
   200.7017059326172,
   '– We propose a novel geometry-informed 2G-GCN network for HOI recog-\nnition in videos. The network consists of a two-level graph structure that\nmodels geometric features between human and object, together with the\ncorresponding visual features.\n',
   2,
   0),
  2,
  100.0),
 ((140.9910125732422,
   203.22909545898438,
   480.6263122558594,
   237.1017303466797,
   '– We present the novel problem of MPHOI in videos with a new MPHOI-72\ndataset, showcasing new challenges that cannot be directly resolved by ex-\nisting methods. The source code and dataset are made public1.\n',
   3,
   0),
  2,
  100.0),
 ((140.9909973

## Get all citation numbers in the text and the corresponding links

In [10]:
def get_matching_links(matched_block, matched_page):
    # get the matched region
    matched_bbox = fitz.Rect(matched_block[:4])

    # get the citation links
    matched_links = []
    
    for link in doc[matched_page].get_links():
        if link['kind'] == 4:   # internal links
            link_bbox = link['from']
            if matched_bbox.intersects(link_bbox):
                link['from_page'] = matched_page
                matched_links.append(link)

    return matched_links

In [11]:
matched_links = []

for match in matches:
    matched_links.extend(get_matching_links(match[0], match[1]))

In [12]:
matched_links

[{'kind': 4,
  'xref': 190,
  'from': Rect(245.1490020751953, 118.38201904296875, 252.1230010986328, 126.79498291015625),
  'page': 14,
  'to': Point(134.765, 349.735),
  'zoom': 0.0,
  'nameddest': 'cite.dreher2020learning',
  'id': '',
  'from_page': 2},
 {'kind': 4,
  'xref': 191,
  'from': Rect(418.8659973144531, 225.5050048828125, 425.3280029296875, 237.54498291015625),
  'page': 2,
  'to': Point(144.727, 140.037),
  'zoom': 0.0,
  'nameddest': 'Hfootnote.1',
  'id': '',
  'from_page': 2},
 {'kind': 4,
  'xref': 192,
  'from': Rect(252.18600463867188, 251.6400146484375, 264.1409912109375, 260.052001953125),
  'page': 15,
  'to': Point(134.765, 418.009),
  'zoom': 0.0,
  'nameddest': 'cite.koppula2013learning',
  'id': '',
  'from_page': 2},
 {'kind': 4,
  'xref': 193,
  'from': Rect(423.73699951171875, 251.6400146484375, 430.71099853515625, 260.052001953125),
  'page': 14,
  'to': Point(134.765, 349.735),
  'zoom': 0.0,
  'nameddest': 'cite.dreher2020learning',
  'id': '',
  'from

Get citation numbers for each each link.

Here we also filter out the citation links that are not part of the original extract.

In [13]:
matched_links_filtered = []

# page = doc[matched_page]
for link in matched_links:
    # keep only citations, not equations and figures
    if not link['nameddest'].startswith('cite.'):
        continue
        
    citation_num = doc[link['from_page']].get_text('text', clip=link['from'])
    print(link)
    print(citation_num)
    citation_num = re.findall(r'\d+', citation_num)

    if len(citation_num) == 0:
        continue
        
    citation_num = citation_num[0]

    if citation_num not in extract:
        continue
    
    link['citation_number'] = citation_num
    matched_links_filtered.append(link)

{'kind': 4, 'xref': 190, 'from': Rect(245.1490020751953, 118.38201904296875, 252.1230010986328, 126.79498291015625), 'page': 14, 'to': Point(134.765, 349.735), 'zoom': 0.0, 'nameddest': 'cite.dreher2020learning', 'id': '', 'from_page': 2}
[9]

{'kind': 4, 'xref': 192, 'from': Rect(252.18600463867188, 251.6400146484375, 264.1409912109375, 260.052001953125), 'page': 15, 'to': Point(134.765, 418.009), 'zoom': 0.0, 'nameddest': 'cite.koppula2013learning', 'id': '', 'from_page': 2}
[24]

{'kind': 4, 'xref': 193, 'from': Rect(423.73699951171875, 251.6400146484375, 430.71099853515625, 260.052001953125), 'page': 14, 'to': Point(134.765, 349.735), 'zoom': 0.0, 'nameddest': 'cite.dreher2020learning', 'id': '', 'from_page': 2}
[9]



Filter out references that do have a page link

In [14]:
new_filtered = []
for match in matched_links_filtered:
    if 'page' not in match:
        print("Page link not found for", match["citation_number"])
        continue
    
    new_filtered.append(match)

matched_links_filtered = new_filtered

Remove duplicates

In [15]:
unique_matches = {}

for match in matched_links_filtered:
    citation_num = match['citation_number']

    # already exist, duplicate - keep if a link has more attributes than an existing one
    if citation_num in unique_matches and len(match.keys()) < len(unique_matches[citation_num]):
        continue

    # doesn't exist
    unique_matches[citation_num] = match

matched_links_filtered = list(unique_matches.values())

In [16]:
[m['citation_number'] for m in matched_links_filtered]

['9', '24']

## Get the references for these citations

In [17]:
matched_references = []

for link in matched_links_filtered:
    linked_page = doc.load_page(link['page'])
    text_blocks = linked_page.get_text("blocks")
    citation_num = link['citation_number']
    num_pat = r'\b' + citation_num + r'\b'
    
    for text in text_blocks:
        # citation number should be present in the initial section of the reference
        # if citation_num in text[4][:15]:
        if re.search(num_pat, text[4][:15]):
            matched_references.append(text[4].strip())

In [18]:
matched_references = list(map(lambda x: x.replace('\n', ' '), matched_references))
matched_references

['9. Dreher, C.R., W¨ achter, M., Asfour, T.: Learning object-action relations from bi- manual human demonstration using graph networks. IEEE Robotics and Automa- tion Letters 5(1), 187–194 (2020)',
 '24. Koppula, H.S., Gupta, R., Saxena, A.: Learning human activities and object affor- dances from rgb-d videos. The International Journal of Robotics Research 32(8), 951–970 (2013)']

## Format the references

Extract clean attributes from the references. This will make the searches more reliable and accurate.

Some references:

https://anystyle.io/   - Written in ruby, present as cli and web api.

https://pypi.org/project/refextract/

### anystyle.io

To avoid setting up ruby and using the libraries. I had to setup my own simple ruby server locally on docker, with some simple sinatra code.

The following section would work once the container is running.

In [19]:
def get_title_from_reftext(reftext, min_title_len=15):
    reftext = reftext.encode("utf-8")
    response = requests.post('http://localhost:4567/parse', headers={"Content-Type": "text/plain"},
                        data = reftext)
    parsed_data = response.json()

    title = parsed_data[0]['title']
    title = ' '.join(title)

    # date = parsed_data[0]['date']
    # date = ' '.join(date)
    
    assert len(title) >= min_title_len

    # title = title + " " + date

    return title

In [21]:
matched_references_title = []

for reftext in tqdm(matched_references):
    title = get_title_from_reftext(reftext)
    matched_references_title.append(title)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:05<00:00,  3.83it/s]


In [22]:
matched_references_title

['Scaling egocentric vision: The epic-kitchens dataset',
 'Ego4d: Around the world in 3,000 hours of egocentric video',
 'Discovering important people and objects for egocentric video summarization',
 'Detecting activities of daily living in first-person camera views',
 'Social interactions: A first-person perspective',
 'Epic-fusion: Audio-visual temporal binding for egocentric action recognition',
 'Ego-exo: Transferring visual representa- tions from third-person to first-person videos',
 'Temporal perception and prediction in ego-centric video',
 'What makes training multi-modal classification networks hard?',
 'Large-scale weakly-supervised pre-training for video action recognition',
 'When will you do what? - anticipating temporal occurrences of activities',
 'Rolling-unrolling lstms for action anticipation from first-person video',
 'Anticipative video transformer',
 'Summarization of egocentric videos: A comprehensive survey',
 'Story-driven summarization for egocentric video',


## Get the metadata of these references

We will use external services to query for these reference texts and get the relevant metadata.

### Observations

* using scholarly (which uses google scholar) posed a lot of challenges in networking but worked well, particularly in directing getting the pdf.
* using habanero works well for a lot of cases, but fails for a lot of arxiv papers
* In a lot of the services, using the wrong year (seems to be common with arxiv - conference mismatches) completely messes up the results
* semantic scholar works well, but sometimes can't show pdfs, especially when there is an arxiv paper. I guess pre-prints are not exactly the open-access version of the published paper. But for our purposes it should be good enough.

### Semantic scholar

I've requested the API key

In [23]:
from semanticscholar import SemanticScholar
s2_api_key = 'WWxz8zHVUm6DWzkmw6ZSd3eA94kWbbX46Zl5jR11'
sch = SemanticScholar(api_key=s2_api_key, timeout=3)

In [24]:
matched_references_meta = []

for ref in tqdm(matched_references_title):
    results = sch.search_paper(ref, limit=1, 
                               fields=['title', 'paperId', 'externalIds', 'openAccessPdf'])
    meta = results[0]
    matched_references_meta.append(meta.raw_data)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:46<00:00,  2.23s/it]


In [25]:
[m['title'] for m in matched_references_meta]

['Scaling Egocentric Vision: The EPIC-KITCHENS Dataset',
 'Ego4D: Around the World in 3,000 Hours of Egocentric Video',
 'Discovering important people and objects for egocentric video summarization',
 'Detecting activities of daily living in first-person camera views',
 'Social interactions: A first-person perspective',
 'EPIC-Fusion: Audio-Visual Temporal Binding for Egocentric Action Recognition',
 'Ego-Exo: Transferring Visual Representations from Third-person to First-person Videos',
 'Temporal Perception and Prediction in Ego-Centric Video',
 'What Makes Training Multi-Modal Classification Networks Hard?',
 'Large-Scale Weakly-Supervised Pre-Training for Video Action Recognition',
 'When will you do what? - Anticipating Temporal Occurrences of Activities',
 'Rolling-Unrolling LSTMs for Action Anticipation from First-Person Video',
 'Anticipative Video Transformer',
 'Summarization of Egocentric Videos: A Comprehensive Survey',
 'Story-Driven Summarization for Egocentric Video',
 '

In [26]:
matched_references_title

['Scaling egocentric vision: The epic-kitchens dataset',
 'Ego4d: Around the world in 3,000 hours of egocentric video',
 'Discovering important people and objects for egocentric video summarization',
 'Detecting activities of daily living in first-person camera views',
 'Social interactions: A first-person perspective',
 'Epic-fusion: Audio-visual temporal binding for egocentric action recognition',
 'Ego-exo: Transferring visual representa- tions from third-person to first-person videos',
 'Temporal perception and prediction in ego-centric video',
 'What makes training multi-modal classification networks hard?',
 'Large-scale weakly-supervised pre-training for video action recognition',
 'When will you do what? - anticipating temporal occurrences of activities',
 'Rolling-unrolling lstms for action anticipation from first-person video',
 'Anticipative video transformer',
 'Summarization of egocentric videos: A comprehensive survey',
 'Story-driven summarization for egocentric video',


## Access the PDFs from metadata

### Observations

* When a DoI is present, open access button is a good API to get the pdf url from DOI. However, it is not perfect.
* Open access pdf search is integrated directly into semantic scholar. This sometimes gets the pdf. If it is an arxiv paper, we can use the arxiv id to get the pdfs directly.

In [29]:
for meta in matched_references_meta:
    if meta['openAccessPdf'] is not None:
        meta['pdf_url'] = meta['openAccessPdf']['url']
    elif 'ArXiv' in meta['externalIds']:
        meta['pdf_url'] = f"https://arxiv.org/pdf/{meta['externalIds']['ArXiv']}"
    else:
        meta['pdf_url'] = None

In [30]:
[m['pdf_url'] for m in matched_references_meta]

['https://arxiv.org/pdf/1804.02748',
 'https://arxiv.org/pdf/2110.07058',
 'http://vision.cs.utexas.edu/projects/egocentric/egocentric_cvpr2012.pdf',
 None,
 'http://repository.gatech.edu/bitstreams/4f49dbef-a8e1-44f8-842b-72a7370d1751/download',
 'https://arxiv.org/pdf/1908.08498',
 'https://arxiv.org/pdf/2104.07905',
 None,
 'https://arxiv.org/pdf/1905.12681',
 'https://arxiv.org/pdf/1905.00561',
 'https://arxiv.org/pdf/1804.00892',
 'https://arxiv.org/pdf/2005.02190',
 'https://arxiv.org/pdf/2106.02036',
 None,
 'https://www.cs.utexas.edu/~grauman/papers/lu-grauman-cvpr2013.pdf',
 'http://arxiv.org/pdf/2209.13064',
 'https://arxiv.org/pdf/1911.10967',
 'https://arxiv.org/pdf/1603.07763',
 None,
 'https://arxiv.org/pdf/2108.13665',
 'https://link.springer.com/content/pdf/10.1007/s11263-022-01694-6.pdf']

## Download the PDFs

In [26]:
paperdir = '/home/surya/NEU/CS5100 FAI/Project/pdfreader/python/papers'

In [27]:
from pathlib import Path

for meta in tqdm(matched_references_meta):
    paperId = meta['paperId']
    pdf = meta['pdf_url']

    if pdf is None:
        continue        

    file = Path(f"{paperdir}/{paperId}.pdf")
    
    # download
    response = requests.get(pdf)
    file.write_bytes(response.content)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:05<00:00,  5.43s/it]


# Problems

Selection across paragraphs, pages.

Paragraphs broken by images and tables.

~~Above problems require using multiple block matches, right now only using the top match.~~

Make it work for name-based citation

Make it work for 2-column references?