# Citation Extractor

Given a PDF file and an extract of text from that file, fetch all the papers cited in the extract.

In [1]:
import fitz  # PyMuPDF
import re
from rapidfuzz import fuzz
from tqdm import tqdm
from time import sleep
import json
import requests

In [2]:
# Folder holding the project data, the PDF to analyse, and the opened document.
datadir = "/home/surya/NEU/CS5100 FAI/Project/pdfreader/"
file = f"{datadir}test2.pdf"
doc = fitz.open(file)

In [3]:
# Passage copied from the PDF whose citations should be resolved.  The blank
# lines that the triple-quoted literal adds at both ends are stripped at once.
extract = """
This is the same objective optimized in prior works [49, 38, 1, 26] using the DPO-equivalent reward
for the reward class of rφ . In this setting, we can interpret the normalization term in f (rφ, πref , β)
as the soft value function of the reference policy πref . While this term does not affect the optimal
solution, without it, the policy gradient of the objective could have high variance, making learning
unstable. We can accommodate for the normalization term using a learned value function, but that
can also be difficult to optimize. Alternatively, prior works have normalized rewards using a human
completion baseline, essentially a single sample Monte-Carlo estimate of the normalizing term. In
contrast the DPO reparameterization yields a reward function that does not require any baselines.
""".strip()

## Find the extract text in the document

In [4]:
# Minimum fuzzy-match score a text block must reach to be kept as a candidate.
THRESHOLD = 95

In [5]:
# Scan every text block of every page and keep the ones that fuzzily contain
# the extract.  `matches` ends up as a list of (block, page_num, score)
# tuples sorted by score, best first.
matches = []

for page_num, page in tqdm(enumerate(doc), total=len(doc), leave=False):
    # Text blocks of the current page; index 4 of each block tuple is its text.
    text_blocks = page.get_text("blocks")
    for block in text_blocks:
        text = block[4]

        match_score = fuzz.partial_ratio(extract, text)

        if match_score >= THRESHOLD:
            matches.append((block, page_num, match_score))

matches = sorted(matches, key=lambda x: x[2], reverse=True)

                                                                                                                                                         

Remove matches that are too small

In [6]:
# Matched blocks whose text is not longer than this many characters are discarded.
MINTEXTLEN = 10

In [7]:
# Keep only the candidates whose block text is longer than MINTEXTLEN characters.
matches = [candidate for candidate in matches if len(candidate[0][4]) > MINTEXTLEN]

In [8]:
matches

[((107.69100189208984,
   634.4305419921875,
   505.1564636230469,
   722.7994995117188,
   'This is the same objective optimized in prior works [49, 38, 1, 26] using the DPO-equivalent reward\nfor the reward class of rϕ. In this setting, we can interpret the normalization term in f(rϕ, πref, β)\nas the soft value function of the reference policy πref. While this term does not affect the optimal\nsolution, without it, the policy gradient of the objective could have high variance, making learning\nunstable. We can accommodate for the normalization term using a learned value function, but that\ncan also be difficult to optimize. Alternatively, prior works have normalized rewards using a human\ncompletion baseline, essentially a single sample Monte-Carlo estimate of the normalizing term. In\ncontrast the DPO reparameterization yields a reward function that does not require any baselines.\n',
   26,
   0),
  5,
  99.24812030075188)]

For now keep the top match

In [9]:
# Best-scoring candidate: its block tuple and the page it was found on.
matched_block, matched_page = matches[0][:2]

## Get all citation numbers in the text and the corresponding links

In [10]:
# Rectangle built from the first four values of the matched block tuple.
matched_bbox = fitz.Rect(matched_block[:4])

Get citing links

In [11]:
# Named links on the matched page whose source rectangle overlaps the
# bounding box of the matched block.
matched_links = [
    link
    for link in doc[matched_page].get_links()
    if link['kind'] == fitz.LINK_NAMED and matched_bbox.intersects(link['from'])
]

Get the citation number for each link.

Here we also filter out the citation links that are not part of the original extract.

In [12]:
# Keep only the links that are bibliography citations mentioned in the
# extract, and annotate each kept link with its citation number (a string).
matched_links_filtered = []

page = doc[matched_page]
for link in matched_links:
    # keep only citations, not equations and figures
    if not (link.get('nameddest') or '').startswith('cite.'):
        continue

    # The text underneath the link rectangle carries the citation number.
    link_text = page.get_text('text', clip=link['from'])
    number_match = re.search(r'\d+', link_text)
    if number_match is None:
        # No digits under this link, so there is nothing to look up.
        continue
    citation_num = number_match.group()

    # Match the number as a whole token: a plain substring test would accept
    # "4" merely because the extract contains "49".
    if not re.search(r'\b' + citation_num + r'\b', extract):
        continue

    link['citation_number'] = citation_num
    matched_links_filtered.append(link)

In [13]:
[m['citation_number'] for m in matched_links_filtered]

['49', '38', '1', '26']

## Get the references for these citations

In [14]:
# For each citation, go to the page its link points at and collect every
# text block that carries the citation number near its start.
matched_references = []

for link in matched_links_filtered:
    citation_num = link['citation_number']
    num_pat = rf'\b{citation_num}\b'
    linked_page = doc.load_page(link['page'])

    # The number must appear as a whole word within the first 15 characters
    # of the block text for the block to be taken as the reference entry.
    matched_references.extend(
        block[4].strip()
        for block in linked_page.get_text("blocks")
        if re.search(num_pat, block[4][:15])
    )

In [15]:
# Put every reference on a single line, then display the cleaned list.
matched_references = [reference.replace('\n', ' ') for reference in matched_references]
matched_references

['[49] D. M. Ziegler, N. Stiennon, J. Wu, T. B. Brown, A. Radford, D. Amodei, P. Christiano, and G. Irving. Fine-tuning language models from human preferences, 2020.',
 '[38] N. Stiennon, L. Ouyang, J. Wu, D. M. Ziegler, R. Lowe, C. Voss, A. Radford, D. Amodei, and P. Christiano. Learning to summarize from human feedback, 2022.',
 '[1] Y. Bai, A. Jones, K. Ndousse, A. Askell, A. Chen, N. DasSarma, D. Drain, S. Fort, D. Ganguli, T. Henighan, N. Joseph, S. Kadavath, J. Kernion, T. Conerly, S. El-Showk, N. Elhage, Z. Hatfield- Dodds, D. Hernandez, T. Hume, S. Johnston, S. Kravec, L. Lovitt, N. Nanda, C. Olsson, D. Amodei, T. Brown, J. Clark, S. McCandlish, C. Olah, B. Mann, and J. Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback, 2022.',
 '[26] L. Ouyang, J. Wu, X. Jiang, D. Almeida, C. Wainwright, P. Mishkin, C. Zhang, S. Agarwal, K. Slama, A. Ray, J. Schulman, J. Hilton, F. Kelton, L. Miller, M. Simens, A. Askell, P. Welinder, P. F. Chris

## Format the references

Extract clean attributes from the references. This will make the searches more reliable and accurate.

Some references:

https://anystyle.io/   - Written in ruby, present as cli and web api.

https://pypi.org/project/refextract/

### anystyle.io

To avoid setting up Ruby and its libraries on the host, I set up my own simple Ruby server locally in Docker, with some simple Sinatra code.

The following section would work once the container is running.

In [16]:
def get_title_from_reftext(reftext, min_title_len=15, timeout=10):
    """Extract the title of a reference via the local reference-parsing server.

    The reference text is POSTed as UTF-8 plain text to
    ``http://localhost:4567/parse``. The ``title`` parts of the first entry
    in the JSON reply are joined with single spaces.

    Args:
        reftext: Raw text of one bibliography entry.
        min_title_len: Minimum acceptable length of the extracted title.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted title as one string.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server replies with an HTTP error status.
        ValueError: If the reply contains no title, or the title is shorter
            than ``min_title_len``.
    """
    response = requests.post(
        'http://localhost:4567/parse',
        headers={"Content-Type": "text/plain"},
        data=reftext.encode("utf-8"),
        timeout=timeout,
    )
    # Surface server-side failures here instead of as a confusing JSON error.
    response.raise_for_status()
    parsed_data = response.json()

    if not parsed_data or not parsed_data[0].get('title'):
        raise ValueError(f"no title found in reference text: {reftext!r}")

    # Only the title is returned; the parsed date is not appended to it.
    title = ' '.join(parsed_data[0]['title'])

    # Explicit check rather than `assert`, which is skipped under `python -O`.
    if len(title) < min_title_len:
        raise ValueError(
            f"extracted title {title!r} is shorter than {min_title_len} characters"
        )

    return title

In [17]:
# Resolve each reference text to its title, one parser request per reference.
matched_references_title = []

for reference in tqdm(matched_references):
    matched_references_title.append(get_title_from_reftext(reference))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.07it/s]


In [18]:
matched_references_title

['Fine-tuning language models from human preferences',
 'Learning to summarize from human feedback',
 'Training a helpful and harmless assistant with reinforcement learning from human feedback',
 'Training language models to follow instructions with human feedback']

## Get the metadata of these references

We will use external services to query for these reference texts and get the relevant metadata.

### Observations

* Using scholarly (which uses Google Scholar) posed a lot of networking challenges but worked well, particularly for directly getting the PDF.
* using habanero works well for a lot of cases, but fails for a lot of arxiv papers
* In a lot of the services, using the wrong year (seems to be common with arxiv - conference mismatches) completely messes up the results
* semantic scholar works well, but sometimes can't show pdfs, especially when there is an arxiv paper. I guess pre-prints are not exactly the open-access version of the published paper. But for our purposes it should be good enough.

### Semantic scholar

I've requested the API key

In [19]:
import os

from semanticscholar import SemanticScholar

# Read the API key from the environment so the secret is never committed with
# the notebook.  When S2_API_KEY is unset, None is passed to the client.
# NOTE(review): a key that was previously committed here should be revoked.
s2_api_key = os.environ.get('S2_API_KEY')
sch = SemanticScholar(api_key=s2_api_key, timeout=3)

In [20]:
# Search Semantic Scholar for each title and keep the raw metadata of the
# first result.
matched_references_meta = []
requested_fields = ['title', 'paperId', 'externalIds', 'openAccessPdf']

for title in tqdm(matched_references_title):
    top_hit = sch.search_paper(title, limit=1, fields=requested_fields)[0]
    matched_references_meta.append(top_hit.raw_data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:34<00:00,  8.60s/it]


In [21]:
[m['title'] for m in matched_references_meta]

['Fine-Tuning Language Models from Human Preferences',
 'Learning to summarize from human feedback',
 'Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback',
 'Training language models to follow instructions with human feedback']

In [22]:
matched_references_title

['Fine-tuning language models from human preferences',
 'Learning to summarize from human feedback',
 'Training a helpful and harmless assistant with reinforcement learning from human feedback',
 'Training language models to follow instructions with human feedback']

## Access the PDFs from metadata

### Observations

* When a DOI is present, the Open Access Button API is a good way to get the PDF URL from the DOI. However, it is not perfect.
* Open access pdf search is integrated directly into semantic scholar. This sometimes gets the pdf. If it is an arxiv paper, we can use the arxiv id to get the pdfs directly.

In [23]:
# Choose a PDF URL for every paper: prefer the open-access link from the
# metadata, fall back to the arXiv PDF built from the arXiv id, else None.
for meta in matched_references_meta:
    # Treat a missing or null field as empty so the lookups below cannot fail.
    open_access = meta.get('openAccessPdf') or {}
    external_ids = meta.get('externalIds') or {}

    if open_access.get('url'):
        meta['pdf_url'] = open_access['url']
    elif external_ids.get('ArXiv'):
        meta['pdf_url'] = f"https://arxiv.org/pdf/{external_ids['ArXiv']}.pdf"
    else:
        meta['pdf_url'] = None

In [24]:
[m['pdf_url'] for m in matched_references_meta]

['https://arxiv.org/pdf/1909.08593.pdf',
 'https://arxiv.org/pdf/2009.01325.pdf',
 'http://arxiv.org/pdf/2204.05862',
 'https://arxiv.org/pdf/2203.02155.pdf']

## Download the PDFs

In [25]:
# Directory the downloaded PDFs are written to.
paperdir = '/home/surya/NEU/CS5100 FAI/Project/pdfreader/python/papers'

In [26]:
from pathlib import Path

# Download every paper that has a PDF URL into `paperdir`, naming each file
# after the paper's Semantic Scholar id.
Path(paperdir).mkdir(parents=True, exist_ok=True)

for meta in tqdm(matched_references_meta):
    paperId = meta['paperId']
    pdf = meta['pdf_url']

    if pdf is None:
        continue

    # NOTE: `file` rebinds the name that held the input PDF path earlier on.
    file = Path(paperdir) / f"{paperId}.pdf"

    # download
    try:
        response = requests.get(pdf, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # Skip this paper instead of saving an error page under a .pdf name.
        tqdm.write(f"Could not download {pdf}: {err}")
        continue

    file.write_bytes(response.content)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.58it/s]


# Problems

Selection across paragraphs, pages.

Paragraphs broken by images and tables.

Above problems require using multiple block matches, right now only using the top match.

Make it work for name-based citation

Make it work for 2-column references?