In [7]:
import requests
from pathlib import Path
import arxiv
from nltk import edit_distance

In [2]:
def download_arxiv_by_id(output_directory:Path, arxiv_id:str, timeout:float=30.0):
    """Download the PDF of an arXiv paper, identified by its arXiv id.

    The PDF is requested from ``https://arxiv.org/pdf/<arxiv_id>`` and, on an
    HTTP 200 response, written to ``<output_directory>/<arxiv_id>.pdf``.
    Any ``/`` in the id is replaced by ``_`` in the file name only.
    Success and failure are reported with ``print``; nothing is raised.

    Args:
        output_directory: Directory the PDF is written to. It is created
            (including parents) if it does not exist yet.
        arxiv_id: arXiv identifier, e.g. ``"1605.08386v1"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        None.
    """
    # Built outside the ``try`` so the error message below can always refer to it.
    url = f"https://arxiv.org/pdf/{arxiv_id}"

    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code == 200:
            target_directory = Path(output_directory)
            target_directory.mkdir(parents=True, exist_ok=True)

            # Ids containing "/" (e.g. "hep-th/9901001") would otherwise be
            # interpreted as a path into a non-existent subdirectory.
            pdf_name = f"{arxiv_id}.pdf".replace("/", "_")

            # Save PDF to file
            (target_directory / pdf_name).write_bytes(response.content)
            print(f"Paper '{arxiv_id}' downloaded successfully.")
        else:
            print(f"Failed to download paper '{arxiv_id}': HTTP status code {response.status_code}")

    except Exception as e:
        # Best-effort helper: report network / filesystem errors instead of raising.
        print(f"Failed to download paper from {url}: {str(e)}")

In [8]:
def download_arxiv_by_title(output_directory:Path, paper_title:str, title_distance_threshold:int=5):
    """Search arXiv for a title, download the top hit and report whether it matches.

    The single most relevant search result for ``paper_title`` is downloaded
    to ``<output_directory>/<short id>.pdf``. The download happens regardless
    of how well the title matches; the return value tells the caller whether
    the retrieved paper is likely the requested one.

    Args:
        output_directory: Directory the PDF is written to. It is created
            (including parents) if it does not exist yet.
        paper_title: Title to search for.
        title_distance_threshold: The retrieved title counts as a match when
            its edit distance to ``paper_title`` is strictly below this value.

    Returns:
        True if a paper was found and its title is within the threshold,
        False if the titles differ too much or the search returned nothing.
    """
    client = arxiv.Client()

    search = arxiv.Search(
        query = paper_title,
        max_results = 1,
        sort_by = arxiv.SortCriterion.Relevance
    )

    # Reuse the one client; a default of None avoids a bare StopIteration
    # when the search yields no results.
    paper = next(client.results(search), None)
    if paper is None:
        return False

    Path(output_directory).mkdir(parents=True, exist_ok=True)
    paper.download_pdf(dirpath=output_directory, filename=f"{paper.get_short_id()}.pdf")

    # use Levenshtein distance to check paper retrieved has same title:
    title_match = edit_distance(paper.title, paper_title) < title_distance_threshold

    return title_match

In [4]:
def download_arxiv_by_vector_query(output_directory:Path, query):
    """Placeholder for downloading a paper selected by a vector query.

    TODO: IMPLEMENT. Currently a no-op that ignores both arguments and
    returns None.
    """
    return None

In [27]:
# Demo: fetch a paper by its arXiv id into the current directory.
# (The helper defined above is `download_arxiv_by_id`; `download_arxiv` is not defined.)
download_arxiv_by_id(Path("."), "1605.08386v1")

Paper '1605.08386v1' downloaded successfully.


In [11]:
# Demo: fetch a paper by title into the current directory; the displayed
# value is the title-match flag returned by the helper.
download_arxiv_by_title(
    Path("."),
    "Classification of flat bands according to the band-crossing singularity of Bloch wavefunctions",
)

True