# arXiv Crawler

In [245]:
# https://arxiv.org/help/api/index
# https://arxiv.org/help/api/user-manual

import xml.etree.ElementTree as ET
from urllib.request import urlopen

# http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1

# Build the query URL for the arXiv export API and fetch one page of results.
base_url = 'http://export.arxiv.org/api/query?search_query='
# NOTE(review): `categroy` (sic) is not used by the code visible here; the
# misspelled name is kept in case a later cell refers to it.
categroy = "stat.ML"
subject = "deep"   # search term placed after the `all:` prefix
start = 10         # value of the `start` query parameter
max_results = 3    # value of the `max_results` query parameter

url = base_url + 'all:' + subject + '&start=' + str(start) + '&max_results=' + str(max_results)
print(url)

# Use a context manager so the HTTP response is closed as soon as the body
# has been read, instead of leaking the connection until garbage collection.
with urlopen(url) as response:
    data = response.read()
parsed = ET.fromstring(data)

# The feed carries its match count in the OpenSearch <totalResults> element.
total_results = int(parsed.find("{http://a9.com/-/spec/opensearch/1.1/}totalResults").text)
if total_results == 0:
    # This only prints a warning; nothing is raised, so execution continues.
    print("NO RESULTS - ABORT NOW")

In [259]:
import pprint
# Pretty-printer used to display the parsed entries.
pp = pprint.PrettyPrinter(indent=4)

# Namespace prefixes in ElementTree's "{uri}tag" notation; every tag constant
# below is one of these prefixes followed by the element's local name.
ATOM_NS = "{http://www.w3.org/2005/Atom}"
ARXIV_NS = "{http://arxiv.org/schemas/atom}"

# Per-entry scalar elements.
XML_ENTRIES = ATOM_NS + "entry"
XML_ID = ATOM_NS + "id"
XML_UPDATED = ATOM_NS + "updated"
XML_PUBLISHED = ATOM_NS + "published"
XML_TITLE = ATOM_NS + "title"
XML_SUMMARY = ATOM_NS + "summary"

# Author element and its children.
XML_AUTHOR = ATOM_NS + "author"
XML_AUTHOR_NAME = ATOM_NS + "name"
XML_AUTHOR_AFFILIATION = ARXIV_NS + "affiliation"

# Publication metadata and links.
XML_DOI = ARXIV_NS + "doi"
XML_JOURNAL_REF = ARXIV_NS + "journal_ref"
XML_LINKS = ATOM_NS + "link"

# Category elements.
XML_PRIMARY_CATEGORY = ARXIV_NS + "primary_category"
XML_CATEGORY = ATOM_NS + "category"




# Flatten every Atom <entry> of the parsed feed into one list per entry:
#   [id, updated, published, title, summary,
#    [author names], [author affiliations],
#    DOI (only if present), journal ref (only if present),
#    PDF link(s) (only if present),
#    primary category term, [category terms]]
# Optional fields are appended only when present, so the positions after the
# affiliations list can differ between entries.
entries_list = []
entries_root = parsed.findall("./" + XML_ENTRIES)

# No emptiness guard is needed: findall() returns an empty list when the feed
# has no entries. (Testing an Element for truth is also unreliable, since an
# element without children is falsy.)
for entry in entries_root:
    entry_list = []

    # Scalar fields. They must be looked up relative to `entry`; searching from
    # `parsed` always returns the FIRST entry's element, which made every row
    # repeat the first entry's values.
    for tag in (XML_ID, XML_UPDATED, XML_PUBLISHED, XML_TITLE, XML_SUMMARY):
        entry_list.append(entry.find("./" + tag).text)

    # List of entry authors and their affiliation, if any ("" when absent)
    authors_name_list = []
    authors_affiliation_list = []
    for author in entry.findall("./" + XML_AUTHOR):
        authors_name_list.append(author.find("./" + XML_AUTHOR_NAME).text)
        affiliation = author.find("./" + XML_AUTHOR_AFFILIATION)
        authors_affiliation_list.append(affiliation.text if affiliation is not None else "")
    entry_list.append(authors_name_list)
    entry_list.append(authors_affiliation_list)

    # Entry DOI, then entry journal ref (each appended only when present)
    for tag in (XML_DOI, XML_JOURNAL_REF):
        node = entry.find("./" + tag)
        if node is not None:
            entry_list.append(node.text)

    # Entry PDF link: the <link> element(s) whose title attribute is "pdf"
    for link in entry.findall("./" + XML_LINKS):
        if link.attrib.get("title") == "pdf":
            entry_list.append(link.attrib.get("href"))

    # Entry primary category
    entry_list.append(entry.find("./" + XML_PRIMARY_CATEGORY).attrib.get("term"))

    # Entry categories
    authors_categories_list = [
        category.attrib.get("term") for category in entry.findall("./" + XML_CATEGORY)
    ]
    entry_list.append(authors_categories_list)

    # Build entries list
    entries_list.append(entry_list)

pp.pprint(entries_list)

[   [   'http://arxiv.org/abs/1602.03332v1',
        '2016-02-10T11:40:14Z',
        '2016-02-10T11:40:14Z',
        'Polynomial Depth, Highness and Lowness for E',
        '  We study the relations between the notions of highness, lowness and '
        'logical\n'
        'depth in the setting of complexity theory. We introduce a new notion '
        'of\n'
        'polynomial depth based on time bounded Kolmogorov complexity. We show '
        'our\n'
        'polynomial depth notion satisfies all basic logical depth properties, '
        'namely\n'
        'neither sets in P nor sets random for EXP are polynomial deep, and '
        'only\n'
        'polynomial deep sets can polynomially Turing compute a polynomial '
        'deep set. We\n'
        'prove all EXP- complete sets are poly-deep, and under the assumption '
        'that NP\n'
        'does not have p-measure zero, then NP contains a polynomial deep set. '
        'We show\n'
        'that every high set for E contains 