# arXiv Crawler

In [1]:
import pandas as pd
import time
import os

In [2]:
# https://arxiv.org/help/api/index
# https://arxiv.org/help/api/user-manual

import xml.etree.ElementTree as ET
from urllib.request import urlopen

# http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1

from urllib.parse import quote

# Parameters of the arXiv API request.
base_url = 'http://export.arxiv.org/api/query?search_query='
categroy = "stat.ML"  # NOTE(review): misspelled and not used by the query below; kept so the name stays defined
subject = "deep"
start = 10
max_results = 1000

# Percent-encode the search term so subjects containing spaces or other
# reserved characters still produce a valid URL (plain words are unchanged).
url = base_url + 'all:' + quote(subject) + '&start=' + str(start) + '&max_results=' + str(max_results)
print(url)

# Read the whole Atom feed and close the HTTP response deterministically.
with urlopen(url) as response:
    data = response.read()
parsed = ET.fromstring(data)

# The feed reports the number of matches in an OpenSearch element; treat a
# missing element the same as zero matches instead of raising AttributeError.
total_results = parsed.find("{http://a9.com/-/spec/opensearch/1.1/}totalResults")
if total_results is None or int(total_results.text) == 0:
    print("NO RESULTS - ABORT NOW")

http://export.arxiv.org/api/query?search_query=all:deep&start=10&max_results=1000


In [3]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Namespace prefixes in ElementTree's "{namespace-uri}tag" form.
_ATOM_NS = "{http://www.w3.org/2005/Atom}"
_ARXIV_NS = "{http://arxiv.org/schemas/atom}"

# Per-entry metadata tags.
XML_ENTRIES = _ATOM_NS + "entry"
XML_ID = _ATOM_NS + "id"
XML_UPDATED = _ATOM_NS + "updated"
XML_PUBLISHED = _ATOM_NS + "published"
XML_TITLE = _ATOM_NS + "title"
XML_SUMMARY = _ATOM_NS + "summary"

# Author tags.
XML_AUTHOR = _ATOM_NS + "author"
XML_AUTHOR_NAME = _ATOM_NS + "name"
XML_AUTHOR_AFFILIATION = _ARXIV_NS + "affiliation"

# Publication reference and link tags.
XML_DOI = _ARXIV_NS + "doi"
XML_JOURNAL_REF = _ARXIV_NS + "journal_ref"
XML_LINKS = _ATOM_NS + "link"

# Category tags.
XML_PRIMARY_CATEGORY = _ARXIV_NS + "primary_category"
XML_CATEGORY = _ATOM_NS + "category"




def _child_text(element, tag):
    """Return the text of the direct child *tag* of *element*, or None if it is absent."""
    child = element.find("./" + tag)
    return child.text if child is not None else None


def _parse_entry(entry):
    """Convert one feed entry element into a 12-item row.

    The row order is: id, updated, published, title, summary, author names,
    author affiliations, DOI, journal reference, PDF link, primary category,
    categories. Optional scalar fields that are missing become None, so every
    row always has the same length.
    """
    # Authors and their affiliation, if any (only the first affiliation
    # element of each author is recorded).
    author_names = []
    author_affiliations = []
    for author in entry.findall("./" + XML_AUTHOR):
        name = _child_text(author, XML_AUTHOR_NAME)
        if name is not None:
            author_names.append(name)
        affiliation = _child_text(author, XML_AUTHOR_AFFILIATION)
        if affiliation is not None:
            author_affiliations.append(affiliation)

    # Exactly one PDF slot per row: the first link titled "pdf", or None.
    # Appending zero or several links would shift every later column.
    pdf_link = next(
        (
            link.attrib.get("href")
            for link in entry.findall("./" + XML_LINKS)
            if link.attrib.get("title") == "pdf"
        ),
        None,
    )

    primary_category = entry.find("./" + XML_PRIMARY_CATEGORY)
    primary_term = primary_category.attrib.get("term") if primary_category is not None else None

    categories = [category.attrib.get("term") for category in entry.findall("./" + XML_CATEGORY)]

    return [
        _child_text(entry, XML_ID),           # Entry ID
        _child_text(entry, XML_UPDATED),      # Entry update date
        _child_text(entry, XML_PUBLISHED),    # Entry published date
        _child_text(entry, XML_TITLE),        # Entry title
        _child_text(entry, XML_SUMMARY),      # Entry summary
        author_names,
        author_affiliations,
        _child_text(entry, XML_DOI),          # Entry DOI
        _child_text(entry, XML_JOURNAL_REF),  # Entry journal reference
        pdf_link,
        primary_term,
        categories,
    ]


# Iterate the found entries directly: an empty result simply yields no rows,
# and no decision depends on the truth value of an Element (which is False
# for a childless element and deprecated in recent Python versions).
entries_root = parsed.findall("./" + XML_ENTRIES)
entries_list = [_parse_entry(entry) for entry in entries_root]
        
#pp.pprint(entries_list)

In [4]:
HEADERS = ['id', 'updated', 'published', 'title', 'summary', 'authors', 'affiliations', 'doi', 'journal_ref', 'pdf_link', 'primary_category', 'categories']

# Columns whose cells are Python lists and must be flattened to one string.
_LIST_COLUMNS = ('authors', 'affiliations', 'categories')


def create_df(entries):
    """Build a DataFrame from parsed entry rows.

    Args:
        entries: iterable of rows, each with one item per name in HEADERS;
            the 'authors', 'affiliations' and 'categories' items are lists
            of strings.

    Returns:
        A DataFrame with HEADERS as columns in which each list-valued cell
        has been joined into a single '|'-separated string.
    """
    # Use the argument, not the module-level entries_list, so the function
    # works for any batch of rows the caller passes in.
    df = pd.DataFrame(entries, columns=HEADERS)
    for column in _LIST_COLUMNS:
        df[column] = df[column].apply(lambda items: '|'.join(items))
    return df

def write_df(entries_df, filename):
    """Write *entries_df* to '<filename>.csv.bz2' as bz2-compressed CSV.

    If the target file does not exist it is created with a header row;
    otherwise the rows are appended without a header. Progress messages
    are printed to stdout.
    """
    target = filename + '.csv.bz2'
    already_there = os.path.exists(target)

    print(('Writing file: ' if already_there else 'Creating file: ') + target)
    entries_df.to_csv(
        target,
        index=False,
        compression='bz2',
        mode='a' if already_there else 'w',
        # Only a newly created file gets the column names.
        header=not already_there,
    )

    print('Done!')

In [5]:
# Name the output after the current Unix time in whole seconds.
ARXIV_DATA_FILENAME = 'arxiv_data_{}'.format(int(time.time()))

# Tabulate the parsed entries, then write them to the compressed CSV.
df = create_df(entries_list)
write_df(entries_df=df, filename=ARXIV_DATA_FILENAME)

#df = create_df(entries_list)
#write_df(df, ARXIV_DATA_FILENAME)

#df = create_df(entries_list)
#write_df(df, ARXIV_DATA_FILENAME)

Creating file: arxiv_data_1556874672.csv.bz2
Done!
