# Arxiv Crawler
## Limitation
https://cran.r-project.org/web/packages/aRxiv/vignettes/aRxiv.html
- maximum 50k records per query

In [1]:
import pandas as pd
import time
import os
import re

In [25]:
XML_ENTRIES = "{http://www.w3.org/2005/Atom}entry"
XML_ID = "{http://www.w3.org/2005/Atom}id"
XML_UPDATED = "{http://www.w3.org/2005/Atom}updated"
XML_PUBLISHED = "{http://www.w3.org/2005/Atom}published"
XML_TITLE = "{http://www.w3.org/2005/Atom}title"
XML_SUMMARY = "{http://www.w3.org/2005/Atom}summary"

XML_AUTHOR = "{http://www.w3.org/2005/Atom}author"
XML_AUTHOR_NAME = "{http://www.w3.org/2005/Atom}name"
XML_AUTHOR_AFFILIATION = "{http://arxiv.org/schemas/atom}affiliation"

XML_DOI = "{http://arxiv.org/schemas/atom}doi"
XML_JOURNAL_REF = "{http://arxiv.org/schemas/atom}journal_ref"
XML_LINKS = "{http://www.w3.org/2005/Atom}link"

XML_PRIMARY_CATEGORY = "{http://arxiv.org/schemas/atom}primary_category"
XML_CATEGORY = "{http://www.w3.org/2005/Atom}category"


def parse_to_list(parsed):
    entries_list = []
    entries_root = parsed.findall("./"+XML_ENTRIES)
    
    if parsed.find("./"+XML_ENTRIES):
        for entry in entries_root:        
            entry_list = []

            entry_list.append(entry.find("./"+XML_ID).text) # Entry ID
            entry_list.append(entry.find("./"+XML_UPDATED).text) # Entry Update Date
            entry_list.append(entry.find("./"+XML_PUBLISHED).text) # Entry Published Date
            entry_list.append(entry.find("./"+XML_TITLE).text) # Entry Title
            entry_list.append(entry.find("./"+XML_SUMMARY).text) # Entry Summary

            # List of entry authors and their affiliation is any
            authors_name_list = []
            authors_affiliation_list = []
            for author in entry.findall("./"+XML_AUTHOR):
                authors_name_list.append(author.find("./"+XML_AUTHOR_NAME).text)
                if author.find("./"+XML_AUTHOR_AFFILIATION) != None:
                    authors_affiliation_list.append(author.find("./"+XML_AUTHOR_AFFILIATION).text)
            entry_list.append(authors_name_list)
            entry_list.append(authors_affiliation_list)

            # Entry DOI
            doi = entry.find("./"+XML_DOI)
            if doi is not None:
                entry_list.append(doi.text)
            else:
                entry_list.append(None)

            # Entry Journal Ref
            jref = entry.find("./"+XML_JOURNAL_REF)
            if jref is not None:
                entry_list.append(jref.text)
            else:
                entry_list.append(None)

            # Entry PDF link
            for links in entry.findall("./"+XML_LINKS):
                if (links.attrib.get("title") == "pdf"): entry_list.append(links.attrib.get("href"))

            # Entry primary category
            entry_list.append(entry.find("./"+XML_PRIMARY_CATEGORY).attrib.get("term"))

            # Entry Categories
            authors_categories_list = []
            for category in entry.findall("./"+XML_CATEGORY):
                for term in category.attrib.get("term").split(';'):
                    authors_categories_list.append(term.strip())
            entry_list.append(authors_categories_list)

            # Build entries list
            entries_list.append(entry_list) # Append to main entry list
            
        return entries_list


In [53]:
entries_list = backup_entries_list
len(entries_list)

53022

In [None]:
# https://arxiv.org/help/api/index
# https://arxiv.org/help/api/user-manual

import xml.etree.ElementTree as ET
from urllib.request import urlopen
import socket
import time
from math import ceil

#http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1
#http://export.arxiv.org/api/query?search_query=cat:cs.LG&sortBy=submittedDate&sortOrder=descending&&start='+str(n_start)+'&max_results='+str(max_articles_per_attempt)

date_start = "201001010000"
date_end = "20120905175052"#"20151226190200"#"20170909215812" #20180712194129
base_url = 'http://export.arxiv.org/api/query?search_query='
cat_type = "cat"
categroy = cat_type+":cs" #all:
subject = "*+AND+submittedDate:["+date_start+"+"+date_end+"TO+]"#"*+AND+submittedDate:[201001010000+TO+20180712194129]" #"deep" # "\"electron thermal conductivity\""
sort_by="lastUpdatedDate"
sort_order="descending"#"ascending"

#2018-07-12T19:41:20Z

start_value = 0#len(entries_list) #0
max_results = 1000
n_attempts = 999999999
max_articles_per_attempt = 1000
max_tries = 10
sleep_time = 5
socket.setdefaulttimeout(120.0)

if start_value <= 0:
    entries_list = []
start_time = time.time()
for i in range(n_attempts):
    print("\r\t>>> Downloading chunk {} from {} with {} elements".format(i+1, n_attempts,len(entries_list)), end=' ')
    
    n_start = start_value+i*max_articles_per_attempt
        
    url = (base_url + categroy + subject +
           "&start=" + str(n_start) +
           "&max_results=" + str(max_articles_per_attempt) +
           "&sortBy=" + sort_by +
           "&sortOrder" + sort_order)
    
    data = urlopen(url, None, timeout=120).read()
    parsed = ET.fromstring(data)
    
    
    if i==0:
        total_results = int(parsed.find("{http://a9.com/-/spec/opensearch/1.1/}totalResults").text)
        if total_results == 0:
            print("\tNO RESULTS-> ABORT NOW")
            print(url)
            break
        else:
            n_attempts = ceil(total_results/max_articles_per_attempt)-len(entries_list)/max_articles_per_attempt
    
    if parsed.find("./"+XML_ENTRIES):
        entries_list+=parse_to_list(parsed)
    else:
        print("\tNO ENTRIES-> TRYING AGAIN")
        print(url)
        for j in range(max_tries):
            print("Try: "+str(j+1)+"/"+str(max_tries))
            time.sleep(10)
            data = urlopen(url, None, timeout=120).read()
            parsed = ET.fromstring(data)
            if parsed.find("./"+XML_ENTRIES):
                entries_list+=parse_to_list(parsed)
                break
                
    if not parsed.find("./"+XML_ENTRIES):
        print("\tTERMINATED")
        print(url)
        break 


end_time = time.time()
print("->\tRunning time is {}s".format(round(end_time-start_time,2)))

	>>> Downloading chunk 2 from 26.0 with 1000 elements s 	NO ENTRIES-> TRYING AGAIN
http://export.arxiv.org/api/query?search_query=cat:cs*+AND+submittedDate:[201001010000+20120905175052TO+]&start=1000&max_results=1000&sortBy=lastUpdatedDate&sortOrderdescending
Try: 1/10
	>>> Downloading chunk 3 from 26.0 with 2000 elements 	NO ENTRIES-> TRYING AGAIN
http://export.arxiv.org/api/query?search_query=cat:cs*+AND+submittedDate:[201001010000+20120905175052TO+]&start=2000&max_results=1000&sortBy=lastUpdatedDate&sortOrderdescending
Try: 1/10
Try: 2/10
	>>> Downloading chunk 5 from 26.0 with 4000 elements 	NO ENTRIES-> TRYING AGAIN
http://export.arxiv.org/api/query?search_query=cat:cs*+AND+submittedDate:[201001010000+20120905175052TO+]&start=4000&max_results=1000&sortBy=lastUpdatedDate&sortOrderdescending
Try: 1/10
	>>> Downloading chunk 7 from 26.0 with 6000 elements 	NO ENTRIES-> TRYING AGAIN
http://export.arxiv.org/api/query?search_query=cat:cs*+AND+submittedDate:[201001010000+20120905175052TO

In [77]:
len(entries_list)

50000

In [80]:
entries_list[-1]

['http://arxiv.org/abs/1209.1060v11',
 '2014-08-03T13:51:49Z',
 '2012-09-05T17:50:52Z',
 'Combinatorial Spaces And Order Topologies',
 '  An archetypal problem discussed in computer science is the problem of\nsearching for a given number in a given set of numbers. Other than sequential\nsearch, the classic solution is to sort the list of numbers and then apply\nbinary search. The binary search problem has a complexity of O(logN) for a list\nof N numbers while the sorting problem cannot be better than O(N) on any\nsequential computer following the usual assumptions. Whenever the problem of\ndeciding partial order can be done in O(1), a variation of the problem on some\nbounded list of numbers is to apply binary search without resorting to sort.\nThe overall complexity of the problem is then O(log R) for some radius R. A\nlogarithmic upper-bound for finite encodings is shown. Also, the topology of\norderings can provide efficient algorithms for search problems in combinatorial\nspaces. T

In [68]:
from copy import deepcopy

backup_entries_list = deepcopy(entries_list)

In [69]:
len(backup_entries_list)

49600

In [31]:
url

'http://export.arxiv.org/api/query?search_query=cat:cs*&start=5000&max_results=1000&sortBy=lastUpdatedDate&sortOrderdescending'

In [23]:
HEADERS = ['id', 'updated', 'published', 'title', 'summary', 'authors', 'affiliations', 'doi', 'journal_ref', 'pdf_link', 'primary_category', 'categories']

def sanitize_str(s):
    if s:
        s = re.sub('[\r\n]+', ' ', s)
        s = re.sub(' +', ' ', s)
        s = re.sub('\$(.|\n)+?\$', '', s)
        s = re.sub('\$\$(.|\n)+?\$\$', '', s)
        s = re.sub('\(', '', s)
        s = re.sub('\s\s+', ' ', s)
        
        return s.strip()
    return s

def create_df(entries):
    df = pd.DataFrame(entries_list, columns=HEADERS)
    df['authors'] = df['authors'].apply(lambda x: '|'.join(x))
    df['affiliations'] = df['affiliations'].apply(lambda x: '|'.join(x))
    df['categories'] = df['categories'].apply(lambda x: '|'.join(x))
    return df.applymap(sanitize_str)

def write_df(entries_df, filename):
    filename = filename+'.csv.bz2'

    if not os.path.exists(filename):
        print('Creating file: '+filename)
        entries_df.to_csv(filename, index=False, compression='bz2', mode='w')
    else:
        print('Writing file: '+filename)
        entries_df.to_csv(filename, index=False, compression='bz2', mode='a', header=False)
        
    print('Done!')
    
def pickle_data(entries, filename):
    pd.set_option('display.max_columns', None)
    filename = "pickles/"+filename+'.pickle.bz2'
    df = pd.DataFrame(entries_list, columns=HEADERS)
    df.summary = df.summary.map(sanitize_str)
    print("Done!",filename)
    return df.to_pickle(filename, compression='bz2')
    

In [10]:
ARXIV_DATA_FILENAME = 'arxiv_data_'+str(int(time.time()))

df = create_df(entries_list)
write_df(df, ARXIV_DATA_FILENAME)

Creating file: arxiv_data_1560013031.csv.bz2
Done!


In [76]:
dates = date_start+"_to_"+date_end
ARXIV_DATA_PICKLE = "arxiv_data_"+cat_type+"_"+dates+"_"+str(int(time.time()))

pickle_data(entries_list, ARXIV_DATA_PICKLE)

Done! pickles/arxiv_data_cat_201001010000_to_20151226190200_1560070379.pickle.bz2
