# Arxiv Crawler

In [1]:
import pandas as pd
import time
import os
import re

In [25]:
XML_ENTRIES = "{http://www.w3.org/2005/Atom}entry"
XML_ID = "{http://www.w3.org/2005/Atom}id"
XML_UPDATED = "{http://www.w3.org/2005/Atom}updated"
XML_PUBLISHED = "{http://www.w3.org/2005/Atom}published"
XML_TITLE = "{http://www.w3.org/2005/Atom}title"
XML_SUMMARY = "{http://www.w3.org/2005/Atom}summary"

XML_AUTHOR = "{http://www.w3.org/2005/Atom}author"
XML_AUTHOR_NAME = "{http://www.w3.org/2005/Atom}name"
XML_AUTHOR_AFFILIATION = "{http://arxiv.org/schemas/atom}affiliation"

XML_DOI = "{http://arxiv.org/schemas/atom}doi"
XML_JOURNAL_REF = "{http://arxiv.org/schemas/atom}journal_ref"
XML_LINKS = "{http://www.w3.org/2005/Atom}link"

XML_PRIMARY_CATEGORY = "{http://arxiv.org/schemas/atom}primary_category"
XML_CATEGORY = "{http://www.w3.org/2005/Atom}category"


def parse_to_list(parsed):
    entries_list = []
    entries_root = parsed.findall("./"+XML_ENTRIES)
    
    if parsed.find("./"+XML_ENTRIES):
        for entry in entries_root:        
            entry_list = []

            entry_list.append(entry.find("./"+XML_ID).text) # Entry ID
            entry_list.append(entry.find("./"+XML_UPDATED).text) # Entry Update Date
            entry_list.append(entry.find("./"+XML_PUBLISHED).text) # Entry Published Date
            entry_list.append(entry.find("./"+XML_TITLE).text) # Entry Title
            entry_list.append(entry.find("./"+XML_SUMMARY).text) # Entry Summary

            # List of entry authors and their affiliation is any
            authors_name_list = []
            authors_affiliation_list = []
            for author in entry.findall("./"+XML_AUTHOR):
                authors_name_list.append(author.find("./"+XML_AUTHOR_NAME).text)
                if author.find("./"+XML_AUTHOR_AFFILIATION) != None:
                    authors_affiliation_list.append(author.find("./"+XML_AUTHOR_AFFILIATION).text)
            entry_list.append(authors_name_list)
            entry_list.append(authors_affiliation_list)

            # Entry DOI
            doi = entry.find("./"+XML_DOI)
            if doi is not None:
                entry_list.append(doi.text)
            else:
                entry_list.append(None)

            # Entry Journal Ref
            jref = entry.find("./"+XML_JOURNAL_REF)
            if jref is not None:
                entry_list.append(jref.text)
            else:
                entry_list.append(None)

            # Entry PDF link
            for links in entry.findall("./"+XML_LINKS):
                if (links.attrib.get("title") == "pdf"): entry_list.append(links.attrib.get("href"))

            # Entry primary category
            entry_list.append(entry.find("./"+XML_PRIMARY_CATEGORY).attrib.get("term"))

            # Entry Categories
            authors_categories_list = []
            for category in entry.findall("./"+XML_CATEGORY):
                for term in category.attrib.get("term").split(';'):
                    authors_categories_list.append(term.strip())
            entry_list.append(authors_categories_list)

            # Build entries list
            entries_list.append(entry_list) # Append to main entry list
            
        return entries_list


In [None]:
# https://arxiv.org/help/api/index
# https://arxiv.org/help/api/user-manual

import xml.etree.ElementTree as ET
from urllib.request import urlopen
import socket
import time
from math import ceil

#http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1
#http://export.arxiv.org/api/query?search_query=cat:cs.LG&sortBy=submittedDate&sortOrder=descending&&start='+str(n_start)+'&max_results='+str(max_articles_per_attempt)

base_url = 'http://export.arxiv.org/api/query?search_query='
categroy = "cat:cs" #all:
subject = "*" #deep
sort_by="lastUpdatedDate"
sort_order="descending"

start_value = len(entries_list) #0
max_results = 1000
n_attempts = 999999999
max_articles_per_attempt = 1000
max_tries = 10
sleep_time = 5
socket.setdefaulttimeout(120.0)

if start_value <= 0:
    entries_list = []
start_time = time.time()
for i in range(n_attempts):
    print("\r\t>>> Downloading chunk {} from {} with {} elements".format(i+1, n_attempts,len(entries_list)), end='')
    
    n_start = start_value+i*max_articles_per_attempt
        
    url = (base_url + categroy + subject +
           "&start=" + str(n_start) +
           "&max_results=" + str(max_articles_per_attempt) +
           "&sortBy=" + sort_by +
           "&sortOrder" + sort_order)
    
    data = urlopen(url, None, timeout=120).read()
    parsed = ET.fromstring(data)
    
    
    if i==0:
        total_results = int(parsed.find("{http://a9.com/-/spec/opensearch/1.1/}totalResults").text)
        if total_results == 0:
            print("\tNO RESULTS-> ABORT NOW")
            break
        else:
            n_attempts = ceil(total_results/max_articles_per_attempt)-len(entries_list)/max_articles_per_attempt
    
    if parsed.find("./"+XML_ENTRIES):
        entries_list+=parse_to_list(parsed)
    else:
        print("\tNO ENTRIES-> TRYING AGAIN")
        for j in range(max_tries):
            print("Try: "+str(j+1)+"/"+str(max_tries))
            time.sleep(5)
            data = urlopen(url, None, timeout=120).read()
            parsed = ET.fromstring(data)
            if parsed.find("./"+XML_ENTRIES):
                entries_list+=parse_to_list(parsed)
                break
                
    if not parsed.find("./"+XML_ENTRIES):
        print("\tTERMINATED")
        print(url)
        break 


end_time = time.time()
print("->\tRunning time is {}s".format(round(end_time-start_time,2)))

	>>> Downloading chunk 1 from 999999999 with 18000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
	>>> Downloading chunk 3 from 217 with 20000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
Try: 2/5
Try: 3/5
	>>> Downloading chunk 4 from 217 with 21000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
Try: 2/5
Try: 3/5
	>>> Downloading chunk 5 from 217 with 22000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
Try: 2/5
Try: 3/5
Try: 4/5
	>>> Downloading chunk 6 from 217 with 23000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
Try: 2/5
Try: 3/5
	>>> Downloading chunk 7 from 217 with 24000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
	>>> Downloading chunk 8 from 217 with 25000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
Try: 2/5
	>>> Downloading chunk 9 from 217 with 26000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
Try: 2/5
	>>> Downloading chunk 10 from 217 with 27000 elements	NO ENTRIES-> TRYING AGAIN
Try: 1/5
Try: 2/5
	>>> Downloading chunk 11 from 217 with 28000 elements	NO ENTRIES-> TRYING AGAIN
Try

In [37]:
len(entries_list)

18000

In [31]:
url

'http://export.arxiv.org/api/query?search_query=cat:cs*&start=5000&max_results=1000&sortBy=lastUpdatedDate&sortOrderdescending'

In [23]:
HEADERS = ['id', 'updated', 'published', 'title', 'summary', 'authors', 'affiliations', 'doi', 'journal_ref', 'pdf_link', 'primary_category', 'categories']

def sanitize_str(s):
    if s:
        s = re.sub('[\r\n]+', ' ', s)
        s = re.sub(' +', ' ', s)
        s = re.sub('\$(.|\n)+?\$', '', s)
        s = re.sub('\$\$(.|\n)+?\$\$', '', s)
        s = re.sub('\(', '', s)
        s = re.sub('\s\s+', ' ', s)
        
        return s.strip()
    return s

def create_df(entries):
    df = pd.DataFrame(entries_list, columns=HEADERS)
    df['authors'] = df['authors'].apply(lambda x: '|'.join(x))
    df['affiliations'] = df['affiliations'].apply(lambda x: '|'.join(x))
    df['categories'] = df['categories'].apply(lambda x: '|'.join(x))
    return df.applymap(sanitize_str)

def write_df(entries_df, filename):
    filename = filename+'.csv.bz2'

    if not os.path.exists(filename):
        print('Creating file: '+filename)
        entries_df.to_csv(filename, index=False, compression='bz2', mode='w')
    else:
        print('Writing file: '+filename)
        entries_df.to_csv(filename, index=False, compression='bz2', mode='a', header=False)
        
    print('Done!')
    
def pickle_data(entries, filename):
    pd.set_option('display.max_columns', None)
    filename = "pickles/"+filename+'.pickle.bz2'
    df = pd.DataFrame(entries_list, columns=HEADERS)
    df.summary = df.summary.map(sanitize_str)
    print("Done!",filename)
    return df.to_pickle(filename, compression='bz2')
    

In [10]:
ARXIV_DATA_FILENAME = 'arxiv_data_'+str(int(time.time()))

df = create_df(entries_list)
write_df(df, ARXIV_DATA_FILENAME)

Creating file: arxiv_data_1560013031.csv.bz2
Done!


In [24]:
ARXIV_DATA_PICKLE = 'arxiv_data_'+str(int(time.time()))

pickle_data(entries_list, ARXIV_DATA_PICKLE)

Done! pickles/arxiv_data_1560034060.pickle.bz2
