In [1]:
%matplotlib inline

# arxivpy is a specialized library to extract the very nested XML that arxiv provides.  
# https://github.com/titipata/arxivpy
# One dependency for arxivpy is feedparser, but be very careful when installing it with conda: it may try to force a
# Python downgrade.  See https://github.com/ContinuumIO/anaconda-issues/issues/10221#issuecomment-433100755 for a
# command-line option that stops forced downgrades.  (Supposedly fixed in more recent conda installs, but I still have
# this issue on Mac.)
import arxivpy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time

ModuleNotFoundError: No module named 'arxivpy'

In [None]:
# Create the empty DataFrame that accumulates downloaded articles; a later cell adds each
# batch of query results to it, so this cell only needs to run once per session.
df = pd.DataFrame()

In [None]:
# This query fetches a small sample so that a user can test the other notebooks.  Collecting the entire dataset
# that I'm using takes days, plus a lot of work, patience, and troubleshooting.  If I'm able to find a service
# that can host my data (~250mb), a link will be provided in the project README.
#
# Download 1,000 articles from the relevant subsections of arXiv.  (Max results per iteration = 2000, but they
# prefer less.)
# NOTE: If you want to download more than the 1k articles requested here, you'll need to plot your downloads.
# ArXiv has the flakiest API, and arxivpy prevents you from knowing when it fails.  You'll know this code has
# failed because it will produce an empty df.  Just try again in a few minutes.
astro_categories = [
    'astro-ph.CO',
    'astro-ph.GA',
    'astro-ph.EP',
    'astro-ph.HE',
    'astro-ph.IM',
    'astro-ph.SR',
]
articles = arxivpy.query(
    search_query=astro_categories,
    start_index=0,
    max_index=999,
    results_per_iteration=1000,
    wait_time=5.0,
    sort_by='lastUpdatedDate',
)

In [None]:
# Convert the downloaded articles into a DataFrame and add them to the running dataset.
data = pd.DataFrame.from_dict(articles)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement.
# Empty frames are skipped so the initial empty df does not influence the resulting column dtypes.
frames = [frame for frame in (df, data) if not frame.empty]
df = pd.concat(frames, ignore_index=True) if frames else data

# These steps are necessary if you need to rerun the data on a non-empty df:
# drop rows that were downloaded twice, order newest-first, and rebuild a clean 0..n-1 index.
df = (
    df.drop_duplicates()
      .sort_values('update_date', ascending=False)
      .reset_index(drop=True)
)

In [None]:
# Plot update_date against the row index to check for unexpected gaps
fig, ax = plt.subplots(figsize=(12, 8))
df.update_date.plot(ax=ax)
ax.set_xlabel('Index value')
ax.set_ylabel('Update date')
ax.set_title('Index vs date\n(check for non-weekend holes)');

In [None]:
# Print the per-column summary (dtype and non-null count) of df.
# There should be zero null values.
df.info()

In [None]:
# Check the structure of the entries by displaying a single row (position 500)
# NOTE: journal_ref will be removed in next notebook as all entries are 'No journal ref found.'
# NOTE(review): raises IndexError if df holds fewer than 501 rows (e.g. after a failed or
# partial download) -- confirm the row count from df.info() first.
df.iloc[500]

In [None]:
# Save to csv in order to properly run future notebooks
# NOTE(review): to_csv writes the index as an extra first column by default; presumably the
# later notebooks expect that layout -- verify before changing.
df.to_csv('astro_yourtestdata_1k.csv')