# Data acquisition

In [1]:
# render matplotlib figures inline in the notebook output
%matplotlib inline

# silence every warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# data manipulation
import pandas as pd

# request modules
# NOTE(review): none of these are referenced directly in the visible cells;
# presumably they are needed by scraper.py, which is executed with `%run -i`
# inside this namespace — confirm before removing any of them
import time
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# set the default matplotlib font size (no plotting style is changed here)
plt.rcParams.update({'font.size': 15})

In [2]:
# Execute the helper script so the names it defines become available here
# (presumably including the Scraper class used in the next cell).
# The -i flag runs the script inside this notebook's namespace, so names that
# already exist here are visible to the script.
%run -i '../src/helper/scraper.py'

In [3]:
# Create a single Scraper instance; the same object serves every
# get_data() call in the cells that follow.
scraper = Scraper()

In [4]:
# Harvest the arXiv 'physics:cond-mat' set. Per the log below, this issues
# OAI ListRecords requests to export.arxiv.org for 2010-01-01..2014-12-31 and
# then follows resumption tokens page by page, so the cell is slow and
# needs network access.
df_condmat = scraper.get_data(arxiv='physics:cond-mat')

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2010-01-01&until=2014-12-31&metadataPrefix=arXiv&set=physics:cond-mat
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|1001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|2001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|3001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|4001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|5001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|6001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|7001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|8001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|9001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977288|10001
fetching http://export.arxiv.

In [5]:
# Harvest the arXiv 'physics:astro-ph' set (same date window and paging as
# shown in the log below). In the logged run the first request got a 503 and
# was retried after 5 seconds.
df_astro = scraper.get_data(arxiv='physics:astro-ph')

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2010-01-01&until=2014-12-31&metadataPrefix=arXiv&set=physics:astro-ph
Got 503. Retrying after 5 seconds.
fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2010-01-01&until=2014-12-31&metadataPrefix=arXiv&set=physics:astro-ph
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977353|1001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977353|2001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977353|3001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977353|4001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977353|5001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977353|6001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977353|7001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977353|8001
fetching http://export.arxiv.org/

In [6]:
# Harvest the arXiv 'physics:gr-qc' set (same date window and paging as
# shown in the log below). In the logged run the first request got a 503 and
# was retried after 5 seconds.
df_gr = scraper.get_data(arxiv='physics:gr-qc')

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2010-01-01&until=2014-12-31&metadataPrefix=arXiv&set=physics:gr-qc
Got 503. Retrying after 5 seconds.
fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2010-01-01&until=2014-12-31&metadataPrefix=arXiv&set=physics:gr-qc
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977393|1001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977393|2001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977393|3001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977393|4001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977393|5001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977393|6001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977393|7001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977393|8001
fetching http://export.arxiv.org/oai2?v

In [7]:
# Harvest the arXiv 'physics:hep-th' set (same date window and paging as
# shown in the log below). In the logged run the first request got a 503 and
# was retried after 5 seconds.
df_hepth = scraper.get_data(arxiv='physics:hep-th')

fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2010-01-01&until=2014-12-31&metadataPrefix=arXiv&set=physics:hep-th
Got 503. Retrying after 5 seconds.
fetching http://export.arxiv.org/oai2?verb=ListRecords&from=2010-01-01&until=2014-12-31&metadataPrefix=arXiv&set=physics:hep-th
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977411|1001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977411|2001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977411|3001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977411|4001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977411|5001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977411|6001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977411|7001
fetching http://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=5977411|8001
fetching http://export.arxiv.org/oai2

In [10]:
# Tag each harvested frame with the short name of the arXiv set it was fetched
# from, then stack the frames row-wise in fetch order. The assignment is done
# in place on purpose: it sets (or overwrites) the 'categories' column on the
# per-set frames themselves, which later cells still inspect.
# NOTE(review): pd.concat keeps each frame's own index labels, so labels may
# repeat in `df` — confirm whether ignore_index=True is wanted.
frames_by_set = {
    'cond-mat': df_condmat,
    'astro-ph': df_astro,
    'gr-qc': df_gr,
    'hep-th': df_hepth,
}
for set_name, frame in frames_by_set.items():
    frame['categories'] = set_name

df = pd.concat(list(frames_by_set.values()), sort=False)

In [11]:
# Persist the combined frame without its index. The file is TAB-separated
# despite the .csv extension, so readers must pass sep='\t' when loading it.
df.to_csv('../data/raw_data.csv', sep = '\t', index=False)

In [14]:
# Quick size check: (rows, columns) of the hep-th frame
df_hepth.shape

(29122, 8)