In [3]:
# ! pip install boto3

Collecting boto3
  Downloading boto3-1.28.47-py3-none-any.whl (135 kB)
     -------------------------------------- 135.8/135.8 kB 1.3 MB/s eta 0:00:00
Collecting botocore<1.32.0,>=1.31.47
  Downloading botocore-1.31.47-py3-none-any.whl (11.2 MB)
     ---------------------------------------- 11.2/11.2 MB 2.4 MB/s eta 0:00:00
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.2-py3-none-any.whl (79 kB)
     ---------------------------------------- 79.8/79.8 kB 2.2 MB/s eta 0:00:00
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.28.47 botocore-1.31.47 jmespath-1.0.1 s3transfer-0.6.2



[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import boto3, configparser, os, botocore, numpy as np, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import xml.etree.ElementTree as ET, tarfile, oauth2client
from enum import Enum 
import os, gzip, shutil

In [2]:
s3resource = None

def setup():
    """Creates S3 resource & sets configs to enable download."""

    # Securely import configs from private AWS config file
    configs = configparser.ConfigParser()
    configs.read('config.ini')

    # Create S3 resource & set configs
    global s3resource
    s3resource = boto3.resource(
        's3',  # the AWS resource we want to use
        aws_access_key_id=configs['DEFAULT']['ACCESS_KEY'],
        aws_secret_access_key=configs['DEFAULT']['SECRET_KEY'],
        region_name='us-east-1'  # same region the arxiv bucket is in
    )
    
setup()

In [3]:
def download_file(key):
    """
    Downloads given filename from source bucket to destination directory.

    Parameters
    ----------
    key : str
        Name of file to download
    """

    # Ensure src directory exists 
    if not os.path.isdir('src_files'):
        os.makedirs('src_files')
    
    print('Downloading s3://arxiv/{}'.format(key))
    
    # Download file
    try:
        s3resource.meta.client.download_file(
            Bucket='arxiv', 
            Key=key,  # name of key to download from
            Filename=key,  # path to file to download to
            ExtraArgs={'RequestPayer':'requester'})
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('ERROR: ' + key + " does not exist in arxiv bucket")
            
    print('Successfully downloaded s3://arxiv/{} to {}'.format(key, key))

In [4]:
download_file('src/arXiv_src_manifest.xml')

Downloading s3://arxiv/src/arXiv_src_manifest.xml.gz
ERROR: src/arXiv_src_manifest.xml.gz does not exist in arxiv bucket
Successfully downloaded s3://arxiv/src/arXiv_src_manifest.xml.gz to src/arXiv_src_manifest.xml.gz


In [3]:
class XMLTagsUpperLevel:
    """
    This class defines the XML tag constants at the higher level of XML tree. The tag <file> is found below the root tag
    <arXivSRC> in the tree hierarchy.
    """
    FILE = "file"
    
class XMLTagsLowerLevel(Enum):
    """
    This class defines all the XML tag constants that are one level below the <file> tag. This is defined as an
    enumerated type for ease of iterating over all tags.
    """
    CONTENT_MD5SUM = "content_md5sum"
    FILENAME = "filename"
    FIRST_ITEM = "first_item"
    LAST_ITEM = "last_item"
    MD5SUM = "md5sum"
    NUM_ITEMS = "num_items"
    SEQ_NUM = "seq_num"
    SIZE = "size"
    TIMESTAMP = "timestamp"
    YYMM = "yymm"

class XMLParser:
    def __init__(self, file_path):
        """
        Initializes the XMLParser class instance.
        :param file_path: Path to input xml file containing all the jobs data.
        """
        self.file_path = file_path


    def xml_to_pandas_df(self):
        """
        Using the standard xml python library, we parse the data xml file and convert the xml data to a pandas
        data frame.
        :return: A pandas data frame instance containing all the manifest data.
        """
        tree = ET.parse(self.file_path)
        root = tree.getroot()

        manifest_data = dict()
        for tag in XMLTagsLowerLevel:
            manifest_data[tag.value] = []
    
        for i, record in enumerate(root.findall(XMLTagsUpperLevel.FILE)):
            for tag in XMLTagsLowerLevel:
                temp = record.find(tag.value)
                if temp is not None:
                    manifest_data[tag.value].append(temp.text)
                else:
                    manifest_data[tag.value].append("")

        return pd.DataFrame(data=manifest_data)
    

In [5]:
parser = XMLParser("src/arXiv_src_manifest.xml")
manifest_src = parser.xml_to_pandas_df()
manifest_src

Unnamed: 0,content_md5sum,filename,first_item,last_item,md5sum,num_items,seq_num,size,timestamp,yymm
0,cacbfede21d5dfef26f367ec99384546,src/arXiv_src_0001_001.tar,astro-ph0001001,quant-ph0001119,949ae880fbaf4649a485a8d9e07f370b,2364,1,225605507,2010-12-23 00:13:59,0001
1,d90df481661ccdd7e8be883796539743,src/arXiv_src_0002_001.tar,astro-ph0002001,quant-ph0002094,4592ab506cf775afecf4ad560d982a00,2365,1,227036528,2010-12-23 00:18:09,0002
2,3388afd7bfb2dfd9d3f3e6b353357b33,src/arXiv_src_0003_001.tar,astro-ph0003001,quant-ph0003151,b5bf5e52ae8532cdf82b606b42df16ea,2600,1,230986882,2010-12-23 00:22:15,0003
3,46abb309d77065fed44965cc26a4ae2e,src/arXiv_src_0004_001.tar,astro-ph0004001,quant-ph0004109,9bf1b55890dceec9535ef723a2aea16b,2076,1,191559408,2010-12-23 00:26:31,0004
4,ea665c7b62eaac91110fa344f6ba3fc4,src/arXiv_src_0005_001.tar,astro-ph0005001,quant-ph0005134,b49af416746146eca13c5a6a76bc7193,2724,1,255509072,2010-12-23 00:30:11,0005
...,...,...,...,...,...,...,...,...,...,...
6798,e5aab1cf90112dca29e4adbb47e8ec13,src/arXiv_src_9908_001.tar,adap-org9908001,solv-int9908009,85ba88687c4485af297d04680637f84e,2139,1,181028024,2010-12-22 23:52:48,9908
6799,1ed0bc87e6b40f64b81295f5652f9938,src/arXiv_src_9909_001.tar,adap-org9909001,solv-int9909028,9d3a7f34f1ac44ad3f495201e24dac90,2500,1,213668821,2010-12-22 23:56:18,9909
6800,4aca046629928ca62367cd65a91a9722,src/arXiv_src_9910_001.tar,adap-org9910001,solv-int9910012,2f3cb5f12de7f8d7051bff7d0efdf2dd,2571,1,244741133,2010-12-23 00:00:30,9910
6801,eba14a08d08e8c356c4d1cdafef68035,src/arXiv_src_9911_001.tar,adap-org9911001,solv-int9911009,af128903c94a5db15b6b5bfbe472ca2f,2517,1,221549097,2010-12-23 00:05:11,9911


In [6]:
parser = XMLParser("src/arXiv_pdf_manifest.xml")
manifest_pdf = parser.xml_to_pandas_df()
manifest_pdf

Unnamed: 0,content_md5sum,filename,first_item,last_item,md5sum,num_items,seq_num,size,timestamp,yymm
0,07234b3a03a15391c5cf7ce4905c136f,pdf/arXiv_pdf_0001_001.tar,astro-ph0001001,math0001014,16857799a6b343586691b56eace826fb,1797,1,524304163,2019-05-22 03:20:55,0001
1,7afc83808b796d094d608f66ffed4ab2,pdf/arXiv_pdf_0001_002.tar,math0001015,quant-ph0001119,61bb68ffc93b0299be2dad57adc0f740,548,2,130295764,2019-05-22 03:22:22,0001
2,303216bab46f6a95c1d5996fd77d3c7c,pdf/arXiv_pdf_0002_001.tar,astro-ph0002001,hep-th0002228,3d493bcef617c34ae41e281a948cf3a9,1737,1,524361094,2019-05-22 03:26:00,0002
3,1f09f5c979a6ef91de1465120be74fb6,pdf/arXiv_pdf_0002_002.tar,hep-th0002229,quant-ph0002094,15793dbd9724ee65916765f4f89aaa41,614,2,145833079,2019-05-22 03:28:15,0002
4,5b963460adabef5149c380f7e8217f38,pdf/arXiv_pdf_0003_001.tar,astro-ph0003001,hep-th0003235,ce91d493c6fec721e2cad2cda093c7fc,1815,1,524621983,2019-05-22 03:31:08,0003
...,...,...,...,...,...,...,...,...,...,...
6428,af2e9e23f244eda4469a4ff09438516a,pdf/arXiv_pdf_9910_002.tar,hep-th9910111,solv-int9910012,d04ada968e06ecb4444eb23e5173ba08,688,2,159880720,2019-05-22 03:04:34,9910
6429,0ca596e581ad3df2a5ae0b7baac1334e,pdf/arXiv_pdf_9911_001.tar,adap-org9911001,math9911095,8b72b633a607ee43361a3b950b8c6984,1965,1,524330416,2019-05-22 03:09:16,9911
6430,2fc1f3bcf18cb1fe4b007c1a089c1c49,pdf/arXiv_pdf_9911_002.tar,math9911096,solv-int9911009,e8064b5bdb58db5db8cec292c5db4ce7,523,2,114035934,2019-05-22 03:10:12,9911
6431,3a7801e69106762b48364a1e5a50c4fa,pdf/arXiv_pdf_9912_001.tar,adap-org9912001,hep-th9912216,32301619517654a713e2d9cbb6ff2350,1929,1,524426286,2019-05-22 03:16:26,9912


In [7]:
# last updated
with open('src/arXiv_src_manifest.xml', 'r') as manifest:
    soup = BeautifulSoup(manifest, 'lxml-xml')
    timestamp = soup.arXivSRC.find('timestamp', recursive=False).string
    print('Manifest was last edited on ' + timestamp)

Manifest was last edited on Thu Sep  7 05:37:20 2023


In [14]:
# total files and size in GB for source files
print(str(len(manifest_src)) + " files")
manifest_src['size'] = pd.to_numeric(manifest_src['size'])
print(str(round(manifest_src['size'].sum() / 1000000000, 2)) + ' GB')

6803 files
3591.92 GB


In [9]:
# total files and size in GB for pdf files
print(str(len(manifest_pdf)) + " files")
manifest_pdf['size'] = pd.to_numeric(manifest_pdf['size'])
print(str(round(manifest_pdf['size'].sum() / 1000000000, 2)) + ' GB')

6433 files
3344.62 GB


In [10]:
# timestamp of oldest and most recent file
manifest_src['timestamp'] = pd.to_datetime(manifest_src['timestamp'])
print('Oldest file was uploaded on ' + manifest_src['timestamp'].min().strftime('%m/%d/%Y'))
print('Most recent file was uploaded on ' +  manifest_src['timestamp'].max().strftime('%m/%d/%Y'))

Oldest file was uploaded on 12/22/2010
Most recent file was uploaded on 09/07/2023
