# Arxiv Collector for OpenTLDR (Update)
This notebook pulls raw data from the internet and formats it for OpenTLDR ingest as a .json file.
The output json file can be shared with multiple OpenTLDR instances via a file system or an S3 bucket.


In [None]:

import os
import logging
from datetime import datetime, date

from dotenv import load_dotenv
load_dotenv()

from RssFeed import RssFeed
from RepoWriter import RepoWriter


## Parameters
Parameters can be passed into the notebook using OpenTLDR Workflows for automation.
The values in the cell below are this collector's defaults; they are used unless overridden by the Workflow.

In [None]:
# Parameters

# The path where this agent should write the .json files for content.
# Taken from the LIVE_DATA_REPO_PATH environment variable; None when it is unset.
live_data_repo_path = os.getenv('LIVE_DATA_REPO_PATH')

# Name of Source to use in OpenTLDR objects
rss_source = "arxiv"

# When True, output is written to a sub-folder named after rss_source
organize_by_source = False

# URLs for the feeds to scrape for this Source.
# Each feed is listed once so that no feed is fetched twice in a single run.
rss_feed_urls = [
    "http://rss.arxiv.org/rss/cs.AI",
    "http://rss.arxiv.org/rss/cs.CR",
    "http://rss.arxiv.org/rss/cs.AR",
    "http://rss.arxiv.org/rss/cs.CV",
    "http://rss.arxiv.org/rss/cs.DL",
    "http://rss.arxiv.org/rss/cs.ET",
    "http://rss.arxiv.org/rss/cs.MA",
    "http://rss.arxiv.org/rss/cs.NE",
    "http://rss.arxiv.org/rss/cs.SC",
]

# Some sites verify recent user_agent configurations
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"

# When True, the PDF and HTML versions of each written entry are downloaded as well
download_media = True

# Logging levels, from least to most verbose: ERROR, WARN, INFO, DEBUG
logging_level = logging.INFO

# level of unnecessary output
verbose = True


# Process Arxiv RSS Feed
This collector pulls entries from the RSS Feed and structures them for OpenTLDR Ingest.
There are several Arxiv-specific clean-up and extra steps (e.g., downloading the referenced PDF and HTML files).

In [None]:

reader = RssFeed(rss_source)

# Determine how to write out data: fall back to the default location when no
# path was configured.
if live_data_repo_path is None:
    live_data_repo_path = os.path.join("..", "Data", "Live")

# Optionally group the output in a sub-folder named after the source.
if organize_by_source:
    write_path = os.path.join(live_data_repo_path, rss_source)
else:
    write_path = live_data_repo_path

print("Writing out json objects to: {}".format(write_path))
writer = RepoWriter(write_path)

#reader.archive = os.path.join(live_data_repo_path,"rss_archive",datetime.now().strftime("%Y-%m-%d"))

# For Debugging
reader.set_log_level(logging_level)
writer.set_log_level(logging_level)

for feed in rss_feed_urls:
    for entry in reader.fetch_feed(feed, type='technical paper'):

        # Clean up source-specific issues: keep only the text that follows the
        # "Abstract:" marker. An entry without the marker keeps its text as-is
        # rather than aborting the whole run with an IndexError.
        text_parts = entry['text'].split("\nAbstract:")
        if len(text_parts) > 1:
            entry['text'] = text_parts[1].strip()

        # Source-specific media collection: derive the PDF and HTML locations
        # from the entry URL. Only the first "/abs/" path segment is rewritten
        # so that an "abs" substring elsewhere in the URL is left untouched.
        source_pdf_url = entry['url'].replace("/abs/", "/pdf/", 1)
        entry['metadata']['full_content_pdf'] = source_pdf_url

        source_html_url = entry['url'].replace("/abs/", "/html/", 1)
        entry['metadata']['full_content_html'] = source_html_url

        # Write out a content node
        content_hash = writer.write_content(entry)

        # Some entries are skipped (content_hash is None), otherwise download
        # the full PDF and HTML files into the entry's media folder.
        if content_hash is not None and download_media:
            try:
                media_dir = os.path.join(write_path, entry['date'], "media")
                pdf_media_path = os.path.join(media_dir, "{}.pdf".format(content_hash))
                html_media_path = os.path.join(media_dir, "{}.html".format(content_hash))

                # exist_ok avoids the check-then-create race of a separate
                # os.path.exists() test.
                os.makedirs(media_dir, exist_ok=True)

                print("Downloading media files for {}".format(content_hash))
                writer.http_copy(source_pdf_url, pdf_media_path)
                writer.http_copy(source_html_url, html_media_path)

            except Exception as err:
                # Best effort: a failed download must not stop the collection,
                # but Ctrl-C / SystemExit are no longer swallowed.
                print("Failed to download media files for {}: {}".format(content_hash, err))

  