# Arxiv Collector for OpenTLDR (Update)
This notebook pulls raw data from the internet and formats it as a .json file for OpenTLDR ingest.
The output .json file can be shared with multiple OpenTLDR instances through a shared file system or an S3 bucket.


In [None]:

import os
import logging
from datetime import datetime, date

from dotenv import load_dotenv
load_dotenv()

from RssFeed import RssFeed
from RepoWriter import RepoWriter


## Parameters
Parameters can be passed into the notebook using OpenTLDR Workflows for automation.
The values in the cell below are the defaults for this collector; they are used unless the Workflow overrides them.

In [None]:
# Parameters
# Default values for this collector. Every name below is read by later cells,
# and a Workflow may override any of them before those cells run.

# The path where this agent should write the .json files for content
# (None when LIVE_DATA_REPO_PATH is unset; a later cell then falls back to ../Data/Live)
live_data_repo_path = os.getenv('LIVE_DATA_REPO_PATH')

# Whether to download the full source document for each entry, and where to cache it
# (None when MEDIA_CACHE_PATH is unset; a later cell falls back to ../Data/MediaCache
# when download_media is True)
download_media = False
media_cache_path = os.getenv('MEDIA_CACHE_PATH')
# Format to download when download_media is True; the download step handles
# 'pdf', 'html' and 'latex'
arxiv_file_format = "pdf"

# Name of Source to use in OpenTLDR objects
rss_source = "arxiv"
# When True, rss_source is appended to live_data_repo_path as a sub-directory
organize_by_source = True

# URLs for the feeds to scrape for this Source
# All entries are commented out, so by default only the search-term queries
# appended by a later cell are fetched.
# NOTE(review): cs.CR appears twice in the commented-out list below.
rss_feed_urls = [
#    "http://rss.arxiv.org/rss/cs.AI",
#    "http://rss.arxiv.org/rss/cs.CR",
#    "http://rss.arxiv.org/rss/cs.AR",
#    "http://rss.arxiv.org/rss/cs.CV",
#    "http://rss.arxiv.org/rss/cs.CR",
#    "http://rss.arxiv.org/rss/cs.DL",
#    "http://rss.arxiv.org/rss/cs.ET",
#    "http://rss.arxiv.org/rss/cs.MA",
#    "http://rss.arxiv.org/rss/cs.NE",
#    "http://rss.arxiv.org/rss/cs.SC",
]

# Free-text queries; each one becomes an arXiv API query URL added to rss_feed_urls
search_terms = [
    "Recent regulatory initiatives like the European AI Act and relevant voices in the Machine Learning (ML) community",
    "The Role of Governments in Increasing Interconnected Post-Deployment Monitoring of AI",
    "Voice-Enabled AI Agents can Perform Common Scams",
]

# Filters - excludes some content
# max_search_results: used as max_results in each search-term query URL
max_search_results = 10
# date: reference day as "YYYY-MM-DD"; None means "now".
# NOTE(review): this rebinds the name `date` imported from datetime in the first cell.
date:str = None
# days_history: number of days before the reference day to include
days_history = 9999

# Some sites verify recent user_agent configurations
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"

# Logging level ranges are (from least to most verbose): ERROR, WARN, INFO, DEBUG
logging_level = logging.INFO

# When True, print a confirmation line after each media download attempt
verbose = True


# TEMP: overrides the "arxiv" default assigned above, so content is written and
# labelled under "cyber".
# NOTE(review): looks like a leftover debugging override - confirm before relying on it.
rss_source="cyber"

## Process Parameters
Parameters are passed in as strings, so some of them need to be converted to other object types.

In [None]:
# Set the Data Filtering
# Note - this is mostly useful for search term based filtering
#
# Computes the collection window used by the reader:
#   today     - reference day (the `date` parameter, or now when it is unset)
#   last_read - earliest day of interest (`days_history` days before `today`)

from datetime import datetime, timedelta
import pytz  # not used in this cell; kept in case later cells rely on it

# Use the supplied "YYYY-MM-DD" date when there is one, otherwise default to now.
# A truthiness test (rather than `is not None`) lets an empty string fall back
# to now instead of failing inside strptime.
today = datetime.strptime(date, "%Y-%m-%d") if date else datetime.now()

# Parameters may arrive as strings, so convert before doing date arithmetic
days_history = int(days_history)

# Look back `days_history` days from the reference day
last_read = today - timedelta(days=days_history)

print("Today is {}, will attempt to download content since {}.".format(today.strftime('%Y-%m-%d'),last_read.strftime('%Y-%m-%d')))

In [None]:
# add search terms to rss_feeds list
# Each search term becomes one arXiv API query URL, sorted by relevance and
# limited to max_search_results entries.

from urllib.parse import quote_plus

for search_term in search_terms:
    # quote_plus encodes spaces as "+" (as before) and additionally
    # percent-encodes characters such as "&", "#", "(" and ")" that would
    # otherwise corrupt the query string.
    search_url = (
        "http://export.arxiv.org/api/query"
        f"?search_query={quote_plus(search_term)}"
        f"&sortBy=relevance&start=0&max_results={max_search_results}"
    )

    # Skip URLs that are already queued so re-running this cell does not
    # fetch the same query twice.
    if search_url not in rss_feed_urls:
        rss_feed_urls.append(search_url)

In [None]:
# Resolve the directory that receives the .json content files:
# fall back to ../Data/Live when no path was configured, then optionally
# add a per-source sub-directory.
if live_data_repo_path is None:
    live_data_repo_path = os.path.join("..","Data","Live")
if organize_by_source:
    live_data_repo_path = os.path.join(live_data_repo_path, rss_source)

print ("Writing Content to: {}".format(live_data_repo_path))

# The media cache is only resolved and reported when downloads are enabled.
if download_media:
    if media_cache_path is None:
        media_cache_path = os.path.join('..','Data','MediaCache')

    print ("Cacheing Media files to: {}".format(media_cache_path))
    print ("Downloading {} formated files.".format(arxiv_file_format))

In [None]:
# Ensure that these paths exist and create them if they do not.
# exist_ok=True keeps the cell from failing if another process creates the
# directory between the existence check and the makedirs call.

if not os.path.exists(live_data_repo_path):
    print ("Creating directory: {}".format(live_data_repo_path))
    os.makedirs(live_data_repo_path, exist_ok=True)

# The media cache directory is only needed when downloads are enabled
if download_media and not os.path.exists(media_cache_path):
    print ("Creating directory: {}".format(media_cache_path))
    os.makedirs(media_cache_path, exist_ok=True)

# Process Arxiv RSS Feed
This collector pulls entries from the RSS Feed and structures them for OpenTLDR Ingest.
There are several Arxiv-specific clean-up and extra steps (e.g., download the referenced pdf file).

In [None]:
# Fetch every configured feed, clean up each entry, write it to the content
# repo and (optionally) cache the referenced full document.

# Reader: fetches feed entries for this source within the configured date window
reader = RssFeed(rss_source, user_agent=user_agent)
reader.set_filter(today=today, earliest=last_read)
reader.set_log_level(logging_level)
reader.archive = os.path.join(live_data_repo_path,"rss_archive",datetime.now().strftime("%Y-%m-%d"))

# Writer: persists entries under the content repo path
writer = RepoWriter(live_data_repo_path)
writer.set_log_level(logging_level)

# File extension used when caching each supported arxiv_file_format.
# NOTE(review): arXiv's /src/ endpoint usually serves a gzipped tarball, hence
# "tar.gz" for latex (this was previously saved as ".html") - confirm that
# downstream consumers agree with this extension.
media_extensions = {"pdf": "pdf", "html": "html", "latex": "tar.gz"}

for feed in rss_feed_urls:
    try:
        for entry in reader.fetch_feed(feed, type='technical paper'):

            # Clean up source-specific issues: drop the leading "Abstract:" label.
            # maxsplit=1 keeps the remaining text intact even if the marker
            # occurs again later in the abstract.
            if entry['text'].startswith("\nAbstract:"):
                entry['text'] = entry['text'].split("\nAbstract:", 1)[1].strip()

            # Source-specific media collection - we record them all but only fetch what is required.
            # Only the "/abs/" path segment is rewritten so that an "abs"
            # substring elsewhere in the URL is left untouched.
            media_urls = {
                "pdf": entry['url'].replace("/abs/", "/pdf/", 1),
                "html": entry['url'].replace("/abs/", "/html/", 1),
                "latex": entry['url'].replace("/abs/", "/src/", 1),
            }
            entry['metadata']['full_content_pdf'] = media_urls["pdf"]
            entry['metadata']['full_content_html'] = media_urls["html"]
            entry['metadata']['full_content_latex'] = media_urls["latex"]

            # Write out a content node
            repo_uid = writer.write_content(entry)

            # Some entries are skipped (repo_uid == None); nothing to cache for
            # those, nor when media downloads are disabled.
            if repo_uid is None or not download_media:
                continue

            # Report an unknown format instead of silently claiming success
            if arxiv_file_format not in media_extensions:
                print ("Unsupported arxiv_file_format {!r}; no media downloaded for {}".format(arxiv_file_format, repo_uid))
                continue

            try:
                media_path = os.path.join(
                    media_cache_path,
                    "{}.{}".format(repo_uid, media_extensions[arxiv_file_format]),
                )
                writer.http_copy(media_urls[arxiv_file_format], media_path)

                if verbose:
                    print ("Downloaded media {} files for {}".format(arxiv_file_format, repo_uid))

            except Exception as e:
                # Best effort: a failed download must not stop the remaining entries
                print ("Failed to download media {} file for {}: {}".format(arxiv_file_format, repo_uid, e))

    except Exception as e:
        # Best effort: a failing feed is reported and the next feed is processed
        print ("RSS Feed {} had error: {}".format(feed, e))
    