# US Dept of State - Collector for OpenTLDR (Update)
This notebook pulls raw data from the internet and formats it for OpenTLDR ingest as a .json file.
The output .json file can be shared with multiple OpenTLDR instances via a shared file system or an S3 bucket.

In this Collector folder are:
- setup_collector.sh - used to create the python virtual environment intended for this collector. This will include pip installing the modules in the requirements.txt file.
- execute.sh - this script can be called from a cronjob in order to automate the collection process.
- .env - this file should be modified for the environment you are running the collector in.

In [None]:
import os
import logging
import re

from dotenv import load_dotenv
load_dotenv()

from RssFeed import RssFeed
from RepoWriter import RepoWriter

In [None]:
# Parameters

# --- Source identity ---------------------------------------------------------
# Name of Source to use in OpenTLDR objects; when organize_by_source is True the
# output is written to a sub-directory named after the source.
rss_source = "state"
organize_by_source = True

# --- Output locations (both come from the environment / .env file) -----------
# Directory where this agent writes the .json content files
live_data_repo_path = os.getenv('LIVE_DATA_REPO_PATH')

# Directory where this agent temporarily writes full source (media) files
media_cache_path = os.getenv('MEDIA_CACHE_PATH')
download_media = False # State already provides full text in RSS (and it is cleaner)

# --- Feeds to scrape for this Source ------------------------------------------
# Every feed URL has the same shape, so only the topic slug is listed here.
_state_feed_slugs = (
    "press-releases",
    "secretarys-remarks",
    "africa",
    "arms-control-and-international-security",
    "climate-environment-and-conservation",
    "collected-department-releases",
    "counterterrorism",
    "democracy-human-rights-and-labor",
    "department-press-briefings",
    "diplomatic-security",
    "direct-line-to-american-business",
    "east-asia-and-the-pacific",
    "economic-energy-agricultural-and-trade-issues",
    "europe-and-eurasia",
    "international-expositions",
    "international-health-issues",
    "international-organizations",
    "law-enforcement-narcotics-anti-corruption",
    "near-east",
    "population-refugees-and-migration",
    "public-schedule",
    "south-and-central-asia",
    "trafficking-in-persons",
    "treaties-new",
    "western-hemisphere",
    "womens-issues",
)
rss_feed_urls = [
    "https://www.state.gov/rss-feed/{}/feed/".format(slug)
    for slug in _state_feed_slugs
]

# --- HTTP ---------------------------------------------------------------------
# Some sites verify recent user_agent configurations
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/128.0.0.0 Safari/537.36"
)

# --- History window -----------------------------------------------------------
# This RSS Feed keeps a fair amount of historic data that we want to filter out.
# `date` is a "YYYY-MM-DD" string, or None to use the current date.
date: str = None
days_history = 9999    # it seems like forever...

# --- Diagnostics --------------------------------------------------------------
# Logging levels, from least to most verbose: ERROR, WARN, INFO, DEBUG
logging_level = logging.INFO

# level of unnecessary output
verbose = True

## Process Parameters
Parameters are passed in as strings, so some of them need to be converted to other object types before use.


In [None]:
# Set the Data Filtering
#
# Computes the [last_read, today] window that is later handed to the reader's
# filter: `today` is the reference date and `last_read` is `days_history` days
# before it.

from datetime import datetime, timedelta
import pytz

# Default to today; an explicit "YYYY-MM-DD" date overrides it.
# An empty string is treated the same as None (no date supplied).
today = datetime.now()
if date:
    today = datetime.strptime(date, "%Y-%m-%d")

# Look back `days_history` days. Parameters may arrive as strings, so coerce to
# a number first (float keeps fractional-day values working as before).
last_read = today - timedelta(days=float(days_history))

print("Today is {}, will atttempt to download content since {}.".format(today.strftime('%Y-%m-%d'),last_read.strftime('%Y-%m-%d')))

In [None]:
# Resolve the directory that receives the .json files: fall back to the
# relative default when nothing was configured, then optionally nest the
# output under a per-source sub-directory.
live_data_repo_path = (
    os.path.join("..", "Data", "Live")
    if live_data_repo_path is None
    else live_data_repo_path
)
live_data_repo_path = (
    os.path.join(live_data_repo_path, rss_source)
    if organize_by_source
    else live_data_repo_path
)

print("Writing Content to: {}".format(live_data_repo_path))

# The media cache location only matters when media downloading is enabled.
if download_media:
    if media_cache_path is None:
        media_cache_path = os.path.join('..', 'Data', 'MediaCache')
    print("Cacheing Media files to: {}".format(media_cache_path))


In [None]:
# Ensure that these paths exist and create them if they do not.
# exist_ok=True keeps the creation safe if another process (e.g. a concurrent
# collector run) creates the directory between the check and the makedirs call.

if not os.path.exists(live_data_repo_path):
    print("Creating directory: {}".format(live_data_repo_path))
    os.makedirs(live_data_repo_path, exist_ok=True)

# The media cache directory is only needed when media downloading is enabled.
if download_media and not os.path.exists(media_cache_path):
    print("Creating directory: {}".format(media_cache_path))
    os.makedirs(media_cache_path, exist_ok=True)

# Process State RSS Feed
This collector pulls entries from the RSS Feed and structures them for OpenTLDR Ingest.

In [None]:
# Walk every configured feed, clean each entry, write it to the content repo
# and, when download_media is enabled, cache the media files it references.

# Reader: yields entries from a feed URL, limited to the [last_read, today] window.
reader = RssFeed(rss_source, user_agent=user_agent)
reader.set_filter(today=today, earliest=last_read)
reader.set_log_level(logging_level)
#reader.archive = os.path.join(live_data_repo_path,"rss_archive",datetime.now().strftime("%Y-%m-%d"))

# Writer: persists entries under live_data_repo_path.
writer = RepoWriter(live_data_repo_path)
writer.set_log_level(logging_level)

# DeptOfState uses full html as their description, so it includes some html markup.
# The pattern matches anything between angle brackets as well as named, decimal
# and hex character entities; every match is replaced with a single space.
# NOTE(review): entity matching is lowercase-only, so e.g. "&Eacute;" is left as-is.
htmlish_tags = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

for feed in rss_feed_urls:
    # A failure in one feed is reported and must not stop the remaining feeds.
    try:
        for entry in reader.fetch_feed(feed, type="press release"):

            # Clean up source-specific issues
            entry['text'] = htmlish_tags.sub(' ', entry['text'])
            entry['author'] = "US Dept of State"

            # Write out a content node
            # (presumably returns None when nothing was written - TODO confirm in RepoWriter)
            repo_uid = writer.write_content(entry)

            if repo_uid is None or not download_media:
                continue

            # Dept of State puts a clean version of their entire text content in the rss feed.
            # They are also pretty strict about blocking requests, so best to just not
            # download media unless you need to.
            try:
                for media_url in entry['metadata']['media']:
                    media_filename: str = writer.url_to_filename(media_url)
                    media_full_path = os.path.join(media_cache_path, media_filename)

                    if not os.path.exists(media_full_path):
                        print("Downloading media file {} for {}".format(media_full_path, repo_uid))
                        writer.http_copy(media_url, media_full_path)
                    else:
                        print("Already have media file {} for {}".format(media_full_path, repo_uid))

            except Exception as media_error:
                # Best effort: report which entry failed (and why) and keep going.
                print("Failed to download media file for {}: {}".format(repo_uid, media_error))

    except Exception as feed_error:
        print("RSS Feed {} had error: {}".format(feed, feed_error))