# ISW Collector for OpenTLDR (Update)
This notebook pulls raw data from the internet and formats it for OpenTLDR ingest as a .json file.
The output .json file can be shared with multiple OpenTLDR instances via a shared file system or an S3 bucket.

In [None]:

import os
import logging
import re

from dotenv import load_dotenv
load_dotenv()

from RssFeed import RssFeed
from RepoWriter import RepoWriter
from HtmlScrape import fetch_html, extract_text_from_element, url_to_filename

In [None]:
# Parameters
# NOTE(review): this looks like a notebook parameters cell whose values may be
# overridden by an external runner - confirm before restructuring it.

# The path where this agent should write the .json files for content
# (read from LIVE_DATA_REPO_PATH; a later cell substitutes a default when unset)
live_data_repo_path = os.getenv('LIVE_DATA_REPO_PATH')

# download_media: whether to also fetch the full source/media files per entry.
# media_cache_path: where those files are written temporarily (read from
# MEDIA_CACHE_PATH; only consulted when download_media is True)
download_media = False
media_cache_path = os.getenv('MEDIA_CACHE_PATH')

# Name of Source to use in OpenTLDR objects
rss_source = "isw"
# When True, content is written to a sub-directory named after rss_source
organize_by_source = True

# URLs for the feeds to scrape for this Source
# NOTE(review): one URL uses "feed-" and the other "feeds-" - verify both resolve
rss_feed_urls = [
    "https://www.understandingwar.org/feed-afghanistan.xml",
    "https://www.understandingwar.org/feeds-iraq.xml",
]

# Filters - excludes some content
# date: reference day as "YYYY-MM-DD"; None means use the current date
date:str = None
# days_history: how many days before the reference day to start collecting from
days_history = 4000 # Some older ISW entries have errors in the formatting...

# Some sites verify recent user_agent configurations
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"

# Logging level ranges are (from least to most verbose): ERROR, WARN, INFO, DEBUG
logging_level = logging.INFO

# level of unnecessary output
# NOTE(review): not referenced by the other cells visible in this notebook
verbose = True

## Process Parameters
Parameters are passed in as strings, so some of them need to be converted to other object types before use.



In [None]:
# Set the Data Filtering
# Computes the collection window: `today` is the reference day and `last_read`
# is the earliest day of interest, `days_history` days before it.
from datetime import datetime, timedelta
import pytz

# Default to the current local time; an explicit `date` ("YYYY-MM-DD")
# overrides it. An empty string is treated the same as "not supplied"
# instead of failing inside strptime.
today = datetime.now()
if date:
    today = datetime.strptime(date,"%Y-%m-%d")

# Look back `days_history` days from the reference day. The value is coerced
# to a number because parameters may be passed in as strings, which timedelta
# does not accept; float() keeps any fractional-day value intact.
last_read = today - timedelta(days=float(days_history))

print("Today is {}, will atttempt to download content since {}.".format(today.strftime('%Y-%m-%d'),last_read.strftime('%Y-%m-%d')))

In [None]:
# Resolve the destination for the .json content files, falling back to the
# relative default location when no path was configured.
if live_data_repo_path is None:
    live_data_repo_path = os.path.join("..","Data","Live")

# Optionally give each source its own sub-directory.
live_data_repo_path = (
    os.path.join(live_data_repo_path,rss_source)
    if organize_by_source
    else live_data_repo_path
)

print ("Writing Content to: {}".format(live_data_repo_path))

# The media cache location only matters when media downloading is enabled;
# apply the default there and report where the files will go.
if download_media:
    if media_cache_path is None:
        media_cache_path = os.path.join('..','Data','MediaCache')
    print ("Cacheing Media files to: {}".format(media_cache_path))


In [None]:
# Ensure that these paths exist and create them if they do not.
# exist_ok=True keeps the creation safe if another process creates the same
# directory between the existence check and the makedirs call (the output
# location may be shared between several instances).

if not os.path.exists(live_data_repo_path):
    print ("Creating directory: {}".format(live_data_repo_path))
    os.makedirs(live_data_repo_path, exist_ok=True)

# The media cache directory is only needed when media downloading is enabled.
if download_media and not os.path.exists(media_cache_path):
    print ("Creating directory: {}".format(media_cache_path))
    os.makedirs(media_cache_path, exist_ok=True)

# Process ISW RSS Feed
This collector pulls entries from the RSS Feed and structures them for OpenTLDR Ingest.

In [None]:
# Fetch every configured feed, clean each entry's text, write it out as a
# content record and (optionally) cache the source page and its media files.

# The reader is restricted to the [last_read, today] window; both reader and
# writer honour the `logging_level` parameter (no hard-coded override).
reader=RssFeed(rss_source, user_agent=user_agent)
reader.set_filter(today=today, earliest=last_read)
reader.set_log_level(logging_level)
#reader.archive = os.path.join(live_data_repo_path,"rss_archive",datetime.now().strftime("%Y-%m-%d"))

writer=RepoWriter(live_data_repo_path)
writer.set_log_level(logging_level)

# ISW uses full html as their description, so it includes some htmls markups.
# Matches any <...> tag as well as named / decimal / hex character entities.
htmlish_tags= re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

for feed in rss_feed_urls:
    # A failure while reading or writing one feed is reported and the loop
    # moves on to the next feed.
    try:
        for entry in reader.fetch_feed(feed, type = "reports"):

            # Clean up source-specific issues: replace markup with a space
            entry['text'] = htmlish_tags.sub(' ', entry['text'])
            source_html_url = entry['url']

            # Write out a content node
            repo_uid = writer.write_content(entry)

            # Nothing to cache when the write produced no uid or media
            # downloading is disabled.
            if repo_uid is None or not download_media:
                continue

            # Download Full Media into raw folder: first the source page itself
            try:
                html_media_path = os.path.join(media_cache_path,"{}.html".format(repo_uid))
                writer.http_copy(source_html_url, html_media_path)
            except Exception as e:
                print ("Failed to download html file for {}: {}".format(repo_uid,e))

            # Then the media referenced by the entry. The list lookup is
            # guarded separately so a missing key is reported, not raised.
            # NOTE(review): assumes entry['metadata']['media'] is an iterable
            # of URL strings - confirm against RssFeed.
            try:
                media_urls = list(entry['metadata']['media'])
            except Exception as e:
                print ("Failed to download media file for {}: {}".format(repo_uid,e))
                media_urls = []

            for media_url in media_urls:
                # Each file is attempted independently so that one failed
                # download does not skip the remaining media of this entry.
                try:
                    media_filename:str = writer.url_to_filename(media_url)
                    media_full_path = os.path.join(media_cache_path,media_filename)

                    if not os.path.exists(media_full_path):
                        print ("Downloading media file {} for {}".format(media_full_path,repo_uid))
                        writer.http_copy(media_url, media_full_path)
                    else:
                        print ("Already have media file {} for {}".format(media_full_path,repo_uid))

                except Exception as e:
                    print ("Failed to download media file for {}: {}".format(repo_uid,e))

    except Exception as e:
        print ("RSS Feed {} had error: {}".format(feed, e))