# NPR Collector for OpenTLDR (Update)
This notebook pulls raw data from the internet and formats it as a .json file for OpenTLDR ingest.
The output .json file can be shared with multiple OpenTLDR instances via a file system or an S3 bucket.


In [None]:

import os
import logging

from dotenv import load_dotenv
load_dotenv()

#from RssFeed import RssFeed
from RepoWriter import RepoWriter


## Parameters
Parameters can be passed into the notebook using OpenTLDR Workflows for automation.
The values in the cell below are the defaults for this collector and are used unless the Workflow overrides them.

In [None]:
# Parameters
# Default settings for this collector; see the note above about Workflow overrides.

# The path where this agent should write the .json files for content
# (taken from the LIVE_DATA_REPO_PATH environment variable; None when it is unset)
live_data_repo_path = os.getenv('LIVE_DATA_REPO_PATH')

# The path where this agent should write full source files temporarily
# download_media switches saving of each article's source HTML on or off;
# media_cache_path comes from the MEDIA_CACHE_PATH environment variable (None when unset)
download_media = False          # NPR already loads full text by default
media_cache_path = os.getenv('MEDIA_CACHE_PATH')

# Name of Source to use in OpenTLDR objects
source = "npr"
# When True, content is written to a sub-directory of the repo path named after the source
organize_by_source = True

# Base URL that the article links found on the topic pages are appended to
source_url = "https://text.npr.org"

# Topic pages to scan; each one lists links to individual articles
pages_urls = [
    "https://text.npr.org",         # Main Page
    "https://text.npr.org/1001",    # News
    "https://text.npr.org/1008",    # Culture
    "https://text.npr.org/1039",    # Music
]

# Some sites verify recent user_agent configurations
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"

# Filters - excludes some content
# date: reference day as a "YYYY-MM-DD" string; None means use the current date and time
# days_history: how many days before the reference day are still collected
#               (a large value such as 9999 keeps nearly all history)
date:str = None
days_history = 9999

# Logging level ranges are (from least to most verbose): ERROR, WARN, INFO, DEBUG
# (handed to the RepoWriter via set_log_level)
logging_level = logging.INFO

# level of unnecessary output
# (when True, the URL of every article is printed as it is fetched)
verbose = True


## Process Parameters
Parameters are passed in as strings and sometimes need to be converted to other object types.


In [None]:
# Set the Data Filtering

from datetime import datetime, timedelta
import pytz

# Reference day: the 'date' parameter ("YYYY-MM-DD") when supplied, otherwise now.
# Workflow parameters may arrive as strings or other objects, so normalise to
# text first and treat a blank value the same as "not supplied".
today = datetime.now()
date_text = "" if date is None else str(date).strip()
if date_text:
    today = datetime.strptime(date_text, "%Y-%m-%d")

# Start of the collection window: 'days_history' days before the reference day.
# float() accepts numbers and numeric strings alike and keeps fractional days.
last_read = today - timedelta(days=float(days_history))

print("Today is {}, will atttempt to download content since {}.".format(today.strftime('%Y-%m-%d'),last_read.strftime('%Y-%m-%d')))

In [None]:
# Resolve the directory that receives the .json content files.
if live_data_repo_path is None:
    # No path configured: fall back to the default location
    live_data_repo_path = os.path.join("..", "Data", "Live")

if organize_by_source:
    # Keep each source in its own sub-directory
    live_data_repo_path = os.path.join(live_data_repo_path, source)

print(f"Writing Content to: {live_data_repo_path}")

# Resolve the media cache directory; it only matters when media is downloaded.
if download_media:
    if media_cache_path is None:
        media_cache_path = os.path.join("..", "Data", "MediaCache")
    print(f"Cacheing Media files to: {media_cache_path}")


In [None]:
# Create every output directory that is needed but does not exist yet.

# The content repo is always required; the media cache only when downloading media.
required_dirs = [live_data_repo_path]
if download_media:
    required_dirs.append(media_cache_path)

for directory in required_dirs:
    if os.path.exists(directory):
        continue
    print ("Creating directory: {}".format(directory))
    os.makedirs(directory)

## Code for pulling and processing html

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def fetch_html(url:str, timeout:float = 30) -> bytes:
    """Download a page and return its raw body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the collector indefinitely.

    Returns:
        The undecoded response body as bytes.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    # Send the notebook's configured User-Agent when one is defined; looked up
    # dynamically so the function also works if that parameter is absent.
    agent = globals().get("user_agent")
    headers = {"User-Agent": agent} if agent else None

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages instead of handing their HTML to the parser
    response.raise_for_status()
    return response.content

def process_text(in_html:str):
    """Extract title, author, date and body text from one article page.

    Args:
        in_html: The article's HTML markup, passed straight to BeautifulSoup.

    Returns:
        A tuple ``(title, author, date, text)`` where
        * title is the text of the ``h1.story-title`` element, or None if absent,
        * author is the header line following "By ", or "unknown",
        * date is a "YYYY-MM-DD" string; the current date is used when no
          header line yields a parsable date,
        * text is every matched paragraph, each prefixed with " \\n ".

    Raises:
        ValueError: If the page has no ``div.paragraphs-container``.
    """
    soup = BeautifulSoup(in_html,"html.parser")

    author = "unknown"
    date = datetime.now()

    title_block = soup.find('h1', attrs={"class":"story-title"}, string=True)
    title = title_block.get_text() if title_block is not None else None

    # NOTE(review): string=True appears to match only tags holding a single
    # text node, so paragraphs with nested markup would be skipped - confirm
    # this is intended.
    header_div = soup.find('div', attrs={"class":"story-head"})
    # A page without a header block simply keeps the default author and date
    header_lines = header_div.find_all('p', string=True) if header_div is not None else []

    for header_part in header_lines:
        line = header_part.get_text()
        if line.startswith("By "):
            author = line[3:]
            continue

        # Otherwise try to read the line as "<weekday>, <Month> <day>, <year> • ...":
        # the date sits between the first ", " and the space before the bullet.
        try:
            start_date_index = line.index(", ") + 2
            end_date_index = line.index("•") - 1
            date = datetime.strptime(line[start_date_index:end_date_index], "%B %d, %Y")
        except ValueError:
            # Separator missing or text not a date: this line is not the date line
            continue

    npr_text_divs = soup.find('div', attrs={"class":"paragraphs-container"})
    if npr_text_divs is None:
        raise ValueError("Article HTML has no 'paragraphs-container' div")

    # Single join instead of re-formatting the growing string for each paragraph
    output = ''.join(
        ' \n {}'.format(text_part.get_text())
        for text_part in npr_text_divs.find_all('p', string=True)
    )

    return title, author, date.strftime('%Y-%m-%d'), output

### NPR Text Website Processing
This site has several top-level pages, each with a list of articles; every list entry is a link to an actual article.
This code block reads the pages, iterates over each list, and reads each article.
Content nodes are created for each article.

In [None]:
# Walk every topic page, fetch each linked article, and write a content node
# for those whose date falls inside the (last_read, today] window.
from urllib.parse import urljoin

writer=RepoWriter(live_data_repo_path)
writer.set_log_level(logging_level)

for page in pages_urls:
    try:
        # Download Topic Page with links to specific articles
        main_page = fetch_html(page)
        soup = BeautifulSoup(main_page, "html.parser")

        # Site appears to have 2 day delay of articles.
        # The page date is only a fallback for articles without their own date,
        # so a missing or unparsable value must not discard the whole page.
        date_html = soup.find('p', attrs={'class':'topic-date'})
        try:
            entry_date = datetime.strptime(date_html.text.strip(), "%A, %B %d, %Y")
        except (AttributeError, ValueError):
            entry_date = today
            if verbose:
                print("Page {} has no usable topic date, using {} as fallback".format(page, entry_date.strftime("%Y-%m-%d")))

        # Iterate the list of entries (links to articles)
        for article in soup.find_all('a', href=True, attrs={'class':'topic-title'}):

            # Resolve the link before the try so the error handler below always
            # reports the article that actually failed; urljoin also copes with
            # absolute hrefs and hrefs lacking a leading slash.
            link = urljoin(source_url, article["href"])

            try:
                # Extract Content information
                expected_title = article.text

                if verbose:
                    print("Fetching article: {}".format(link))

                source_html = fetch_html(link)
                title, author, article_date, text = process_text(source_html)

                # sometimes the article format is odd
                if title is None:
                    title = expected_title

                # Fallback if the parser reports no date for the article
                if article_date is None:
                    article_date = entry_date.strftime("%Y-%m-%d")

                entry:dict = {
                    "title": title,
                    "date": article_date,
                    "type": "news",
                    "author": author,
                    "source": source,
                    "url": link,
                    "text": text,
                    "metadata": {},
                }

                # Write out a content node only if its in the date range we want
                article_date_obj = datetime.strptime(article_date, "%Y-%m-%d")

                repo_uid = None
                if last_read < article_date_obj <= today:
                    repo_uid = writer.write_content(entry)

                # Some entries are skipped (hash == None), otherwise store the html and any media found
                # Note NPR uses text parsing of page - so this source html file is already downloaded at this point.
                if repo_uid is not None and download_media:
                    try:
                        # exist_ok avoids the check-then-create race of a separate exists() test
                        os.makedirs(media_cache_path, exist_ok=True)

                        html_media_path = os.path.join(media_cache_path, "{}.html".format(repo_uid))

                        with open(html_media_path, "wb") as f:
                            f.write(source_html)
                    except Exception as e:
                        print ("Failed to download media files for {}: {}".format(repo_uid, e))
            except Exception as e:
                # Best effort: one bad article must not stop the rest of the page
                print ("Failed to process article for {}: {}".format(link, e))

    except Exception as e:
        # Best effort: one bad page must not stop the remaining pages
        print ("Page {} had error: {}".format(page, e))