In [1]:
from pathlib import Path
from pprint import pprint

import pandas as pd
from newspaper import Article
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

In [2]:
# data directory
data_dir = Path("../data")

input_dir = data_dir / "input"
raw_dir = data_dir / "raw"
output_dir = data_dir / "processed"

rss_dir = raw_dir / "rss"
twitter_dir = raw_dir / "twitter"

## Load all URLs

In [3]:
rss_urls = pd.read_csv(raw_dir / "cleaned_rss.csv")
tweet_urls = pd.read_csv(raw_dir / "cleaned_tweets.csv")

In [4]:
urls = rss_urls[["source", "channel", "url"]]
urls = urls.append(tweet_urls[["source", "channel", "url"]])
# urls["url_section"] = None
urls = urls.reset_index(drop=True)
urls.index.name = "url_id"
urls

Unnamed: 0_level_0,source,channel,url
url_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,medpage,rss,https://www.medpagetoday.com/infectiousdisease...
1,medpage,rss,https://www.medpagetoday.com/special-reports/e...
2,medpage,rss,https://www.medpagetoday.com/infectiousdisease...
3,medpage,rss,https://www.medpagetoday.com/pulmonology/gener...
4,medpage,rss,https://www.medpagetoday.com/infectiousdisease...
...,...,...,...
7120,popsci,twitter,https://www.popsci.com/story/diy/how-to-use-an...
7121,popsci,twitter,https://www.popsci.com/how-to-make-perfect-pie...
7122,popsci,twitter,https://www.popsci.com/story/technology/airbus...
7123,popsci,twitter,https://www.popsci.com/story/gear/deer-birds-w...


## Download and parse articles

In [44]:
def get_published(article, source):
    published = ""
    if source in ["guardian", "nyt", "newsmed", "healthday", "popsci"]:
        published = article.meta_data["article"]["published_time"]
    elif source in ["medpage"]:
        published = article.meta_data["dc.date"]
    elif source == "ifls":
        soup = BeautifulSoup(article.html, 'lxml')
        published = soup.find_all("span", class_="date")[0].a.string
    elif source == "wired":
        soup = BeautifulSoup(article.html, 'lxml')
        published = soup.find_all("time", attrs={"data-testid":"ContentHeaderPublishDate"})[0].string
    return pd.to_datetime(published).isoformat()

def get_modified(article, source):
    modified = ""
    if source in ["guardian", "nyt", "newsmed", "healthday", "popsci"]:
        modified = article.meta_data["article"]["modified_time"]
    return pd.to_datetime(modified).isoformat()

def get_section(article, source):
    section = ""
    if source in ["guardian", "nyt", "healthday"]:
        section = article.meta_data["article"]["section"]
    elif source == "popsci":
        soup = BeautifulSoup(article.html, 'lxml')
        section = soup.find_all("ul", class_="article-categories")[0].find_all("a")[0].string
    elif source == "wired":
        soup = BeautifulSoup(article.html, 'lxml')
        section = soup.find_all("a", class_="rubric__link")[0].span.string
    elif source == "ifls":
        soup = BeautifulSoup(article.html, 'lxml')
        section = soup.find_all("a", class_="category")[0].string
    elif source == "medpage":
        section = article.meta_data["sailthru.topcat"]
    elif source == "newsmed":
        soup = BeautifulSoup(article.html, 'lxml')
        section = soup.find_all(class_="active-site-sections-menu-btn")[0].string
        section = section.split(" Home")[0]
        
    if len(section) == 0:
        section = ""
    return section.strip()

def get_keywords(article, source):
    keywords = ""
    if source in ["guardian", "nyt", "wired", "newsmed", "medpage"]:
        if "news_keywords" in article.meta_data:
            keywords = article.meta_data["news_keywords"]
        elif "keywords" in article.meta_data:
            keywords = article.meta_data["keywords"]
        elif "tag" in article.meta_data["article"]:
            keywords = article.meta_data["article"]["tag"]
    elif source == "popsci":
        soup = BeautifulSoup(article.html, 'lxml')
        keywords = soup.find_all("div", class_="tag-item")
        keywords = [tag.a.string.strip() for tag in keywords]
        keywords = ", ".join(keywords)
    elif source in ["ifls", "healthday"]:
        article.nlp()
        keywords = ", ".join(article.keywords)
    
    if len(keywords) == 0:
        keywords = ""
    return keywords
    
def get_authors(article, source):
    authors = ""
    if source in ["guardian", "nyt", "popsci"]:
        authors = ", ".join(article.authors)
    elif source == "wired":
        authors = article.meta_data["author"]
    elif source == "ifls":
        soup = BeautifulSoup(article.html, 'lxml')
        authors = soup.find_all("div", class_="author")[0]
        authors = authors.find_all(class_="name")[0].a.string
        authors = authors.split("By ")[-1]
    elif source == "newsmed":
        soup = BeautifulSoup(article.html, 'lxml')
        authors = soup.find_all("span", attrs={"itemprop":"name"})[0].string
    elif source == "medpage":
        authors = article.meta_data["sailthru.author"]
    elif source == "healthday":
        soup = BeautifulSoup(article.html, 'lxml')
        authors = soup.find_all("a", class_="post-author__name")[0].string
        
    if len(authors) == 0:
        authors = ""
    return authors

In [47]:
def process_url(url, source):
    # Init metadata
    meta = {
        "published": "",
        "modified": "",
        "section": "",
        "keywords": "",
        "authors": "",
        "title": "",
        "canonical_url": "",
        "newspaper_success": True,
        "newspaper_msg": ""
    }
    
    # Attempt to parse content
    try:
        article = Article(url)
        article.download()
        article.parse()
    except e:
        meta["newspaper_success"] = False
        meta["newspaper_msg"] = str(e)
        return meta

    meta["title"] = article.title
    meta["canonical_url"] = article.canonical_link
    
    meta["published"] = get_published(article, source)
    meta["modified"] = get_modified(article, source)
    meta["section"] = get_section(article, source)
    meta["keywords"] = get_keywords(article, source)
    meta["authors"] = get_authors(article, source)
    
    return meta

def debug_sources(source, nlp=False):
    df = urls[urls.source==source]
    url = df.sample().iloc[0].url
    
    print(f"=== {source} ===")
    print(url)
    a = Article(url)
    a.download()
    a.parse()
    
    out = {}
    out["canonical_link"] = a.canonical_link
    out["authors"] = a.authors
    out["title"] = a.title
    pprint(out)
    
    pprint(a.meta_data)
    
    return a

In [48]:
outcols = [
    "published",
    "modified",
    "section",
    "keywords",
    "authors",
    "title",
    "canonical_url",
    "newspaper_success",
    "newspaper_msg",
]
outcols = list(urls.columns) + outcols
    
outfile = output_dir / "articles.csv"
if outfile.exists():
    outputs = pd.read_csv(outfile, index_col="url_id")
else:
    outputs = urls.copy()
    outputs = outputs.reindex(columns=outcols, fill_value=None)

In [None]:
df = missing_urls = outputs[outputs.newspaper_success != True]
df = df.sample(frac=1)
total = len(df)

for ix, row in tqdm(df.iterrows(), total=total):
    try:
        row = process_url(row.url, row.source)
    except:
        continue

    for k, v in row.items(): 
        outputs.loc[ix, k] = v

  0%|          | 0/5232 [00:00<?, ?it/s]

In [24]:
outputs[outputs.published.notna()].groupby("source").section.value_counts().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,section
source,section,Unnamed: 2_level_1
guardian,Science,45
guardian,World news,31
guardian,Opinion,13
guardian,Environment,12
guardian,Society,8
...,...,...
popsci,Ask Us Anything,1
popsci,Aviation,1
popsci,Blogs,1
wired,Science,34


## Process canoncical URLs to extract section information

In [99]:
urls.loc[urls.source=="popsci", "url_section"] = urls[urls.source=="popsci"].url.map(lambda x: x.split("/")[3])

IndexError: list index out of range