In [49]:
from pathlib import Path
from pprint import pprint

import pandas as pd
from newspaper import Article
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

In [50]:
# data directory
data_dir = Path("../data")

input_dir = data_dir / "input"
raw_dir = data_dir / "raw"
output_dir = data_dir / "processed"
sample_dir = data_dir / "samples"

rss_dir = raw_dir / "rss"
twitter_dir = raw_dir / "twitter"

## Load all URLs

In [51]:
rss_urls = pd.read_csv(raw_dir / "cleaned_rss.csv")
tweet_urls = pd.read_csv(raw_dir / "cleaned_tweets.csv")

In [52]:
urls = rss_urls[["source", "channel", "url"]]
urls = urls.append(tweet_urls[["source", "channel", "url"]])
# urls["url_section"] = None
urls = urls.reset_index(drop=True)
urls.index.name = "url_id"
urls

Unnamed: 0_level_0,source,channel,url
url_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,medpage,rss,https://www.medpagetoday.com/infectiousdisease...
1,medpage,rss,https://www.medpagetoday.com/special-reports/e...
2,medpage,rss,https://www.medpagetoday.com/infectiousdisease...
3,medpage,rss,https://www.medpagetoday.com/pulmonology/gener...
4,medpage,rss,https://www.medpagetoday.com/infectiousdisease...
...,...,...,...
7120,popsci,twitter,https://www.popsci.com/story/diy/how-to-use-an...
7121,popsci,twitter,https://www.popsci.com/how-to-make-perfect-pie...
7122,popsci,twitter,https://www.popsci.com/story/technology/airbus...
7123,popsci,twitter,https://www.popsci.com/story/gear/deer-birds-w...


## Download and parse articles

In [53]:
def get_published(article, source):
    published = ""
    if source in ["guardian", "nyt", "newsmed", "healthday", "popsci"]:
        published = article.meta_data["article"]["published_time"]
    elif source in ["medpage"]:
        published = article.meta_data["dc.date"]
    elif source == "ifls":
        soup = BeautifulSoup(article.html, 'lxml')
        published = soup.find_all("span", class_="date")[0].string
    elif source == "wired":
        soup = BeautifulSoup(article.html, 'lxml')
        published = soup.find_all("time", attrs={"data-testid":"ContentHeaderPublishDate"})[0].string
    return pd.to_datetime(published).isoformat()

def get_modified(article, source):
    modified = ""
    if source in ["guardian", "nyt", "newsmed", "healthday", "popsci"]:
        modified = article.meta_data["article"]["modified_time"]
    return pd.to_datetime(modified).isoformat()

def get_section(article, source):
    section = ""
    if source in ["guardian", "nyt", "healthday"]:
        section = article.meta_data["article"]["section"]
    elif source == "popsci":
        soup = BeautifulSoup(article.html, 'lxml')
        section = soup.find_all("ul", class_="article-categories")[0].find_all("a")[0].string
    elif source == "wired":
        soup = BeautifulSoup(article.html, 'lxml')
        section = soup.find_all("a", class_="rubric__link")[0].span.string
    elif source == "ifls":
        soup = BeautifulSoup(article.html, 'lxml')
        section = soup.find_all("a", class_="category")[0].string
    elif source == "medpage":
        section = article.meta_data["sailthru.topcat"]
    elif source == "newsmed":
        soup = BeautifulSoup(article.html, 'lxml')
        section = soup.find_all(class_="active-site-sections-menu-btn")[0].string
        section = section.split(" Home")[0]
        
    if len(section) == 0:
        section = ""
    return section.strip()

def get_keywords(article, source):
    keywords = ""
    if source in ["guardian", "nyt", "wired", "newsmed", "medpage"]:
        if "news_keywords" in article.meta_data:
            keywords = article.meta_data["news_keywords"]
        elif "keywords" in article.meta_data:
            keywords = article.meta_data["keywords"]
        elif "tag" in article.meta_data["article"]:
            keywords = article.meta_data["article"]["tag"]
    elif source == "popsci":
        soup = BeautifulSoup(article.html, 'lxml')
        keywords = soup.find_all("div", class_="tag-item")
        keywords = [tag.a.string.strip() for tag in keywords]
        keywords = ", ".join(keywords)
    elif source in ["ifls", "healthday"]:
        article.nlp()
        keywords = ", ".join(article.keywords)
    if len(keywords) == 0:
        keywords = ""
    return keywords
    
def get_authors(article, source):
    authors = ""
    if source in ["guardian", "nyt", "popsci"]:
        authors = ", ".join(article.authors)
    elif source == "wired":
        authors = article.meta_data["author"]
    elif source == "ifls":
        soup = BeautifulSoup(article.html, 'lxml')
        authors = soup.find_all("div", class_="author")[0]
        authors = authors.find_all(class_="name")
        if len(authors) > 0:
            authors = authors[0].text.split("By ")[-1]
        else:
            authors = ""
    elif source == "newsmed":
        soup = BeautifulSoup(article.html, 'lxml')
        authors = soup.find_all("span", attrs={"itemprop":"name"})
        if len(authors) > 0:
            authors = authors[0].string
        else:
            authors = ""
    elif source == "medpage":
        authors = article.meta_data["sailthru.author"]
    elif source == "healthday":
        soup = BeautifulSoup(article.html, 'lxml')
        authors = soup.find_all("a", class_="post-author__name")[0].string
        
    if len(authors) == 0:
        authors = ""
    return authors

In [62]:
domains = {
    "guardian": "theguardian",
    "healthday": "healthday",
    "ifls": "iflscience",
    "medpage": "medpagetoday",
    "newsmed": "news-med",
    "nyt": "nytimes",
    "popsci": "popsci",
    "wired": "wired"
}

def process_url(url, source):
    # Init metadata
    meta = {
        "published": "",
        "modified": "",
        "section": "",
        "keywords": "",
        "authors": "",
        "title": "",
        "canonical_url": "",
        "success": True,
        "error_msg": ""
    }
        
    # Attempt to parse content
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception as e:
        meta["success"] = False
        meta["error_msg"] = str(e)
        return meta
    
    # use canonical link to determine if link is not from source
    if domains[source] not in article.canonical_link:
        meta["success"] = False
        meta["error_msg"] = "URL with wrong domain"
        return meta

    meta["title"] = article.title
    meta["canonical_url"] = article.canonical_link
    
    try:
        meta["published"] = get_published(article, source)
    except:
        meta["published"] = "parse_error"
    
    try:
        meta["modified"] = get_modified(article, source)
    except:
        meta["modified"] = "parse_error"
    
    try:
        meta["section"] = get_section(article, source)
    except:
        meta["section"] = "parse_error"
    
    try:
        meta["keywords"] = get_keywords(article, source)
    except:
        meta["keywords"] = "parse_error"
    
    try:
        meta["authors"] = get_authors(article, source)
    except:
        meta["authors"] = "parse_error"
    
    return meta

In [69]:
outcols = [
    "published",
    "modified",
    "section",
    "keywords",
    "authors",
    "title",
    "canonical_url",
    "success",
    "msg",
]
outcols = list(urls.columns) + outcols
    
outfile = output_dir / "articles.csv"
if outfile.exists():
    outputs = pd.read_csv(outfile, index_col="url_id")
else:
    outputs = urls.copy()
    outputs = outputs.reindex(columns=outcols, fill_value=None)

In [71]:
df = missing_urls = outputs[outputs.success != True]
df = df.sample(frac=1)
total = len(df)

for ix, row in tqdm(df.iterrows(), total=total):
    try:
        meta = process_url(row.url, row.source)
    except Exception as e:
        print(row.url, row.source)
        print(traceback.format_exc())
        continue

    for k, v in meta.items():
        outputs.loc[ix, k] = v

  0%|          | 0/612 [00:00<?, ?it/s]

In [82]:
outputs[(outputs.source=="ifls")&(outputs.published.notna())]

Unnamed: 0_level_0,source,channel,url,published,modified,section,keywords,authors,title,canonical_url,success,error_msg
url_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3744,ifls,twitter,https://www.iflscience.com/space/breaking-mars...,2021-04-30T17:32:00,NaT,Space,"trip, perseverance, mars, week, survive, scien...",Dr Alfredo Carpineti,Breaking: Mars Helicopter Is Now A Fully Opera...,https://www.iflscience.com/space/breaking-mars...,True,
3745,ifls,twitter,https://www.iflscience.com/health-and-medicine...,2021-04-22T12:42:00,NaT,Health and Medicine,"entire, troop, speed, ski, finnish, good, russ...",James Felton,The Finnish Soldier Who Took His Entire Troop'...,https://www.iflscience.com/health-and-medicine...,True,
3746,ifls,twitter,https://www.iflscience.com/physics/the-better-...,2021-04-30T17:19:00,NaT,Physics,"clock, accuracy, clocks, vibration, relationsh...",Dr Alfredo Carpineti,The Better The Clock The More Entropy It Might...,https://www.iflscience.com/physics/the-better-...,True,
3747,ifls,twitter,https://www.iflscience.com/editors-blog/this-i...,2021-04-30T17:04:00,NaT,Editor's Blog,"fact, perforated, way, paper, hang, toilet, ac...",Dr Alfredo Carpineti,This Is How You Should Hang Your Toilet Paper ...,https://www.iflscience.com/editors-blog/this-i...,True,
3748,ifls,twitter,https://www.iflscience.com/health-and-medicine...,2021-04-30T17:06:00,NaT,Health and Medicine,"harmful, fermented, decompose, months, left, r...",Jack Dunhill,"Some People Are Eating Raw, Rotten ""High Meat""...",https://www.iflscience.com/health-and-medicine...,True,
...,...,...,...,...,...,...,...,...,...,...,...,...
4326,ifls,twitter,https://www.iflscience.com/chemistry/seafloor-...,2021-02-26T16:40:00,NaT,Chemistry,"products, sediment, hydrogen, microbes, sauvag...",Stephen Luntz,Seafloor Microbes Are Living On The Products O...,https://www.iflscience.com/chemistry/seafloor-...,True,
4327,ifls,twitter,https://www.iflscience.com/editors-blog/cats-d...,2021-03-01T13:13:00,NaT,Editor's Blog,"pets, site, monkeys, pet, study, unearthed, ca...",Benjamin Taub,"Cats, Dogs, And Monkeys Unearthed At Ancient E...",https://www.iflscience.com/editors-blog/cats-d...,True,
4328,ifls,twitter,https://www.iflscience.com/plants-and-animals/...,2021-03-01T14:43:00,NaT,Plants and Animals,"cleverly, forest, survive, embrace, freezing, ...",Katy Evans,Alligators In Oklahoma Cleverly Embrace Becomi...,https://www.iflscience.com/plants-and-animals/...,True,
4329,ifls,twitter,https://www.iflscience.com/physics/scientists-...,2021-02-26T17:27:00,NaT,Physics,"space, technologies, crystal, able, structure,...",Dr Alfredo Carpineti,Watch The First Ever Video Of A Space-Time Cry...,https://www.iflscience.com/physics/scientists-...,True,


In [84]:
outputs = outputs.replace("", None)
outputs.to_csv(outfile)

In [158]:
# sample = outputs[outputs.success].groupby("source", as_index=False).apply(lambda x: x.sample(10)).reset_index(drop=True)
# sample.index.name = "id"
# sample.to_csv(sample_dir / "all_venues_sample_05052021.csv")