In [16]:
import csv
import datetime
from pathlib import Path
from pprint import pprint

import feedparser
import pandas as pd

In [19]:
# Directory layout: all data lives under ../data (relative to the working
# directory); collected articles go into its "collection" sub-directory.
data_dir = Path("../data")
collection_dir = data_dir.joinpath("collection")

# News sources table; its `short_name` and `feed_url` columns are pulled out
# as two plain Python lists.
news_sources = pd.read_csv(data_dir.joinpath("news_sources.csv"))

venue_names, feed_urls = (
    news_sources["short_name"].tolist(),
    news_sources["feed_url"].tolist(),
)

In [31]:
class LogWriter:
    """
    Helper class to manage and write the scraping log.

    The log is a CSV file named ``log.csv`` with the columns
    ``timestamp``, ``venue``, ``new_articles`` and ``error``.
    """

    def __init__(self, dir):
        """
        Parameters
        ----------
        dir : path-like supporting the ``/`` operator (e.g. ``pathlib.Path``)
            Directory that holds (or will hold) ``log.csv``.
        """
        self.file = dir / "log.csv"
        self.fieldnames = ["timestamp", "venue", "new_articles", "error"]

        # Kept for backward compatibility; refreshed after every write.
        self.file_exists = self.file.exists()

    def update(self, ts, venue, new_articles, error_msg):
        """
        Append one row to the log, writing the header first if the file is
        missing or empty.

        Parameters
        ----------
        ts : value written to the ``timestamp`` column
        venue : value written to the ``venue`` column
        new_articles : value written to the ``new_articles`` column
        error_msg : value written to the ``error`` column
        """
        # Decide at write time, not at construction time: a flag captured in
        # __init__ goes stale after the first write and would repeat the
        # header in front of every row.
        needs_header = not self.file.exists() or self.file.stat().st_size == 0

        # newline="" lets the csv module control line endings itself
        # (avoids blank lines between rows on Windows).
        with open(self.file, mode="a", newline="") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.fieldnames)

            if needs_header:
                writer.writeheader()

            writer.writerow(
                {
                    "timestamp": ts,
                    "venue": venue,
                    "new_articles": new_articles,
                    "error": error_msg,
                }
            )

        self.file_exists = True

In [32]:
# Pick a single source by position to experiment with.
x = 3
venue, feed_url = venue_names[x], feed_urls[x]

In [47]:
# Parse the feed located at `feed_url` (the source selected above) with feedparser.
feed = feedparser.parse(feed_url)

In [93]:
# Build a frame from the parsed feed's entries.
articles = pd.DataFrame(feed["entries"])
# Assign through [] (attribute assignment can silently create a plain attribute
# instead of a column) and normalise to UTC so that feeds mixing UTC offsets
# still produce one tz-aware datetime column that can be compared and sorted.
articles["published"] = pd.to_datetime(articles["published"], utc=True)
articles.dtypes

title                            object
title_detail                     object
links                            object
link                             object
summary                          object
summary_detail                   object
id                               object
guidislink                         bool
authors                          object
author                           object
author_detail                    object
tags                             object
published           datetime64[ns, UTC]
published_parsed                 object
content                          object
dtype: object

In [105]:
# save oldest 20


In [None]:
def save_articles(df, venue):
    """
    Store a venue's articles as JSON Lines.

    The rows of ``df`` are ordered by their ``published`` column and written,
    one JSON record per line with ISO-formatted dates, to
    ``<venue>.jsonl`` inside the module-level ``collection_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles to store; must contain a ``published`` column.
    venue : str
        Used as the file stem of the output file.
    """
    ordered = df.sort_values("published")
    target = collection_dir / f"{venue}.jsonl"
    ordered.to_json(
        target,
        orient="records",
        lines=True,
        date_format="iso",
    )

In [None]:
def write_log(old, new, log):
    """
    Select the rows of ``new`` published after the newest row of ``old``.

    NOTE(review): the selection is neither returned nor handed to ``log``
    (which is unused), so the function currently has no visible effect and
    returns ``None`` - it looks unfinished.

    Parameters
    ----------
    old : frame of stored articles with a ``published`` column
    new : frame of freshly fetched articles with a ``published`` column
    log : currently unused
    """
    is_newer = new["published"] > max(old["published"])
    new[is_newer]

In [107]:
# Reload the articles stored for this venue (JSON Lines: one record per line).
articles_old = pd.read_json(
    collection_dir / f"{venue}.jsonl",
    orient="records",
    lines=True,
)
# Assign through [] rather than attribute access, and parse to tz-aware UTC so
# the column is directly comparable with tz-aware timestamps from the feed.
articles_old["published"] = pd.to_datetime(articles_old["published"], utc=True)

In [115]:
# Newest publication time already stored. Series.max() skips missing values
# and yields NaT for an empty column, where the builtin max() would raise.
latest_stored = articles_old["published"].max()

if pd.isna(latest_stored):
    # Nothing usable is stored yet, so every fetched article counts as new.
    new_articles = articles.copy()
else:
    # Keep only the articles published strictly after the newest stored one.
    new_articles = articles[articles["published"] > latest_stored]

new_articles.sort_values("published")

Unnamed: 0,title,title_detail,links,link,summary,summary_detail,id,guidislink,authors,author,author_detail,tags,published,published_parsed,content
6,Best adjustable desks: Stand or sit with doubl...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/reviews/best-adju...,Buy the best adjustable desk and never look ba...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/reviews/best-adju...,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Reviews', 'scheme': None, 'label': ...",2021-02-26 16:59:00+00:00,"(2021, 2, 26, 16, 59, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
7,Best ergonomic keyboards for hand and wrist pain,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/reviews/best-ergo...,Buy the best ergonomic keyboard & save your wr...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/reviews/best-ergo...,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Reviews', 'scheme': None, 'label': ...",2021-02-26 16:59:00+00:00,"(2021, 2, 26, 16, 59, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
5,What underwater sounds can tell us about the s...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/environment/under...,Scientists are studying how the sound of an un...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/environment/under...,False,[{'name': 'Ellie Shechet'}],Ellie Shechet,{'name': 'Ellie Shechet'},"[{'term': 'Environment', 'scheme': None, 'labe...",2021-02-26 18:05:08+00:00,"(2021, 2, 26, 18, 5, 8, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
4,Phone anxiety is real—and solvable,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/health/phone-anxi...,Don't resort to texts for all your conversatio...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/health/phone-anxi...,False,[{'name': 'By Ilham Sebah/The Conversation'}],By Ilham Sebah/The Conversation,{'name': 'By Ilham Sebah/The Conversation'},"[{'term': 'Health', 'scheme': None, 'label': N...",2021-02-26 19:16:38+00:00,"(2021, 2, 26, 19, 16, 38, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
13,Thrift shopping is an environmental and ethica...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/environment/thrif...,There are ways to go clothes shopping and star...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/environment/thrif...,False,[{'name': 'Sara Kiley Watson'}],Sara Kiley Watson,{'name': 'Sara Kiley Watson'},"[{'term': 'Environment', 'scheme': None, 'labe...",2021-02-26 19:18:11+00:00,"(2021, 2, 26, 19, 18, 11, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
24,Why spacing out is good for you,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/health/head-trip-...,"A wandering mind can boost creativity, reduce ...","{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/health/head-trip-...,False,[{'name': 'Eleanor Cummins'}],Eleanor Cummins,{'name': 'Eleanor Cummins'},"[{'term': 'Health', 'scheme': None, 'label': N...",2021-02-26 19:19:51+00:00,"(2021, 2, 26, 19, 19, 51, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
3,Best ball chair: Improve your posture and stre...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/reviews/best-ball...,Get the best ball chair for your home office. ...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/reviews/best-ball...,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Reviews', 'scheme': None, 'label': ...",2021-02-26 19:59:00+00:00,"(2021, 2, 26, 19, 59, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
2,The Royal Navy’s robotic sub will be a test be...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/technology/royal-...,Companies and academics will have the chance t...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/technology/royal-...,False,[{'name': 'Kelsey D. Atherton'}],Kelsey D. Atherton,{'name': 'Kelsey D. Atherton'},"[{'term': 'Technology', 'scheme': None, 'label...",2021-02-26 21:00:00+00:00,"(2021, 2, 26, 21, 0, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
1,Label printers to help keep your business in s...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/shop/label-printers/,Using a standard home printer to make labels c...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/shop/label-printers/,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Shop', 'scheme': None, 'label': None}]",2021-02-26 21:04:40+00:00,"(2021, 2, 26, 21, 4, 40, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
0,Best toys for kids that they will love for the...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/reviews/best-toys...,Looking for the best birthday gifts and toys f...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/reviews/best-toys...,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Reviews', 'scheme': None, 'label': ...",2021-02-26 21:59:00+00:00,"(2021, 2, 26, 21, 59, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."


In [121]:
# Stack the fresh articles on top of the stored ones and drop rows that repeat
# an (id, published) pair. The result is only displayed, not assigned.
pd.concat([articles, articles_old]).drop_duplicates(subset=["id", "published"])

Unnamed: 0,title,title_detail,links,link,summary,summary_detail,id,guidislink,authors,author,author_detail,tags,published,published_parsed,content
0,Best toys for kids that they will love for the...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/reviews/best-toys...,Looking for the best birthday gifts and toys f...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/reviews/best-toys...,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Reviews', 'scheme': None, 'label': ...",2021-02-26 21:59:00+00:00,"(2021, 2, 26, 21, 59, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
1,Label printers to help keep your business in s...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/shop/label-printers/,Using a standard home printer to make labels c...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/shop/label-printers/,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Shop', 'scheme': None, 'label': None}]",2021-02-26 21:04:40+00:00,"(2021, 2, 26, 21, 4, 40, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
2,The Royal Navy’s robotic sub will be a test be...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/technology/royal-...,Companies and academics will have the chance t...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/technology/royal-...,False,[{'name': 'Kelsey D. Atherton'}],Kelsey D. Atherton,{'name': 'Kelsey D. Atherton'},"[{'term': 'Technology', 'scheme': None, 'label...",2021-02-26 21:00:00+00:00,"(2021, 2, 26, 21, 0, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
3,Best ball chair: Improve your posture and stre...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/reviews/best-ball...,Get the best ball chair for your home office. ...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/reviews/best-ball...,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Reviews', 'scheme': None, 'label': ...",2021-02-26 19:59:00+00:00,"(2021, 2, 26, 19, 59, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
4,Phone anxiety is real—and solvable,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/health/phone-anxi...,Don't resort to texts for all your conversatio...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/health/phone-anxi...,False,[{'name': 'By Ilham Sebah/The Conversation'}],By Ilham Sebah/The Conversation,{'name': 'By Ilham Sebah/The Conversation'},"[{'term': 'Health', 'scheme': None, 'label': N...",2021-02-26 19:16:38+00:00,"(2021, 2, 26, 19, 16, 38, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
5,What underwater sounds can tell us about the s...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/environment/under...,Scientists are studying how the sound of an un...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/environment/under...,False,[{'name': 'Ellie Shechet'}],Ellie Shechet,{'name': 'Ellie Shechet'},"[{'term': 'Environment', 'scheme': None, 'labe...",2021-02-26 18:05:08+00:00,"(2021, 2, 26, 18, 5, 8, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
6,Best adjustable desks: Stand or sit with doubl...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/reviews/best-adju...,Buy the best adjustable desk and never look ba...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/reviews/best-adju...,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Reviews', 'scheme': None, 'label': ...",2021-02-26 16:59:00+00:00,"(2021, 2, 26, 16, 59, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
7,Best ergonomic keyboards for hand and wrist pain,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/reviews/best-ergo...,Buy the best ergonomic keyboard & save your wr...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/reviews/best-ergo...,False,[{'name': 'PopSci Commerce Team'}],PopSci Commerce Team,{'name': 'PopSci Commerce Team'},"[{'term': 'Reviews', 'scheme': None, 'label': ...",2021-02-26 16:59:00+00:00,"(2021, 2, 26, 16, 59, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
8,Organize and accessorize your board games with...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/diy/3d-printing-b...,"Level up your board, tabletop, and card games ...","{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/diy/3d-printing-b...,False,[{'name': 'RK Pendergrass'}],RK Pendergrass,{'name': 'RK Pendergrass'},"[{'term': 'Diy', 'scheme': None, 'label': None}]",2021-02-26 16:00:00+00:00,"(2021, 2, 26, 16, 0, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."
9,Check out the most extensive map of black hole...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.popsci.com/story/space/black-hole-...,Each of these black holes is swallowing dust a...,"{'type': 'text/html', 'language': None, 'base'...",https://www.popsci.com/story/space/black-hole-...,False,[{'name': 'Erin Fennessy'}],Erin Fennessy,{'name': 'Erin Fennessy'},"[{'term': 'Space', 'scheme': None, 'label': No...",2021-02-26 15:00:00+00:00,"(2021, 2, 26, 15, 0, 0, 4, 57, 0)","[{'type': 'text/html', 'language': None, 'base..."


In [16]:
# Collect a reduced frame (venue, title, link, summary, author, published,
# tags) from every feed.
# NOTE(review): `feeds` (iterated as venue -> feed URL pairs) and
# `samples_per_source` are not defined in the visible cells - confirm they are
# set before this cell runs.
dfs = []
for venue, rss_feed in feeds.items():
    feed = feedparser.parse(rss_feed)
    articles = pd.DataFrame(feed["entries"])
    articles["venue"] = venue

    df = articles[["venue", "title", "link", "summary", "author", "published"]].copy()
    # Not every feed provides tags; always create the column so that all
    # per-venue frames share the same layout.
    df["tags"] = articles["tags"] if "tags" in articles.columns else None

    # head() returns a new frame instead of truncating in place, so its result
    # must be the thing that is collected for the per-source limit to apply.
    dfs.append(df.head(samples_per_source))

In [19]:
# Display the current `articles` frame.
# NOTE(review): this renders the whole frame; `articles.head()` would keep the output short.
articles

Unnamed: 0,title,title_detail,links,link,comments,authors,author,author_detail,published,published_parsed,tags,id,guidislink,summary,summary_detail,wfw_commentrss,slash_comments,venue
0,Excluding visitors from hospitals may harm pat...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/excluding-visi...,https://scienceline.org/2021/02/excluding-visi...,[{'name': 'Lauren Leffer'}],Lauren Leffer,{'name': 'Lauren Leffer'},"Fri, 19 Feb 2021 13:00:45 +0000","(2021, 2, 19, 13, 0, 45, 4, 50, 0)","[{'term': 'Health', 'scheme': None, 'label': N...",https://scienceline.org/?p=33156,False,<p>Hospitals and staff have tried to adapt thr...,"{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/excluding-visi...,0,scienceline
1,Soil erosion is washing farms’ phosphorus down...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/soil-erosion-w...,https://scienceline.org/2021/02/soil-erosion-w...,[{'name': 'Delger Erdenesanaa'}],Delger Erdenesanaa,{'name': 'Delger Erdenesanaa'},"Wed, 17 Feb 2021 13:00:26 +0000","(2021, 2, 17, 13, 0, 26, 2, 48, 0)","[{'term': 'Environment', 'scheme': None, 'labe...",https://scienceline.org/?p=33144,False,<p>Experts warn erosion is depleting the essen...,"{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/soil-erosion-w...,0,scienceline
2,Novel Science: Talkin’ bout my generation ship,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/novel-science-...,https://scienceline.org/2021/02/novel-science-...,[{'name': 'Joanna Thompson'}],Joanna Thompson,{'name': 'Joanna Thompson'},"Mon, 15 Feb 2021 13:00:33 +0000","(2021, 2, 15, 13, 0, 33, 0, 46, 0)","[{'term': 'Novel Science', 'scheme': None, 'la...",https://scienceline.org/?p=33138,False,<p>Would you board a ship you could never leav...,"{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/novel-science-...,0,scienceline
3,Beyond the mask: communicating emotions in the...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/beyond-the-mas...,https://scienceline.org/2021/02/beyond-the-mas...,[{'name': 'Lauren Leffer'}],Lauren Leffer,{'name': 'Lauren Leffer'},"Fri, 12 Feb 2021 13:00:09 +0000","(2021, 2, 12, 13, 0, 9, 4, 43, 0)","[{'term': 'Health', 'scheme': None, 'label': N...",https://scienceline.org/?p=33090,False,<p>Proper mask wearing helps us stay safe duri...,"{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/beyond-the-mas...,0,scienceline
4,COVID-19 is amplifying abortion access dispari...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/covid-19-is-am...,https://scienceline.org/2021/02/covid-19-is-am...,[{'name': 'Lauren Leffer'}],Lauren Leffer,{'name': 'Lauren Leffer'},"Wed, 10 Feb 2021 13:00:43 +0000","(2021, 2, 10, 13, 0, 43, 2, 41, 0)","[{'term': 'Health', 'scheme': None, 'label': N...",https://scienceline.org/?p=33020,False,"<p>Depending on where you live, abortion care ...","{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/covid-19-is-am...,0,scienceline
5,When code becomes conservation,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/when-code-beco...,https://scienceline.org/2021/02/when-code-beco...,[{'name': 'Joanna Thompson'}],Joanna Thompson,{'name': 'Joanna Thompson'},"Mon, 08 Feb 2021 13:00:46 +0000","(2021, 2, 8, 13, 0, 46, 0, 39, 0)","[{'term': 'Tech', 'scheme': None, 'label': Non...",https://scienceline.org/?p=32978,False,<p>Artificial intelligence is becoming increas...,"{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/when-code-beco...,0,scienceline
6,International students face mental health chal...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/international-...,https://scienceline.org/2021/02/international-...,[{'name': 'Huanjia Zhang'}],Huanjia Zhang,{'name': 'Huanjia Zhang'},"Fri, 05 Feb 2021 13:00:53 +0000","(2021, 2, 5, 13, 0, 53, 4, 36, 0)","[{'term': 'Health', 'scheme': None, 'label': N...",https://scienceline.org/?p=32987,False,<p>The crisis has exacerbated long-overlooked ...,"{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/international-...,0,scienceline
7,How does recycling using enzymes work?,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/how-does-recyc...,https://scienceline.org/2021/02/how-does-recyc...,[{'name': 'Casey Crownhart'}],Casey Crownhart,{'name': 'Casey Crownhart'},"Thu, 04 Feb 2021 17:00:53 +0000","(2021, 2, 4, 17, 0, 53, 3, 35, 0)","[{'term': 'Tech', 'scheme': None, 'label': Non...",https://scienceline.org/?p=33022,False,<p>Enzymatic recycling might offer a new metho...,"{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/how-does-recyc...,0,scienceline
8,How do astronomers work without telescopes?,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/no-telescopes-...,https://scienceline.org/2021/02/no-telescopes-...,[{'name': 'Jackie Appel'}],Jackie Appel,{'name': 'Jackie Appel'},"Wed, 03 Feb 2021 13:00:58 +0000","(2021, 2, 3, 13, 0, 58, 2, 34, 0)","[{'term': 'Space, Physics, and Math', 'scheme'...",https://scienceline.org/?p=32924,False,<p>Astronomers are finding ways to keep invest...,"{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/no-telescopes-...,0,scienceline
9,Novel Science: Steampunk and the history of pr...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://scienceline.org/2021/02/novel-science-...,https://scienceline.org/2021/02/novel-science-...,[{'name': 'Joanna Thompson'}],Joanna Thompson,{'name': 'Joanna Thompson'},"Mon, 01 Feb 2021 13:00:05 +0000","(2021, 2, 1, 13, 0, 5, 0, 32, 0)","[{'term': 'Novel Science', 'scheme': None, 'la...",https://scienceline.org/?p=32975,False,"<p>Reimagining the past, now with 40% more gea...","{'type': 'text/html', 'language': None, 'base'...",https://scienceline.org/2021/02/novel-science-...,0,scienceline


In [17]:
# Combine the per-venue frames into one table with a fresh 0..n-1 index,
# and label that index "id".
df = pd.concat(dfs, ignore_index=True).rename_axis("id")

In [18]:
# Display the combined frame (pandas truncates the middle rows of long frames).
df

Unnamed: 0_level_0,venue,title,link,summary,author,published,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,wired,Twinkling Black Holes Reveal an Invisible Clou...,https://www.wired.com/story/twinkling-black-ho...,Cosmic radio backlights are helping scientists...,Max G. Levy,"Wed, 24 Feb 2021 14:00:00 +0000","[{'term': 'Science', 'scheme': None, 'label': ..."
1,wired,"When the Grid Goes Down, Can a Fleet of Batter...",https://www.wired.com/story/when-the-grid-goes...,"In a power crisis, maybe the solution is a net...",Gregory Barber,"Wed, 24 Feb 2021 12:00:00 +0000","[{'term': 'Science', 'scheme': None, 'label': ..."
2,wired,What a 1900s Wildlife Survey Reveals About Cli...,https://www.wired.com/story/what-a-1900s-wildl...,"A century ago, a biologist counted California'...",Jim Morrison,"Tue, 23 Feb 2021 13:00:00 +0000","[{'term': 'Science', 'scheme': None, 'label': ..."
3,wired,How to Remember a Disaster Without Being Shatt...,https://www.wired.com/story/remember-disaster-...,Margaret McKinnon survived a midair catastroph...,Erika Hayasaki,"Tue, 23 Feb 2021 12:00:00 +0000","[{'term': 'Backchannel', 'scheme': None, 'labe..."
4,wired,Perseverance’s Eyes See a Different Mars,https://www.wired.com/story/perseverances-eyes...,The Red Planet’s red looks different to an Ear...,Adam Rogers,"Tue, 23 Feb 2021 12:00:00 +0000","[{'term': 'Science', 'scheme': None, 'label': ..."
...,...,...,...,...,...,...,...
137,scienceline,A little blue pill that protects you from HIV ...,https://scienceline.org/2021/01/a-little-blue-...,<p>PrEP is close to 99% effective in preventin...,Karen Kwon,"Fri, 29 Jan 2021 13:00:39 +0000","[{'term': 'Health', 'scheme': None, 'label': N..."
138,scienceline,Death of a sourdough,https://scienceline.org/2021/01/death-of-a-sou...,<p>How a neglected sourdough starter can go fr...,Ethan Freedman,"Thu, 28 Jan 2021 17:00:47 +0000","[{'term': 'Audio', 'scheme': None, 'label': No..."
139,scienceline,Terra-farming: A possible step toward growing ...,https://scienceline.org/2021/01/terra-farming-...,<p>Scientists recently developed a substance t...,Jackie Appel,"Wed, 27 Jan 2021 13:00:19 +0000","[{'term': 'Space, Physics, and Math', 'scheme'..."
140,scienceline,Roundup may harm honeybee gut health,https://scienceline.org/2021/01/roundup-may-ha...,<p>The popular weed killer could impair honeyb...,Casey Crownhart,"Mon, 25 Jan 2021 13:00:45 +0000","[{'term': 'Environment', 'scheme': None, 'labe..."


In [13]:
# data cleaning
def extract_tag_terms(tags):
    """
    Reduce one entry's tag list to a list of its ``term`` values.

    Returns ``None`` when ``tags`` is not a list (e.g. a missing value).
    Items that are not dicts are passed through unchanged, so re-running the
    cleaning cell on an already flattened column is harmless instead of
    raising ``TypeError``.
    """
    if not isinstance(tags, list):
        return None
    return [tag["term"] if isinstance(tag, dict) else tag for tag in tags]


articles["tags"] = articles["tags"].map(extract_tag_terms)

[{'term': 'Perseverance (Mars Rover)',
  'scheme': 'http://www.nytimes.com/namespaces/keywords/des',
  'label': None},
 {'term': 'Mars (Planet)',
  'scheme': 'http://www.nytimes.com/namespaces/keywords/des',
  'label': None},
 {'term': 'Video Recordings, Downloads and Streaming',
  'scheme': 'http://www.nytimes.com/namespaces/keywords/des',
  'label': None},
 {'term': 'National Aeronautics and Space Administration',
  'scheme': 'http://www.nytimes.com/namespaces/keywords/nyt_org',
  'label': None},
 {'term': 'Space and Astronomy',
  'scheme': 'http://www.nytimes.com/namespaces/keywords/des',
  'label': None}]

In [48]:
# Display the current `articles` frame.
# NOTE(review): this renders the whole frame; `articles.head()` would keep the output short.
articles

Unnamed: 0,title,title_detail,links,link,id,guidislink,summary,summary_detail,authors,author,author_detail,published,published_parsed,tags,media_content,media_credit,credit,content,venue
0,Watch Video From NASA’s Perseverance Rover Lan...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/22/science/mar...,https://www.nytimes.com/2021/02/22/science/mar...,False,"Since arriving safely on Thursday, the spacecr...","{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Kenneth Chang'}],Kenneth Chang,{'name': 'Kenneth Chang'},"Mon, 22 Feb 2021 22:07:28 +0000","(2021, 2, 22, 22, 7, 28, 0, 53, 0)","[Perseverance (Mars Rover), Mars (Planet), Vid...","[{'height': '151', 'medium': 'image', 'url': '...","[{'content': 'NASA, via Associated Press'}]","NASA, via Associated Press","[{'type': 'text/plain', 'language': None, 'bas...",scienceline
1,She Beat Cancer at 10. Now She'll Join SpaceX'...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/22/science/spa...,https://www.nytimes.com/2021/02/22/science/spa...,False,"St. Jude Hospital and Jared Isaacman, a billio...","{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Kenneth Chang'}],Kenneth Chang,{'name': 'Kenneth Chang'},"Mon, 22 Feb 2021 18:10:23 +0000","(2021, 2, 22, 18, 10, 23, 0, 53, 0)","[Isaacman, Jared (1983- ), Private Spaceflight...",,,,,scienceline
2,Seven Hundred Leagues Beneath Titan’s Methane ...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/21/science/sat...,https://www.nytimes.com/2021/02/21/science/sat...,False,"Mars, Shmars; this voyager is looking forward ...","{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Dennis Overbye'}],Dennis Overbye,{'name': 'Dennis Overbye'},"Sun, 21 Feb 2021 22:21:33 +0000","(2021, 2, 21, 22, 21, 33, 6, 52, 0)","[Space and Astronomy, Titan (Saturn Moon), Oce...","[{'height': '151', 'medium': 'image', 'url': '...",[{'content': 'NASA/JPL-Caltech/Space Science I...,NASA/JPL-Caltech/Space Science Institute,,scienceline
3,Octopuses Have a Secret Sense to Keep Their 8 ...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/20/science/oct...,https://www.nytimes.com/2021/02/20/science/oct...,False,Even when an octopus can’t see light with its ...,"{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Richard Sima'}],Richard Sima,{'name': 'Richard Sima'},"Sat, 20 Feb 2021 10:00:13 +0000","(2021, 2, 20, 10, 0, 13, 5, 51, 0)","[Light, Research, Octopus, Animal Behavior, An...",,,,,scienceline
4,"Meet Elizabeth Ann, the First Cloned Black-Foo...","{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/18/science/bla...,https://www.nytimes.com/2021/02/18/science/bla...,False,Her birth represents the first cloning of an e...,"{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Sabrina Imbler'}],Sabrina Imbler,{'name': 'Sabrina Imbler'},"Thu, 18 Feb 2021 18:33:06 +0000","(2021, 2, 18, 18, 33, 6, 3, 49, 0)","[Cloning, Ferrets, Endangered and Extinct Spec...",,,,,scienceline
5,Meet the Newest Member of the Fluorescent Mamm...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/18/science/flu...,https://www.nytimes.com/2021/02/18/science/flu...,False,The springhare — whose coat glows a patchy pin...,"{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Cara Giaimo'}],Cara Giaimo,{'name': 'Cara Giaimo'},"Thu, 18 Feb 2021 10:00:20 +0000","(2021, 2, 18, 10, 0, 20, 3, 49, 0)","[your-feed-science, Mammals, Ultraviolet Light...",,,,,scienceline
6,A Hitchhiker’s Guide to an Ancient Geomagnetic...,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/18/science/las...,https://www.nytimes.com/2021/02/18/science/las...,False,"A shift in Earth’s poles 42,000 years ago may ...","{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Alanna Mitchell'}],Alanna Mitchell,{'name': 'Alanna Mitchell'},"Fri, 19 Feb 2021 16:00:13 +0000","(2021, 2, 19, 16, 0, 13, 4, 50, 0)","[Earth, Neanderthal Man, Archaeology and Anthr...",,,,,scienceline
7,A Famous Black Hole Gets a Massive Update,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/18/science/cyg...,https://www.nytimes.com/2021/02/18/science/cyg...,False,"Cygnus X-1, one of the first identified black ...","{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Dennis Overbye'}],Dennis Overbye,{'name': 'Dennis Overbye'},"Thu, 18 Feb 2021 23:25:55 +0000","(2021, 2, 18, 23, 25, 55, 3, 49, 0)","[Space and Astronomy, Stars and Galaxies, Blac...",,,,,scienceline
8,Where Does the Columbian Mammoth Come From?,"{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/17/science/DNA...,https://www.nytimes.com/2021/02/17/science/DNA...,False,Genomic data — the oldest ever recovered from ...,"{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Katherine Kornei'}],Katherine Kornei,{'name': 'Katherine Kornei'},"Wed, 17 Feb 2021 16:00:09 +0000","(2021, 2, 17, 16, 0, 9, 2, 48, 0)","[Mammoths (Animals), Genetics and Heredity, DN...",,,,,scienceline
9,"Hailed as Historic, Biden’s Interior Nominee N...","{'type': 'text/plain', 'language': None, 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://www.nytimes.com/2021/02/22/climate/deb...,https://www.nytimes.com/2021/02/22/climate/deb...,False,Representative Deb Haaland faces her confirmat...,"{'type': 'text/html', 'language': None, 'base'...",[{'name': 'Coral Davenport'}],Coral Davenport,{'name': 'Coral Davenport'},"Mon, 22 Feb 2021 22:17:30 +0000","(2021, 2, 22, 22, 17, 30, 0, 53, 0)","[Interior Department, Native Americans, Global...","[{'height': '151', 'medium': 'image', 'url': '...",[{'content': 'Alex Edelman/Agence France-Press...,Alex Edelman/Agence France-Presse — Getty Images,"[{'type': 'text/plain', 'language': None, 'bas...",scienceline


In [46]:
# Preview `df` serialised as JSON Lines (one JSON record per row).
# NOTE(review): this prints every row; consider `df.head()` for a short preview.
print(df.to_json(orient='records', lines=True))

{"venue":"wired","title":"Watch Video From NASA\u2019s Perseverance Rover Landing on Mars","link":"https:\/\/www.nytimes.com\/2021\/02\/22\/science\/mars-landing-nasa-video.html","summary":"Since arriving safely on Thursday, the spacecraft has been steadily transmitting visual files and other data back to Earth.","author":"Kenneth Chang","published":"Mon, 22 Feb 2021 22:07:28 +0000","tags":["Perseverance (Mars Rover)","Mars (Planet)","Video Recordings, Downloads and Streaming","National Aeronautics and Space Administration","Space and Astronomy"]}
{"venue":"wired","title":"She Beat Cancer at 10. Now She'll Join SpaceX's First Private Trip to Orbit.","link":"https:\/\/www.nytimes.com\/2021\/02\/22\/science\/spacex-hayley-arceneaux.html","summary":"St. Jude Hospital and Jared Isaacman, a billionaire entrepreneur, selected Hayley Arceneaux for a trip to orbit in a SpaceX capsule.","author":"Kenneth Chang","published":"Mon, 22 Feb 2021 18:10:23 +0000","tags":["Isaacman, Jared (1983- )","Pr

In [8]:
# Build a frame from the "entries" of `d`, rebinding the name `df`.
# NOTE(review): `d` is not defined in the visible cells - presumably a parsed
# feed; confirm where it comes from.
df = pd.DataFrame(d["entries"])