In [1]:
import datetime
from datetime import timedelta 

# Secondary Pipeline Functions

# Append a column "source" to a DataFrame, df, based on domain name from df['id']
def add_source(df):
    """Tag every row of a single-feed DataFrame with the feed it came from.

    The feed is identified from the ``id`` value of the first row only, so
    ``df`` is expected to hold the entries of exactly one feed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column of strings and at least one row.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with a ``source`` column added
        (the frame is modified in place).
    """
    # Positional lookup: `df['id'][0]` is label-based and raises KeyError
    # whenever the index does not happen to contain the label 0.
    first_id = df['id'].iloc[0]
    if first_id.startswith("https://www.reddit"):
        label = "REDDIT"
    elif first_id.startswith("https://news.ycomb"):
        label = "HNEWS"
    else:
        # Placeholder label for feeds that have no dedicated branch yet.
        label = "INSERT NEW SOURCE HERE"
    df['source'] = label
    return df

# Convert feed timestamp strings to datetime objects
def parse_hnews_time(t, local_offset_hours=-4):
    """Convert a Hacker News feed timestamp string to a naive local datetime.

    The text before the first comma is discarded and the remainder is parsed
    as ``"%d %b %Y %H:%M:%S"`` followed by an optional numeric UTC offset,
    e.g. ``"Sat, 29 Jun 2019 08:27:06 +0000"``.

    Parameters
    ----------
    t : str
        Timestamp string containing a comma before the date part.
    local_offset_hours : int or float, optional
        Hours added to the UTC time to obtain local time. The default of -4
        corresponds to US Eastern *daylight* time (EDT); Eastern standard
        time (EST) would be -5.

    Returns
    -------
    datetime.datetime
        Naive datetime shifted by ``local_offset_hours``.

    Raises
    ------
    IndexError
        If ``t`` contains no comma.
    ValueError
        If the date part does not match the expected pattern.
    """
    pattern = "%d %b %Y %H:%M:%S"
    stamp_text = t.split(',')[1].strip()
    try:
        # Honour the offset carried by the string instead of throwing it
        # away; this also accepts negative offsets such as "-0400".
        moment = datetime.datetime.strptime(stamp_text, pattern + " %z")
    except ValueError:
        # No parsable numeric offset: fall back to the earlier behaviour of
        # cutting at '+' and treating the remainder as UTC.
        moment = datetime.datetime.strptime(stamp_text.split('+')[0].strip(), pattern)
    else:
        # Normalise to UTC, then drop tzinfo so the result stays naive.
        moment = moment.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return moment + timedelta(hours=local_offset_hours)
def parse_reddit_time(t, local_offset_hours=-4):
    """Convert a Reddit feed timestamp string to a naive local datetime.

    The string is parsed as ``"%Y-%m-%dT%H:%M:%S"`` followed by an optional
    numeric UTC offset, e.g. ``"2019-06-29T08:34:41+00:00"``.

    Parameters
    ----------
    t : str
        ISO-8601-style timestamp string.
    local_offset_hours : int or float, optional
        Hours added to the UTC time to obtain local time. The default of -4
        corresponds to US Eastern *daylight* time (EDT); Eastern standard
        time (EST) would be -5.

    Returns
    -------
    datetime.datetime
        Naive datetime shifted by ``local_offset_hours``.

    Raises
    ------
    ValueError
        If ``t`` does not match the expected pattern.
    """
    pattern = "%Y-%m-%dT%H:%M:%S"
    try:
        # Honour the offset carried by the string instead of throwing it
        # away; this also accepts negative offsets such as "-07:00".
        moment = datetime.datetime.strptime(t, pattern + "%z")
    except ValueError:
        # No parsable numeric offset (or an interpreter whose %z rejects
        # "+HH:MM"): fall back to the earlier behaviour of cutting at '+'
        # and treating the remainder as UTC.
        moment = datetime.datetime.strptime(t.split('+')[0], pattern)
    else:
        # Normalise to UTC, then drop tzinfo so the result stays naive.
        moment = moment.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return moment + timedelta(hours=local_offset_hours)

In [2]:
import feedparser
import pandas as pd

# Major Pipeline Functions

def acquire(url, rdata):
    """Parse one feed and add its entries to the running collection.

    Parameters
    ----------
    url : str
        Feed location handed to ``feedparser.parse``.
    rdata : list
        Accumulator; the feed's ``entries`` are appended to it as a single
        element (one element per feed).

    Returns
    -------
    list
        The same ``rdata`` object, after the append.
    """
    feed = feedparser.parse(url)
    feed_entries = feed.entries
    rdata.append(feed_entries)
    return rdata

# Convert list of RSS responses to a pandas DataFrame
def CreateTheTable(rdata):
    """Combine per-feed RSS entry lists into a single DataFrame of posts.

    Each element of ``rdata`` is turned into its own DataFrame, labelled with
    a ``source`` column by ``add_source`` and given a ``time`` column copied
    from ``published`` (source ``"HNEWS"``) or ``updated`` (any other
    source). Feeds with no entries are skipped.

    Parameters
    ----------
    rdata : list
        One element per feed; each element is a list of entry mappings.

    Returns
    -------
    pandas.DataFrame
        All posts concatenated in feed order. The frame carries an ``index``
        column holding each row's position within its own feed, because the
        index is reset without ``drop=True``.

    Raises
    ------
    ValueError
        If every feed in ``rdata`` is empty (or ``rdata`` itself is empty).
    """
    frames = []
    for entries in rdata:
        # A feed with no entries yields a DataFrame with no columns, which
        # add_source() cannot label (it would fail with KeyError: 'id').
        # Skip it so one empty feed does not abort the whole pipeline.
        if len(entries) == 0:
            continue
        feed_df = add_source(pd.DataFrame(entries))
        # Standardise the timestamp column name across feeds.
        if feed_df.at[0, 'source'] == "HNEWS":
            feed_df['time'] = feed_df['published']
        else:
            feed_df['time'] = feed_df['updated']
        frames.append(feed_df)
    if not frames:
        raise ValueError("No RSS entries to tabulate: every feed was empty")
    post_data = pd.concat(frames, axis=0, sort=False).reset_index()
    return post_data

def format_data(posts):
    """Parse post timestamps and return a newest-first summary table.

    Each row's ``time`` string is parsed with ``parse_reddit_time`` or
    ``parse_hnews_time`` according to its ``source``. The input frame is
    left untouched, so the function can be called repeatedly on the same
    frame.

    Parameters
    ----------
    posts : pandas.DataFrame
        Must contain ``source``, ``title``, ``link`` and ``time`` columns.

    Returns
    -------
    pandas.DataFrame or bool
        A new frame with columns ``source``, ``title``, ``link``, ``time``
        sorted by ``time`` descending with a fresh 0..n-1 index. If a row has
        a source other than ``"REDDIT"``/``"HNEWS"``, that source is printed
        and ``False`` is returned instead.
    """
    new_time = []
    # Walk the two columns directly rather than building a row Series twice
    # per record via posts.iloc[i][...].
    for source, raw_time in zip(posts['source'], posts['time']):
        if source == "REDDIT":
            new_time.append(parse_reddit_time(raw_time))
        elif source == "HNEWS":
            new_time.append(parse_hnews_time(raw_time))
        else: #this should never run
            print ("New Source:", source)
            return False
    # assign() returns a copy: overwriting posts['time'] in place would make
    # a second call on the same frame fail on already-parsed values.
    formatted = posts.assign(time=new_time)
    formatted = formatted.sort_values(by=["time"], ascending=False).reset_index(drop=True)
    return formatted[['source', 'title', 'link','time']]

def visualize(df):
    """Placeholder visualisation stage: hands ``df`` back unchanged."""
    return df

#def export(df):
    #return df

In [3]:
# Main Pipeline Function

def reader(rss_urls):
    """Run the whole pipeline: acquire each feed, tabulate, then format.

    Parameters
    ----------
    rss_urls : iterable
        Feed locations, each passed to ``acquire`` in order.

    Returns
    -------
    Whatever ``format_data`` returns for the table built by
    ``CreateTheTable`` from the collected feeds.
    """
    collected = []  # one element per feed, filled by acquire()
    for feed_url in rss_urls:
        collected = acquire(feed_url, collected)
    print ('Number of RSS_Data Sources:', len(collected))
    # Merge every feed into one table, then make it human-readable.
    return format_data(CreateTheTable(collected))

# Feeds to aggregate: newest r/movies posts and newest Hacker News posts.
urls = [
    'https://www.reddit.com/r/movies/new.rss',
    'https://hnrss.org/newest',
]

# Show the 15 most recent posts across all feeds.
reader(urls).head(15)

#TODO: Add a third RSS source

Number of RSS_Data Sources: 2


Unnamed: 0,source,title,link,time
0,REDDIT,The Toy Story Theory (Karsten Runquist),https://www.reddit.com/r/movies/comments/c6wis...,2019-06-29 04:34:41
1,HNEWS,Taser Maker Says It Won't Use Facial Recogniti...,https://www.wired.com/story/taser-maker-wont-u...,2019-06-29 04:27:06
2,HNEWS,Hierarchy Is Not the Problem,https://www.linkedin.com/pulse/hierarchy-probl...,2019-06-29 04:23:12
3,REDDIT,Yesterday Easter Egg [No Spoilers],https://www.reddit.com/r/movies/comments/c6weo...,2019-06-29 04:19:38
4,HNEWS,Bioelectricity and IQ,https://philcatbioflow.com/bioelectricity-and-iq/,2019-06-29 04:19:21
5,HNEWS,Rockstar Games Will Curate an AutoMap for GTA 6,https://www.isthemessage.net/games/rockstar-ga...,2019-06-29 04:19:11
6,HNEWS,Package Hardening Asymptote,https://outflux.net/blog/archives/2019/06/27/p...,2019-06-29 04:17:57
7,HNEWS,The Most Important Skill a Programmer Can Learn,http://huseyinpolatyuruk.com/2019/05/03/the-mo...,2019-06-29 04:11:15
8,HNEWS,Secure Languages Now – Cristina Cifuentes – Pl...,https://www.youtube.com/watch?v=TOXvi6Y4Wc8,2019-06-29 04:04:50
9,HNEWS,"Carola Rackete, migrant rescue ship captain ar...",https://www.bbc.com/news/world-europe-48809134,2019-06-29 04:03:47
