In [1]:
import datetime
from datetime import timedelta
from email.utils import parsedate_to_datetime

# Secondary Pipeline Functions

# Get subreddit name from reddit URL
def get_subreddit(url):
    """Return a short "R/<NAME>" label for the subreddit named in a reddit URL.

    The subreddit is the path segment that directly follows "/r/". Two
    subreddits have hand-picked abbreviations ("todayilearned" -> "R/TIL",
    "hearthstone" -> "R/HS"); every other name is simply upper-cased.

    Args:
        url: A reddit URL containing a "/r/<subreddit>" path segment.

    Returns:
        The label string, e.g. "R/MOVIES".

    Raises:
        ValueError: If the URL has no "/r/<subreddit>" segment.
    """
    # Anchor on "/r/" rather than the bare "r/": splitting on "r/" also cuts
    # right after any name that ends in "r" (".../r/poker/..." gave "R/POKE").
    _, marker, tail = url.partition("/r/")
    name = tail.split("/")[0]
    if not marker or not name:
        raise ValueError("no subreddit found in URL: %r" % (url,))
    # Abbreviate on the subreddit name itself, so a post slug that merely
    # mentions "hearthstone" or "todayilearned" is not mislabelled.
    abbreviations = {"todayilearned": "TIL", "hearthstone": "HS"}
    return "R/" + abbreviations.get(name.lower(), name.upper())

# Append a column "source" to a DataFrame, df, based on domain name from df['id']
def add_source(df):
    """Tag every row of a feed's DataFrame with a short label for its origin.

    The origin is decided from the first value of df['id'] only; the resulting
    label is written to the 'source' column of every row.

    Args:
        df: DataFrame with an 'id' column of URL strings and at least one row.

    Returns:
        The same DataFrame object, modified in place with the 'source' column
        set to a subreddit label (via get_subreddit), "HNEWS", or the
        placeholder "INSERT NEW SOURCE HERE" for an unrecognised domain.

    Raises:
        KeyError: If df has no 'id' column.
        IndexError: If df has no rows.
    """
    # Read the first id by position: a label lookup (df['id'][0]) fails on any
    # frame whose index does not happen to contain the label 0.
    first_id = df['id'].iloc[0]
    if first_id.startswith("https://www.reddit"):
        df['source'] = get_subreddit(first_id)
    elif first_id.startswith("https://news.ycomb"):
        df['source'] = "HNEWS"
    else:
        df['source'] = "INSERT NEW SOURCE HERE"
    return df

# Convert timestamp strings to datetime objects
def parse_hnews_time(t, utc_offset_hours=-4):
    """Parse an RFC 2822 style date string into a naive local datetime.

    Example input: "Sat, 29 Jun 2019 08:55:36 +0000".

    The UTC offset carried by the string is honoured (the timestamp is first
    normalised to UTC) instead of being discarded, so strings with a negative
    offset or a zone name such as "GMT" parse correctly too. A string without
    any zone information is treated as UTC.

    Args:
        t: The date string to parse.
        utc_offset_hours: Hours added to the UTC time to get the displayed
            local time. The default of -4 is UTC-4 (US Eastern daylight time);
            it is a fixed shift and does not follow DST changes.

    Returns:
        A naive datetime.datetime expressed at the requested offset.
    """
    parsed = parsedate_to_datetime(t)
    if parsed.tzinfo is not None:
        # Normalise to UTC, then drop tzinfo so the result stays a naive
        # datetime like the one this function has always returned.
        parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return parsed + timedelta(hours=utc_offset_hours)
def parse_reddit_time(t, utc_offset_hours=-4):
    """Parse an ISO 8601 timestamp string into a naive local datetime.

    Example input: "2019-06-29T08:59:41+00:00".

    The UTC offset carried by the string is honoured (the timestamp is first
    normalised to UTC) instead of being cut off, so negative offsets such as
    "-04:00" parse correctly. A string without an offset is treated as UTC.

    Args:
        t: The timestamp string to parse.
        utc_offset_hours: Hours added to the UTC time to get the displayed
            local time. The default of -4 is UTC-4 (US Eastern daylight time);
            it is a fixed shift and does not follow DST changes.

    Returns:
        A naive datetime.datetime expressed at the requested offset.
    """
    # Older Python versions of fromisoformat() reject the "Z" suffix, so spell
    # it as an explicit zero offset first.
    if t.endswith(("Z", "z")):
        t = t[:-1] + "+00:00"
    parsed = datetime.datetime.fromisoformat(t)
    if parsed.tzinfo is not None:
        # Normalise to UTC, then drop tzinfo so the result stays a naive
        # datetime like the one this function has always returned.
        parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return parsed + timedelta(hours=utc_offset_hours)

In [2]:
import feedparser
import pandas as pd

# Major Pipeline Functions

def acquire(url, rdata):
    """Fetch one feed and add its entries to the running collection.

    Args:
        url: Address of the feed, handed to feedparser.parse().
        rdata: List collecting one item (the parsed feed's entries) per feed.
            It is appended to in place.

    Returns:
        The same rdata list, now one item longer.
    """
    entries = feedparser.parse(url).entries
    rdata.append(entries)
    return rdata

# Convert list of RSS responses to a pandas DataFrame
def CreateTheTable(rdata):
    """Combine the entries of several RSS feeds into one DataFrame.

    Each feed is turned into its own DataFrame, labelled with a 'source'
    column (via add_source) and given a unified 'time' column copied from
    'published' for "HNEWS" feeds and from 'updated' for all others.

    Args:
        rdata: List with one item per feed; each item is that feed's list of
            entries. Feeds with no entries are skipped.

    Returns:
        A single DataFrame holding the rows of every non-empty feed. The
        per-feed row numbers are kept in an 'index' column by reset_index().

    Raises:
        ValueError: If no feed contributed any entries.
    """
    posts = []
    for entries in rdata:
        # A feed with no entries would become a frame without an 'id' column,
        # which add_source cannot label; one empty feed must not abort the
        # whole run, so skip it.
        if len(entries) == 0:
            continue
        df = pd.DataFrame(entries)
        # Add new 'source' column
        df = add_source(df)
        # Add new standardized 'time' column
        time_column = 'published' if df['source'].iloc[0] == "HNEWS" else 'updated'
        df['time'] = df[time_column]
        posts.append(df)
    if not posts:
        raise ValueError("no RSS entries to tabulate: every feed was empty")
    # Merge list of DataFrames together with concat
    post_data = pd.concat(posts, axis=0, sort=False).reset_index()
    return post_data

def format_data(posts):
    """Parse each post's 'time' string and return the posts newest-first.

    Rows whose 'source' contains "R/" are parsed with parse_reddit_time, rows
    labelled "HNEWS" with parse_hnews_time. The parsed values replace the
    'time' column of the frame that was passed in.

    Args:
        posts: DataFrame with at least 'source', 'title', 'link' and 'time'
            columns.

    Returns:
        A DataFrame with the columns 'source', 'title', 'link' and 'time',
        sorted by 'time' in descending order. If a row has an unrecognised
        source, the source is printed and False is returned instead, leaving
        posts untouched.
    """
    parsed_times = []
    for _, post in posts.iterrows():
        label = post['source']
        if "R/" in label:
            parsed_times.append(parse_reddit_time(post['time']))
        elif label == "HNEWS":
            parsed_times.append(parse_hnews_time(post['time']))
        else:
            # Not expected to happen: every known feed maps to one of the
            # two branches above.
            print ("New Source:", label)
            return False
    posts['time'] = parsed_times
    ordered = posts.sort_values(by=["time"], ascending=False)
    ordered = ordered.reset_index()
    return ordered[['source', 'title', 'link','time']]

def visualize(df):
    """Placeholder for a visualisation step; hands back its argument unchanged."""
    return df

#def export(df):
    #return df

In [3]:
# Main Pipeline Function

def reader(rss_urls):
    """Run the whole pipeline: fetch every feed, tabulate, then format.

    Args:
        rss_urls: Iterable of feed URLs.

    Returns:
        Whatever format_data returns for the combined table of posts.
    """
    feeds = []  # one item of entries per URL, filled by acquire()
    for feed_url in rss_urls:
        feeds = acquire(feed_url, feeds)
    print('Number of RSS_Data Sources:', len(feeds))
    # Build one table from all feeds, then make it human-readable
    combined = CreateTheTable(feeds)
    return format_data(combined)

# Feeds to read; line breaks inside the brackets need no backslash continuation
urls = [
    'https://www.reddit.com/r/movies/new.rss',
    'https://www.reddit.com/r/tezos/new.rss',
    'https://www.reddit.com/r/gaming/new.rss',
    'https://www.reddit.com/r/todayilearned/new.rss',
    'https://www.reddit.com/r/wow/new.rss',
    'https://www.reddit.com/r/hearthstone/new.rss',
    'https://www.reddit.com/r/science/new.rss',
    'https://hnrss.org/newest',
]

reader(urls).head(50) #display the first 50 posts

#TODO: Add a third RSS source

Number of RSS_Data Sources: 8


Unnamed: 0,source,title,link,time
0,R/TIL,TIL about a site that summarize the terms of s...,https://www.reddit.com/r/todayilearned/comment...,2019-06-29 04:59:41
1,R/MOVIES,Is there a place to watch all avengers movies?,https://www.reddit.com/r/movies/comments/c6wox...,2019-06-29 04:58:05
2,R/MOVIES,To Live and Die in LA is an under appreciated ...,https://www.reddit.com/r/movies/comments/c6wov...,2019-06-29 04:57:52
3,R/GAMING,The only thing i don't like about cadence is h...,https://www.reddit.com/r/gaming/comments/c6woh...,2019-06-29 04:56:10
4,R/GAMING,I feel like this is how E3 went.,https://www.reddit.com/r/gaming/comments/c6wof...,2019-06-29 04:56:00
5,HNEWS,Synapse: Matrix homeserver for federated IM,https://github.com/matrix-org/synapse/,2019-06-29 04:55:36
6,R/GAMING,Steams Summer Sale mini game,https://www.reddit.com/r/gaming/comments/c6wo9...,2019-06-29 04:55:12
7,HNEWS,GraphQL – The Pros and the Cons,https://ednsquare.com/story/graphql-the-pros-a...,2019-06-29 04:54:39
8,R/TIL,TIL The first baseball series champions were E...,https://www.reddit.com/r/todayilearned/comment...,2019-06-29 04:52:56
9,R/MOVIES,Then and now - the action movie heroes of my y...,https://www.reddit.com/r/movies/comments/c6wn8...,2019-06-29 04:51:28
