In [1]:
import datetime
from datetime import timedelta
from email.utils import parsedate_to_datetime

# Secondary Pipeline Functions

# Get subreddit name from reddit URL
def get_subreddit(url):
    """Return a short "R/<NAME>" label for the subreddit a reddit URL points at.

    The subreddit is taken from the path segment that follows "/r/", so a word
    that merely appears later in the URL (for example a post slug mentioning
    "hearthstone") does not change the label.

    Args:
        url: A reddit URL containing a "/r/<subreddit>" path segment.

    Returns:
        "R/TIL" for todayilearned, "R/HS" for hearthstone, otherwise "R/"
        followed by the upper-cased subreddit name.

    Raises:
        IndexError: If the URL has no "/r/" segment.
    """
    # Subreddits that get an abbreviated label instead of their full name
    short_names = {"todayilearned": "TIL", "hearthstone": "HS"}
    # Text between "/r/" and the next "/" is the subreddit name
    subreddit = url.split("/r/", 1)[1].split("/", 1)[0]
    return "R/" + short_names.get(subreddit.lower(), subreddit.upper())

# Append a column "source" to a DataFrame, df, based on domain name from df['id']
def add_source(df):
    """Add a "source" column naming the feed that the rows of df came from.

    The label is derived from the URL in the first row of df['id'] and applied
    to every row, so df is expected to hold the entries of a single feed. The
    first row is read by position, so df does not need an index label of 0.

    Args:
        df: DataFrame with an 'id' column of URL strings and at least one row.
            It is modified in place.

    Returns:
        The same DataFrame, with the 'source' column set.
    """
    first_id = df['id'].iloc[0]
    if first_id.startswith("https://www.reddit"):
        df['source'] = get_subreddit(first_id)
    elif first_id.startswith("https://news.ycomb"):
        df['source'] = "HNEWS"
    elif first_id.startswith("https://www.macrumors"):
        df['source'] = "MAC"
    else:
        # Placeholder label for feeds that have no rule above yet
        df['source'] = "INSERT NEW SOURCE HERE"
    return df

# Convert timestamp strings to datetime objects
def parse_news_time(t, utc_offset_hours=-4):
    """Convert an RFC 2822 style feed timestamp into a naive local datetime.

    Accepts stamps such as "Thu, 27 Jun 2019 20:14:44 +0000" and
    "Thu, 27 Jun 2019 13:14:44 PDT". The zone carried by the string is
    honored (numeric offsets and the zone names the standard library
    recognizes, such as GMT or PDT) rather than assumed to be UTC or PDT.

    Args:
        t: Timestamp string from a feed entry.
        utc_offset_hours: Fixed offset of the local zone from UTC, in hours.
            The default of -4 is the offset this notebook has always applied;
            daylight-saving changes are not tracked.

    Returns:
        A datetime.datetime without tzinfo, expressed in the local zone.

    Raises:
        ValueError or TypeError (depending on the Python version) if t cannot
        be parsed as a date.
    """
    moment = parsedate_to_datetime(t)
    if moment.tzinfo is None:
        # No usable zone in the string: read the clock time as UTC
        moment = moment.replace(tzinfo=datetime.timezone.utc)
    local_zone = datetime.timezone(timedelta(hours=utc_offset_hours))
    # Drop tzinfo so results stay comparable with other naive datetimes
    return moment.astimezone(local_zone).replace(tzinfo=None)
def parse_reddit_time(t, utc_offset_hours=-4):
    """Convert an ISO 8601 style timestamp into a naive local datetime.

    Accepts stamps such as "2019-06-27T19:45:46+00:00". The UTC offset in the
    string is honored; a stamp with no offset, or with a trailing "Z", is
    read as UTC.

    Args:
        t: Timestamp string of the form "YYYY-MM-DDTHH:MM:SS" optionally
            followed by "Z" or a "+HH:MM" / "-HH:MM" offset.
        utc_offset_hours: Fixed offset of the local zone from UTC, in hours.
            The default of -4 is the offset this notebook has always applied;
            daylight-saving changes are not tracked.

    Returns:
        A datetime.datetime without tzinfo, expressed in the local zone.

    Raises:
        ValueError: If t does not match the expected layout.
    """
    pattern = "%Y-%m-%dT%H:%M:%S%z"
    day, sep, clock = t.strip().partition("T")
    # The zone designator starts at the first "+", "-" or "Z" after the "T"
    cut = len(clock)
    for mark in "+-Z":
        found = clock.find(mark)
        if found != -1:
            cut = min(cut, found)
    # %z wants "+HHMM", so drop the colon and spell out UTC explicitly
    zone = clock[cut:].replace(":", "")
    if zone in ("", "Z"):
        zone = "+0000"
    moment = datetime.datetime.strptime(day + sep + clock[:cut] + zone, pattern)
    local_zone = datetime.timezone(timedelta(hours=utc_offset_hours))
    # Drop tzinfo so results stay comparable with other naive datetimes
    return moment.astimezone(local_zone).replace(tzinfo=None)

In [2]:
import feedparser
import pandas as pd

# Major Pipeline Functions

def acquire(url, rdata):
    """Parse the feed at url and add its entries to the running collection.

    Args:
        url: Address of the feed, handed to feedparser.parse.
        rdata: List that accumulates one entries object per feed. It is
            appended to in place.

    Returns:
        The same rdata list, now ending with this feed's entries.
    """
    rdata.append(feedparser.parse(url).entries)
    return rdata

# Convert list of RSS responses to a pandas DataFrame
def CreateTheTable(rdata):
    """Combine per-feed RSS entry lists into a single DataFrame of posts.

    Each feed's entries become a DataFrame that gains a 'source' column (via
    add_source) and a 'time' column copied from 'published' for HNEWS/MAC
    feeds or from 'updated' for every other feed. Feeds with no entries are
    skipped, since there is no first row to classify.

    Args:
        rdata: List with one entries collection per feed.

    Returns:
        The concatenated DataFrame, with each feed's original row labels kept
        in an 'index' column, or False if a feed that needs 'updated' does
        not have that column.

    Raises:
        ValueError: From pd.concat when no feed contributed any rows.
    """
    # Per-feed DataFrames, merged at the end
    posts = []
    for entry in rdata:
        if len(entry) == 0:
            print ("Skipping feed with no entries")
            continue
        df = pd.DataFrame(entry)
        # Add new 'source' column
        df = add_source(df)
        # Add new standardized 'time' column
        if df.at[0, 'source'] in ("HNEWS", "MAC"):
            df['time'] = df['published']
        else:
            try:
                df['time'] = df['updated']
            except KeyError:
                # Show which feed lacks 'updated', then signal failure
                print (df.iloc[0]['id'])
                print (df.iloc[0]['published'])
                return False
        posts.append(df)
    # Merge list of DataFrames together with concat
    post_data = pd.concat(posts, axis=0, sort=False).reset_index()
    return post_data

def format_data(posts):
    """Parse each post's 'time' and return the posts ordered newest first.

    Rows whose 'source' contains "R/" are parsed with parse_reddit_time;
    "HNEWS" and "MAC" rows are parsed with parse_news_time. The 'time' column
    of posts is overwritten in place with the parsed values.

    Args:
        posts: DataFrame with 'source', 'title', 'link' and 'time' columns.

    Returns:
        A DataFrame with columns source, title, link and time, sorted by time
        descending, or False (after printing the label) when a row has a
        source that matches none of the rules.
    """
    parsed_times = []
    for _, record in posts.iterrows():
        origin = record['source']
        if "R/" in origin:
            parser = parse_reddit_time
        elif origin in ("HNEWS", "MAC"):
            parser = parse_news_time
        else:
            # Unrecognized feed label: report it and give up
            print ("New Source:", origin)
            return False
        parsed_times.append(parser(record['time']))
    posts['time'] = parsed_times
    newest_first = posts.sort_values(by=["time"], ascending=False).reset_index()
    return newest_first[['source', 'title', 'link', 'time']]

def visualize(df):
    """Placeholder pipeline stage: hand df back unchanged."""
    return df

def export(df):
    """Placeholder pipeline stage: hand df back unchanged."""
    return df

In [5]:
# Main Pipeline Function

def reader(rss_urls):
    """Fetch every feed in rss_urls and return one table of posts.

    Runs the pipeline acquire -> CreateTheTable -> format_data and prints how
    many feeds were collected along the way.

    Args:
        rss_urls: Iterable of feed URLs.

    Returns:
        The DataFrame produced by format_data, or False when CreateTheTable
        or format_data reports a failure by returning False.
    """
    rss_data = [] # one entries collection per URL
    for url in rss_urls:
        rss_data = acquire(url, rss_data)
    print ('Number of RSS_Data Sources:', len(rss_data))
    # Convert RSS entries to a single DataFrame containing all posts
    posts_table = CreateTheTable(rss_data)
    if posts_table is False:
        # Table building failed; format_data cannot work on False
        return False
    # Format list of posts into a human-readable table
    return format_data(posts_table)

# Feeds to aggregate; brackets already allow the list to span lines
urls = [
    'https://www.reddit.com/r/movies/new.rss',
    'https://www.reddit.com/r/tezos/new.rss',
    'https://www.reddit.com/r/gaming/new.rss',
    'https://www.reddit.com/r/todayilearned/new.rss',
    'https://www.reddit.com/r/wow/new.rss',
    'https://www.reddit.com/r/hearthstone/new.rss',
    'https://www.reddit.com/r/science/new.rss',
    'http://feeds.macrumors.com/MacRumors-All',
    'https://hnrss.org/newest',
]

reader(urls)

Number of RSS_Data Sources: 9


Unnamed: 0,source,title,link,time
200,MAC,JCPenney Has Reinstated Apple Pay in All Retai...,https://www.macrumors.com/2019/06/27/jcpenney-...,2019-06-27 16:14:44
201,MAC,Apple Shares New AirPods Ad Highlighting Wirel...,https://www.macrumors.com/2019/06/27/apple-air...,2019-06-27 15:56:30
202,R/TEZOS,Please Post Price and Trading Discussions to r...,https://www.reddit.com/r/tezos/comments/c6a1zn...,2019-06-27 15:45:46
203,R/TEZOS,Who's buying at this price?,https://www.reddit.com/r/tezos/comments/c695xh...,2019-06-27 14:34:03
204,MAC,Apple Music Now Has 60 Million Paid Subscribers,https://www.macrumors.com/2019/06/27/apple-mus...,2019-06-27 14:28:12
205,R/TEZOS,Log of all Activation Hashs and Stuff,https://www.reddit.com/r/tezos/comments/c690k1...,2019-06-27 14:21:58
206,MAC,Google Maps for iOS Gains Live Traffic Info fo...,https://www.macrumors.com/2019/06/27/google-ma...,2019-06-27 13:52:13
207,MAC,Review: Ultimate Ears' Wonderboom 2 Speaker Of...,https://www.macrumors.com/review/ultimate-ears...,2019-06-27 13:15:08
208,MAC,Apple Head of Security Engineering to Speak Ab...,https://www.macrumors.com/2019/06/27/apple-sec...,2019-06-27 13:02:45
209,MAC,Mophie Juice Pack Air Battery Cases for iPhone...,https://www.macrumors.com/2019/06/27/mophie-ju...,2019-06-27 12:51:06
