In [1]:
import datetime
from datetime import timedelta
from email.utils import parsedate_tz, mktime_tz
import pytz

# Secondary Pipeline Functions

# Get subreddit name from reddit URL
def get_subreddit(url):
    """Return the subreddit label ("r/<name>") embedded in a reddit URL.

    The URL is cut into "/"-separated segments and the segment following the
    first standalone "r" segment is used as the subreddit name. Matching a
    whole segment (instead of the substring "r/") keeps names that end in
    "r" intact: ".../r/programmer/new.rss" gives "r/programmer" rather than
    "r/programme", and paths such as ".../user/<name>" are no longer mistaken
    for a subreddit.

    Parameters
    ----------
    url : str
        A reddit URL, e.g. "https://www.reddit.com/r/science/new.rss".

    Returns
    -------
    str
        The subreddit name prefixed with "r/".

    Raises
    ------
    IndexError
        If the URL has no "r" segment followed by another segment.
    """
    segments = url.split("/")
    try:
        marker = segments.index("r")
    except ValueError:
        # Keep IndexError as the failure type for URLs without an "r/" part
        raise IndexError("no '/r/<subreddit>' segment in URL: %r" % (url,)) from None
    return "r/" + segments[marker + 1]

# Append a column "source" to a DataFrame, df, based on domain name from df['id']
def add_source(df):
    """Add a 'source' column to `df`, derived from the row labelled 0 of df['id'].

    The id is matched by URL prefix: reddit ids are labelled with their
    subreddit (via get_subreddit), Hacker News and Mac Rumors ids get a fixed
    label, and anything else gets a placeholder label. The same label is
    written to every row. `df` is modified in place and also returned.
    """
    first_id = df['id'][0]
    fixed_labels = (
        ("https://news.ycomb", "Hacker News"),
        ("https://www.macrumors", "Mac Rumors"),
    )
    if first_id.startswith("https://www.reddit"):
        label = get_subreddit(first_id)
    else:
        # First matching prefix wins; unknown sites fall back to the placeholder
        label = next(
            (name for prefix, name in fixed_labels if first_id.startswith(prefix)),
            "INSERT NEW SOURCE HERE",
        )
    df['source'] = label
    return df

# Convert feed timestamp strings to 'YYYY-MM-DD HH:MM:SS' strings (intended as US Eastern local time)
def parse_news_time(t):
    """Convert an RFC 2822 style date string to a US/Eastern 'YYYY-MM-DD HH:MM:SS' string.

    Parameters
    ----------
    t : str
        Date text as accepted by email.utils.parsedate_tz, e.g.
        "Sun, 30 Jun 2019 19:22:25 +0000".

    Returns
    -------
    str
        The same instant expressed in the US/Eastern zone, formatted as
        '%Y-%m-%d %H:%M:%S'.

    Raises
    ------
    ValueError
        If `t` cannot be parsed as a date.
    """
    # Local alias: a later cell runs `from datetime import datetime`, which
    # rebinds the notebook-level name `datetime` to the class and would make
    # `datetime.datetime` fail when this function is called afterwards.
    import datetime as _datetime

    parsed = parsedate_tz(t)
    if parsed is None:
        # parsedate_tz signals an unparseable string by returning None
        raise ValueError("unrecognized date string: %r" % (t,))
    epoch_seconds = mktime_tz(parsed)
    eastern = _datetime.datetime.fromtimestamp(epoch_seconds, pytz.timezone('US/Eastern'))
    return eastern.strftime('%Y-%m-%d %H:%M:%S')

def parse_reddit_time(t):
    """Convert a reddit feed timestamp to a US/Eastern 'YYYY-MM-DD HH:MM:SS' string.

    The text before the first '+' is parsed with "%Y-%m-%dT%H:%M:%S"; the
    discarded offset is assumed to be +00:00 (TODO confirm against the feed).
    The conversion uses the US/Eastern zone rules instead of a fixed -4 hour
    shift, so timestamps outside daylight-saving time are shifted by -5 hours
    and the result agrees with parse_news_time.

    Parameters
    ----------
    t : str
        Timestamp such as "2019-06-30T19:12:07+00:00".

    Returns
    -------
    str
        The Eastern wall-clock time formatted as '%Y-%m-%d %H:%M:%S'.

    Raises
    ------
    ValueError
        If the part before '+' does not match "%Y-%m-%dT%H:%M:%S".
    """
    # Local alias: a later cell runs `from datetime import datetime`, which
    # rebinds the notebook-level name `datetime` and would break
    # `datetime.datetime` here.
    import datetime as _datetime

    naive_utc = _datetime.datetime.strptime(t.split('+')[0], "%Y-%m-%dT%H:%M:%S")
    # Attach UTC, then let pytz apply the offset that was in force on that date
    posted_eastern = pytz.utc.localize(naive_utc).astimezone(pytz.timezone('US/Eastern'))
    return posted_eastern.strftime('%Y-%m-%d %H:%M:%S')

In [2]:
import feedparser
import pandas as pd

# Major Pipeline Functions

def acquire(url, rdata):
    """Parse the feed at `url` with feedparser and append its entries to `rdata`.

    `rdata` is extended in place with one item per call (the parsed feed's
    `entries`) and is also returned.
    """
    feed = feedparser.parse(url)
    rdata += [feed.entries]
    return rdata

# Convert list of RSS responses to a pandas DataFrame
def CreateTheTable(rdata):
    """Combine per-feed RSS entry lists into one DataFrame.

    Each item of `rdata` (the entries of one feed) becomes a DataFrame that
    gets a 'source' column from add_source and a 'time' column copied from
    'published' (Hacker News, Mac Rumors) or 'updated' (all other sources).
    The per-feed frames are then concatenated.

    Parameters
    ----------
    rdata : list
        One list of feed entries per feed, as collected by acquire.

    Returns
    -------
    pandas.DataFrame or bool
        The concatenated posts; the previous per-feed index is kept as an
        'index' column. Returns False, after printing the first id and
        published value of the offending feed, when a feed that is not
        Hacker News / Mac Rumors has no 'updated' column.

    Raises
    ------
    ValueError
        Raised by pandas.concat when no feed contributed any entries.
    """
    published_sources = ("Hacker News", "Mac Rumors")
    posts = []
    for entry in rdata:
        if len(entry) == 0:
            # A feed without entries has no 'id' column to classify; skip it
            continue
        df = pd.DataFrame(entry)
        # Add new 'source' column
        df = add_source(df)
        # Add new standardized 'time' column
        if df.at[0, 'source'] in published_sources:
            df['time'] = df['published']
        elif 'updated' in df.columns:
            df['time'] = df['updated']
        else:
            # Explicit column check replaces a bare `except:`; .get keeps the
            # diagnostics from raising when 'published' is missing as well
            first_row = df.iloc[0]
            print (first_row['id'])
            print (first_row.get('published'))
            return False
        posts.append(df)
    # Merge list of DataFrames together with concat
    post_data = pd.concat(posts, axis=0, sort=False).reset_index()
    return post_data

def format_data(posts):
    """Normalize the 'time' column and return the posts newest-first.

    Each row's 'time' is converted with parse_reddit_time when its 'source'
    contains "r/", or with parse_news_time for "Hacker News" / "Mac Rumors".
    The 'time' column of `posts` is overwritten in place with the converted
    strings before sorting.

    Parameters
    ----------
    posts : pandas.DataFrame or bool
        Table with at least 'source', 'title', 'link' and 'time' columns, or
        the False failure marker returned by CreateTheTable.

    Returns
    -------
    pandas.DataFrame or bool
        Columns ['source', 'title', 'link', 'time'] sorted by 'time'
        descending with a fresh index. Returns False when `posts` is False,
        or (after printing the source) when a row has an unrecognized source;
        in both cases `posts` is left unmodified.
    """
    if posts is False:
        # Propagate the upstream failure marker instead of failing on .shape
        return False
    new_time = []
    # Walk the two columns directly rather than building a row per .iloc lookup
    for source, raw_time in zip(posts['source'], posts['time']):
        if "r/" in source:
            new_time.append(parse_reddit_time(raw_time))
        elif source in ("Hacker News", "Mac Rumors"):
            new_time.append(parse_news_time(raw_time))
        else: #this should never run
            print ("New Source:", source)
            return False
    posts['time'] = new_time
    posts = posts.sort_values(by=["time"], ascending=False).reset_index()
    return posts[['source', 'title', 'link','time']]

def visualize(df):
    """Placeholder pipeline stage: hands `df` back unchanged."""
    return df

def export(df):
    """Placeholder pipeline stage: hands `df` back unchanged."""
    return df

In [3]:
# Main Pipeline Function

def reader (rss_urls):
    """Run the RSS pipeline: fetch every URL, build the table, format it.

    Prints the number of collected feeds and returns whatever format_data
    returns for the table built by CreateTheTable.
    """
    collected = []  # one item per URL, appended by acquire
    for feed_url in rss_urls:
        collected = acquire(feed_url, collected)
    print ('Number of RSS_Data Sources:', len(collected))
    # Entries -> one combined DataFrame -> human-readable, time-sorted table
    return format_data(CreateTheTable(collected))

# Feeds to aggregate; entries that are commented out are currently disabled
urls = [
    #'https://www.reddit.com/r/movies/new.rss',
    'https://www.reddit.com/r/tezos/new.rss',
    #'https://www.reddit.com/r/gaming/new.rss',
    #'https://www.reddit.com/r/todayilearned/new.rss',
    #'https://www.reddit.com/r/wow/new.rss',
    #'https://www.reddit.com/r/hearthstone/new.rss',
    'https://www.reddit.com/r/science/new.rss',
    'https://hnrss.org/frontpage',
    #'https://hnrss.org/newest'
    'http://feeds.macrumors.com/MacRumors-All',
]

rss_feed = reader(urls)
#rss_feed.head()

Number of RSS_Data Sources: 4


In [4]:
import json
import tweepy

# Authenticate with Twitter API
import os

# Location of the JSON file with the Twitter API keys. The TWITTER_CREDENTIALS
# environment variable overrides the default path, so the notebook can run on
# another machine without editing this cell.
credentials_path = os.environ.get("TWITTER_CREDENTIALS", "/home/cpg/.ssh/twitter_credentials.json")
with open(credentials_path, "r") as file:
    creds = json.load(file)

# The file must provide the keys 'CKEY' and 'CSEC' (passed to OAuthHandler)
# and 'ATOK' and 'ASEC' (passed to set_access_token)
auth = tweepy.OAuthHandler(creds['CKEY'], creds['CSEC'])
auth.set_access_token(creds['ATOK'], creds['ASEC'])
api = tweepy.API(auth)

In [5]:
from datetime import datetime
from email.utils import parsedate_tz, mktime_tz
import pytz


# Get feed from twitter
# Collect recent tweets of each account as [source, title, link, time] rows,
# with the creation time converted to a US/Eastern 'YYYY-MM-DD HH:MM:SS' string.
# Uses `api` from the authentication cell and `datetime` (the class) imported
# at the top of this cell.
tweets_data = []
feeds = ['elonmusk', 'realDonaldTrump', 'sciencemagazine', 'NatureNews']
eastern = pytz.timezone('US/Eastern')  # same for every tweet, so built once
for feed in feeds:
    # Up to 10 most recent tweets, retweets included. The separate get_user
    # lookup was dropped: its result was never used and it cost one extra API
    # request per account.
    timeline = api.user_timeline(screen_name = feed, count = 10, include_rts = True)
    for tweet in timeline:
        payload = tweet._json
        link = "https://twitter.com/" + feed + "/status/" + str(payload["id"])
        posted = datetime.fromtimestamp(mktime_tz(parsedate_tz(payload["created_at"])), eastern)
        tweets_data.append(["@" + feed, payload["text"], link, posted.strftime('%Y-%m-%d %H:%M:%S')])

tweets_data[3]

['@elonmusk',
 'Huge thanks to all Tesla supporters around the world helping the cause. We ♥️♥️♥️ you!! https://t.co/WJqU03wrNH',
 'https://twitter.com/elonmusk/status/1145044275916234752',
 '2019-06-29 15:00:03']

In [6]:
import pandas as pd
# Tabulate the collected tweet rows, newest first, using the same four
# columns as the RSS table
column_names = ['source', 'title', 'link', 'time']
tweets = (
    pd.DataFrame(tweets_data, columns=column_names)
    .sort_values(by=['time'], ascending=False)
    .reset_index(drop=True)
)
tweets.head()

Unnamed: 0,source,title,link,time
0,@sciencemagazine,Tau #proteins spread twice as fast in older br...,https://twitter.com/sciencemagazine/status/114...,2019-06-30 16:00:09
1,@NatureNews,It would have taken divers a month to map the ...,https://twitter.com/NatureNews/status/11454067...,2019-06-30 15:00:12
2,@sciencemagazine,Deciding to leave a Ph.D. program can be a dif...,https://twitter.com/sciencemagazine/status/114...,2019-06-30 14:00:24
3,@NatureNews,Concerns about e-cigarettes have mushroomed an...,https://twitter.com/NatureNews/status/11453840...,2019-06-30 13:30:05
4,@NatureNews,Pollution is not the same the world over. The ...,https://twitter.com/NatureNews/status/11453614...,2019-06-30 12:00:14


In [8]:
# Merge the RSS posts and the tweets into one feed ordered by time, newest first
final_feed = (
    pd.concat([rss_feed, tweets])
    .sort_values(by=['time'], ascending=False)
    .reset_index(drop=True)
)
final_feed

Unnamed: 0,source,title,link,time
0,@sciencemagazine,Tau #proteins spread twice as fast in older br...,https://twitter.com/sciencemagazine/status/114...,2019-06-30 16:00:09
1,Hacker News,Chinese scientists guilty of ‘researching whil...,https://www.scmp.com/magazines/post-magazine/l...,2019-06-30 15:22:25
2,Mac Rumors,OpenID Foundation Claims 'Sign In with Apple' ...,https://www.macrumors.com/2019/06/30/openid-cl...,2019-06-30 15:14:14
3,r/science,Study found that it takes 300 milliseconds to ...,https://www.reddit.com/r/science/comments/c7i9...,2019-06-30 15:12:07
4,r/science,Investments in public transport lead to a subs...,https://www.reddit.com/r/science/comments/c7i9...,2019-06-30 15:10:35
5,@NatureNews,It would have taken divers a month to map the ...,https://twitter.com/NatureNews/status/11454067...,2019-06-30 15:00:12
6,Hacker News,Man completes first round-the-world autogyro f...,https://www.belfasttelegraph.co.uk/news/northe...,2019-06-30 14:36:40
7,Hacker News,Surprise collapse of regional Chinese bank set...,https://www.axios.com/baoshang-bank-china-cent...,2019-06-30 14:06:40
8,@sciencemagazine,Deciding to leave a Ph.D. program can be a dif...,https://twitter.com/sciencemagazine/status/114...,2019-06-30 14:00:24
9,Hacker News,"Rising US Inequality: How We Got Here, Where W...",https://www.gsb.stanford.edu/insights/rising-u...,2019-06-30 13:55:35
