In [1]:
import datetime
from datetime import timedelta 

# Secondary Pipeline Functions

# Get subreddit name from reddit URL
def get_subreddit(url):
    """Return a short display label (e.g. ``"R/MOVIES"``) for a reddit URL.

    The subreddit is read from the path segment that directly follows the
    ``r`` segment (``.../r/<subreddit>/...``). Looking only at that segment
    means a word that merely appears later in the URL -- for example a post
    slug containing "todayilearned" -- is never mistaken for the subreddit.

    Args:
        url: A reddit URL containing an ``/r/<subreddit>`` path.

    Returns:
        ``"R/TIL"`` for r/todayilearned, ``"R/HS"`` for r/hearthstone,
        otherwise ``"R/"`` followed by the upper-cased subreddit name.

    Raises:
        ValueError: If the URL has no ``/r/<subreddit>`` path segment.
    """
    # Abbreviations for subreddits whose full names are too long for the table
    short_labels = {"todayilearned": "R/TIL", "hearthstone": "R/HS"}

    segments = url.split("/")
    try:
        subreddit = segments[segments.index("r") + 1]
    except (ValueError, IndexError):
        raise ValueError("No subreddit found in URL: " + repr(url)) from None
    if not subreddit:
        raise ValueError("No subreddit found in URL: " + repr(url))

    # Match abbreviations case-insensitively; everything else is upper-cased
    return short_labels.get(subreddit.lower(), "R/" + subreddit.upper())

# Append a column "source" to a DataFrame, df, based on domain name from df['id']
def add_source(df):
    """Tag every row of ``df`` with a 'source' label derived from its first id.

    Only the id in the row labelled 0 is inspected; the resulting label is
    written to the whole 'source' column. ``df`` is modified in place and
    also returned.

    Args:
        df: DataFrame with an 'id' column of URL strings.

    Returns:
        The same DataFrame, now carrying a 'source' column.
    """
    first_id = df['id'][0]
    # (URL prefix, label) pairs for the non-reddit feeds, checked in order
    fixed_labels = (
        ("https://news.ycomb", "HNEWS"),
        ("https://www.macrumors", "MAC"),
    )

    if first_id.startswith("https://www.reddit"):
        label = get_subreddit(first_id)
    else:
        label = "INSERT NEW SOURCE HERE"
        for prefix, name in fixed_labels:
            if first_id.startswith(prefix):
                label = name
                break

    df['source'] = label
    return df

# Convert feed timestamp strings to datetime objects shifted to local time
def parse_news_time(t, local_offset_hours=-4):
    """Parse an RFC 2822 style feed timestamp into a naive local datetime.

    Accepts strings such as ``"Fri, 12 Jul 2019 20:44:30 +0000"`` or
    ``"Fri, 12 Jul 2019 13:30:00 PDT"``. The zone in the string is honoured
    (numeric offsets of either sign, plus the zone abbreviations known to
    ``email.utils``), then the instant is shifted to a fixed local offset.

    Args:
        t: Timestamp string as found in the feed's 'published' field.
        local_offset_hours: Offset of the display time zone from UTC, in
            hours. Defaults to -4, the offset this notebook has always used.

    Returns:
        A naive ``datetime.datetime`` expressed in the local offset.

    Raises:
        ValueError: If the string cannot be parsed as a date.

    Note:
        A string with no zone, or with a zone abbreviation the parser does
        not know, is treated as UTC.
    """
    from email.utils import parsedate_to_datetime  # stdlib RFC 2822 parser

    try:
        stamp = parsedate_to_datetime(t)
    except (TypeError, ValueError):
        # Older Python versions raise TypeError on bad input; normalise it
        raise ValueError("Unrecognized news timestamp: " + repr(t)) from None

    if stamp.tzinfo is None:
        stamp = stamp.replace(tzinfo=datetime.timezone.utc)

    local_zone = datetime.timezone(timedelta(hours=local_offset_hours))
    # Drop tzinfo so the result sorts together with the other naive times
    return stamp.astimezone(local_zone).replace(tzinfo=None)
def parse_reddit_time(t, local_offset_hours=-4):
    """Parse an ISO 8601 feed timestamp into a naive local datetime.

    Accepts strings such as ``"2019-07-12T20:50:44+00:00"``. The UTC offset
    in the string is honoured (either sign, or a trailing ``Z``) rather than
    being cut off, then the instant is shifted to a fixed local offset.

    Args:
        t: Timestamp string as found in the feed's 'updated' field.
        local_offset_hours: Offset of the display time zone from UTC, in
            hours. Defaults to -4, the offset this notebook has always used.

    Returns:
        A naive ``datetime.datetime`` expressed in the local offset.

    Raises:
        ValueError: If the string is not a valid ISO 8601 timestamp.
    """
    text = t.strip()
    # fromisoformat only accepts a literal "Z" suffix on newer Pythons
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"

    stamp = datetime.datetime.fromisoformat(text)
    if stamp.tzinfo is None:
        # No offset given: treat as UTC, matching the previous behaviour
        stamp = stamp.replace(tzinfo=datetime.timezone.utc)

    local_zone = datetime.timezone(timedelta(hours=local_offset_hours))
    # Drop tzinfo so the result sorts together with the other naive times
    return stamp.astimezone(local_zone).replace(tzinfo=None)

In [2]:
import feedparser
import pandas as pd

# Major Pipeline Functions

def acquire(url, rdata):
    """Parse the feed at ``url`` and append its entries to ``rdata``.

    Args:
        url: Feed location handed to ``feedparser.parse``.
        rdata: List collecting one entries-list per feed; appended to in place.

    Returns:
        The same ``rdata`` list that was passed in.
    """
    parsed_feed = feedparser.parse(url)
    feed_entries = parsed_feed.entries
    rdata.append(feed_entries)
    return rdata

# Convert list of RSS responses to a pandas DataFrame
def CreateTheTable(rdata):
    """Combine the per-feed entry lists into one DataFrame of posts.

    Each element of ``rdata`` becomes a DataFrame that receives a 'source'
    column (via ``add_source``) and a 'time' column holding the raw
    timestamp string: 'published' for HNEWS/MAC feeds, 'updated' otherwise.

    Feeds that contain no entries are skipped with a printed notice; without
    this they would crash ``add_source``, which needs an 'id' column.

    Args:
        rdata: List with one list of feed entries per source.

    Returns:
        A DataFrame with all posts concatenated (index reset, the per-feed
        row number kept in an 'index' column), or ``False`` if a feed lacks
        the 'updated' column it needs.

    Raises:
        ValueError: If no feed contributed any entries.
    """
    published_sources = ("HNEWS", "MAC")
    # One DataFrame per non-empty feed
    posts = []
    for position, entry in enumerate(rdata):
        if len(entry) == 0:
            print ("Skipping feed", position, "- no entries")
            continue
        df = pd.DataFrame(entry)
        # Add new 'source' column
        df = add_source(df)
        # Add new standardized 'time' column
        if df.at[0, 'source'] in published_sources:
            df['time'] = df['published']
        elif 'updated' in df.columns:
            df['time'] = df['updated']
        else:
            # Diagnostics for a feed without 'updated'; .get avoids a second
            # failure when 'published' is missing as well
            print (df.iloc[0]['id'])
            print (df.iloc[0].get('published'))
            return False
        posts.append(df)
    if not posts:
        raise ValueError("No feed returned any entries")
    # Merge list of DataFrames together with concat
    post_data = pd.concat(posts, axis=0, sort=False).reset_index()
    return post_data

def format_data(posts):
    """Parse the raw 'time' strings and return a newest-first summary table.

    Rows whose source contains "R/" are parsed with ``parse_reddit_time``;
    "HNEWS" and "MAC" rows use ``parse_news_time``. The 'time' column of
    ``posts`` is overwritten in place with the parsed values.

    Args:
        posts: DataFrame with 'source', 'title', 'link' and 'time' columns.

    Returns:
        A DataFrame with columns source/title/link/time sorted by time in
        descending order, or ``False`` when an unrecognised source is found.
    """
    parsed_times = []
    for row_number in range(len(posts)):
        record = posts.iloc[row_number]
        origin = record['source']
        if "R/" in origin:
            to_datetime = parse_reddit_time
        elif origin in ("HNEWS", "MAC"):
            to_datetime = parse_news_time
        else:  # this should never run
            print ("New Source:", origin)
            return False
        parsed_times.append(to_datetime(record['time']))

    posts['time'] = parsed_times
    ordered = posts.sort_values(by=["time"], ascending=False)
    ordered = ordered.reset_index()
    return ordered[['source', 'title', 'link','time']]

def visualize(df):
    """Placeholder visualization stage: hands ``df`` back untouched."""
    return df

def export(df):
    """Placeholder export stage: hands ``df`` back untouched."""
    return df

In [3]:
# Feed URLs for the pipeline (line continuations are implicit inside brackets)
urls = [
    'https://www.reddit.com/r/movies/new.rss',
    'https://www.reddit.com/r/tezos/new.rss',
    'https://www.reddit.com/r/gaming/new.rss',
    'https://www.reddit.com/r/todayilearned/new.rss',
    'https://www.reddit.com/r/wow/new.rss',
    'https://www.reddit.com/r/hearthstone/new.rss',
    'https://www.reddit.com/r/science/new.rss',
    'http://feeds.macrumors.com/MacRumors-All',
    'https://hnrss.org/newest',
]

In [4]:
# Main Pipeline Function

def reader(rss_urls):
    """Run the whole pipeline: fetch every feed, merge, then format.

    Args:
        rss_urls: Iterable of feed URLs to read.

    Returns:
        The formatted posts table produced by ``format_data``, or ``False``
        when ``CreateTheTable`` or ``format_data`` reports a failure.
    """
    rss_data = []  # one list of entries per feed URL
    for url in rss_urls:
        rss_data = acquire(url, rss_data)
    print ('Number of RSS_Data Sources:', len(rss_data))
    # Convert RSS entries to a single DataFrame containing all posts
    posts_table = CreateTheTable(rss_data)
    if posts_table is False:
        # CreateTheTable signals failure with False; passing that on to
        # format_data would crash on `.shape`, so propagate it instead
        return False
    # Format list of posts into a human-readable table
    return format_data(posts_table)

# Run the pipeline on a three-feed subset; the table is the cell's output
urls = [
    'https://www.reddit.com/r/movies/new.rss',
    'https://www.reddit.com/r/gaming/new.rss',
    'https://hnrss.org/newest',
]

reader(urls)

Number of RSS_Data Sources: 3


Unnamed: 0,source,title,link,time
0,R/GAMING,At least someone remembered!,https://www.reddit.com/r/gaming/comments/ccggp...,2019-07-12 16:50:44
1,R/MOVIES,Can you you guys tell the difference between f...,https://www.reddit.com/r/movies/comments/ccggf...,2019-07-12 16:50:05
2,R/GAMING,Bob Ross: Ultimate Devil Beater,https://www.reddit.com/r/gaming/comments/ccgge...,2019-07-12 16:50:01
3,R/MOVIES,Watched Zodiac for the first time and I need t...,https://www.reddit.com/r/movies/comments/ccgg3...,2019-07-12 16:49:19
4,R/GAMING,Please delete if not allowed. My gaming TV cra...,https://www.reddit.com/r/gaming/comments/ccgfe...,2019-07-12 16:47:48
5,R/MOVIES,Is there a name for the “character doesn’t bel...,https://www.reddit.com/r/movies/comments/ccges...,2019-07-12 16:46:26
6,R/GAMING,Pre Order Cancellation,https://www.reddit.com/r/gaming/comments/ccgef...,2019-07-12 16:45:33
7,HNEWS,Show HN: NoHQ – Learn how to build a remote team,https://nohq.co,2019-07-12 16:44:30
8,R/GAMING,"Hey gamers of Reddit, I'm looking for an old s...",https://www.reddit.com/r/gaming/comments/ccgdc...,2019-07-12 16:43:14
9,R/GAMING,Can't wait for the Nintendo Switch Lite!,https://www.reddit.com/r/gaming/comments/ccgd9...,2019-07-12 16:43:07


In [5]:
def export(df, path='./data/posts.csv'):
    """Write ``df`` to a CSV file without the index column.

    The target directory is created when it does not exist yet, so the
    export also works in a fresh checkout.

    Args:
        df: DataFrame to write.
        path: Destination file. Defaults to './data/posts.csv'.

    Returns:
        None.
    """
    import os  # local import: only this function needs it

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    df.to_csv(path, index=False)
# Run the pipeline on `urls` and hand the resulting table to export()
export(reader(urls))

Number of RSS_Data Sources: 3


In [16]:
#4chan api 
#https://github.com/4chan/4chan-API
import json
import requests

# Fetch the /g/ board catalog from the 4chan JSON API.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors before JSON decoding is attempted.
response = requests.get('https://a.4cdn.org/g/catalog.json', timeout=10)
response.raise_for_status()
results = response.json()
results

[{'page': 1,
  'threads': [{'no': 51971506,
    'sticky': 1,
    'closed': 1,
    'now': '12/20/15(Sun)20:03:52',
    'name': 'Anonymous',
    'com': 'The /g/ Wiki:<br><a href="http://wiki.installgentoo.com/">http://wiki.installgentoo.com/</a><br><br>\r\n\r\n/g/ is for the discussion of technology and related topics.<br>\r\n/g/ is <b><u>NOT</u></b> your personal tech support team or personal consumer review site.<br><br>\r\nFor tech support/issues with computers, use <a href="https://boards.4chan.org/wsr/">/wsr/ - Worksafe Requests</a> or one of the following:<br>\r\n<a href="https://startpage.com/">https://startpage.com/</a> or <a href="https://duckduckgo.com">https://duckduckgo.com</a> (i.e., fucking google it)<br>\r\n<a href="https://stackexchange.com/">https://stackexchange.com/</a><br>\r\n<a href="http://www.logicalincrements.com/">http://www.logicalincrements.com/</a><br><br>\r\n\r\nYou can also search the catalog for a specific term by using:<br>\r\n<a href="https://boards.4chan