In [1]:
from pipeline import Pipeline, build_csv
from stop_words import stop_words
from pprint import pprint
from datetime import datetime
from collections import Counter
import string
import json
import csv
import io

In [2]:
# Single Pipeline instance for this notebook; every function defined below is
# decorated with @pipeline.task(...), chaining to its predecessor via depends_on.
pipeline = Pipeline()

@pipeline.task()
def file_to_json(path='hn_stories_2014.json'):
    """Load the list of stories from a Hacker News JSON dump.

    Args:
        path: Location of the JSON file. Defaults to the 2014 dump this
            notebook was written against, so calling with no arguments
            behaves as before.

    Returns:
        The value stored under the document's top-level ``'stories'`` key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the document has no ``'stories'`` key.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII titles load the same way on every machine.
    # NOTE(review): assumes the dump is UTF-8 encoded -- confirm.
    with open(path, encoding='utf-8') as file:
        return json.load(file)['stories']


@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    """Lazily yield the stories worth analysing.

    A story is kept when it has more than 50 points, more than one
    comment, and a title that does not start with ``'Ask HN'``.

    Args:
        stories: Iterable of story dicts with ``'points'``,
            ``'num_comments'`` and ``'title'`` keys.

    Yields:
        Each story dict that passes all three checks, in input order.
    """
    for item in stories:
        # Checks run in this order and stop at the first failure.
        if not item['points'] > 50:
            continue
        if not item['num_comments'] > 1:
            continue
        if item['title'].startswith('Ask HN'):
            continue
        yield item

@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    """Project each story onto a fixed set of columns and hand them to build_csv.

    Args:
        stories: Iterable of story dicts containing the keys
            ``'objectID'``, ``'created_at_i'``, ``'url'``, ``'points'``
            and ``'title'``.

    Returns:
        Whatever ``build_csv`` returns when given the rows, the column
        names as header, and a fresh ``io.StringIO`` as target.

    Raises:
        KeyError: If a story lacks one of the expected keys.
    """
    # Defined before the loop so the header still exists when `stories`
    # yields nothing (previously that case raised UnboundLocalError).
    cols = ['objectID', 'created_at_i', 'url', 'points', 'title']
    lines = []
    for story in stories:
        line = [story[col] for col in cols]
        # The second column is replaced by a naive local-time datetime while
        # the header keeps the name 'created_at_i'.
        # NOTE(review): dividing by 1e3 treats created_at_i as milliseconds;
        # if the field is Unix seconds the dates will land in 1970 -- confirm.
        line[1] = datetime.fromtimestamp(line[1] / 1e3)
        lines.append(line)
    return build_csv(lines, header=cols, file=io.StringIO())

@pipeline.task(depends_on=json_to_csv)
def extract_titles(file):
    """Yield the ``title`` column of every data row in a CSV stream.

    Args:
        file: Iterable of CSV text lines (e.g. an open text file or
            ``io.StringIO``) whose first row is a header containing
            ``'title'``.

    Yields:
        The title field of each row after the header.

    Raises:
        ValueError: If the header row has no ``'title'`` column.
    """
    reader = csv.reader(file)
    # A bare next() on an empty stream would raise StopIteration inside this
    # generator, which Python converts to RuntimeError; treat empty input as
    # "no titles" instead.
    header = next(reader, None)
    if header is None:
        return
    title_idx = header.index('title')
    for row in reader:
        yield row[title_idx]

@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Yield each title lower-cased with ASCII punctuation removed.

    Args:
        titles: Iterable of title strings.

    Yields:
        The normalised title; characters outside ``string.punctuation``
        (including whitespace) are left as they are.
    """
    # The deletion table does not depend on the title, so build it once
    # instead of once per title.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for title in titles:
        yield title.lower().translate(strip_punctuation)


@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    """Yield a per-title word count that ignores stop words.

    Args:
        titles: Iterable of title strings (already cleaned by the caller).

    Yields:
        A ``Counter`` mapping each whitespace-separated word of the title
        that is not in ``stop_words`` to its number of occurrences.
    """
    for title in titles:
        # split() with no separator never produces empty strings, so no
        # extra empty-word filter is needed.
        yield Counter(
            word for word in title.split() if word not in stop_words
        )


@pipeline.task(depends_on=build_keyword_dictionary)
def top_words(counters):
    """Merge per-title word counts and return the 100 most frequent words.

    Args:
        counters: Iterable of ``Counter`` objects (or other mappings of
            word -> count).

    Returns:
        A list of up to 100 ``(word, count)`` tuples, most frequent first.
    """
    totals = Counter()
    for per_title in counters:
        # update() adds counts to the running totals rather than replacing them.
        totals.update(per_title)
    return totals.most_common(100)

In [3]:
# Execute the pipeline. The result is indexed by task function in a later
# cell (presumably a mapping of task -> output; confirm against Pipeline.run).
count = pipeline.run()

In [5]:
# Look up the entry stored under the top_words task so the notebook displays it.
count[top_words]

[('new', 185),
 ('google', 167),
 ('bitcoin', 101),
 ('open', 92),
 ('programming', 90),
 ('web', 88),
 ('data', 85),
 ('video', 79),
 ('python', 76),
 ('code', 72),
 ('released', 71),
 ('facebook', 71),
 ('using', 70),
 ('javascript', 65),
 ('2013', 65),
 ('free', 64),
 ('source', 64),
 ('game', 63),
 ('internet', 62),
 ('c', 59),
 ('microsoft', 59),
 ('linux', 58),
 ('app', 57),
 ('pdf', 55),
 ('language', 54),
 ('work', 54),
 ('2014', 52),
 ('software', 52),
 ('startup', 51),
 ('apple', 50),
 ('use', 50),
 ('make', 50),
 ('security', 48),
 ('time', 48),
 ('yc', 48),
 ('github', 45),
 ('nsa', 45),
 ('windows', 44),
 ('way', 41),
 ('world', 41),
 ('like', 41),
 ('1', 41),
 ('project', 40),
 ('computer', 40),
 ('heartbleed', 40),
 ('git', 37),
 ('ios', 37),
 ('design', 37),
 ('dont', 37),
 ('users', 37),
 ('ceo', 36),
 ('os', 36),
 ('twitter', 36),
 ('life', 36),
 ('vs', 36),
 ('developer', 36),
 ('day', 35),
 ('big', 35),
 ('android', 34),
 ('online', 34),
 ('court', 33),
 ('simple', 

### TO DO
_____
* Rewrite the Pipeline class's output to save each task's output to a file. This will allow tasks to act as "checkpoints", so completed tasks don't have to be run twice.
* Use the nltk package for more advanced natural language processing tasks.
* Convert to a CSV before filtering, so we can keep all the stories from 2014 in a raw file.
* Fetch the data directly from Hacker News through a JSON API instead of reading from the file provided.
* Perform additional data processing using newer data.