### Top 100 keywords of Hacker News posts in 2014

In [13]:
import csv
from datetime import datetime
import io
import json
from stop_words import stop_words
import string

from pipeline import Pipeline
from pipeline import build_csv

In [14]:
# Single Pipeline instance shared by the notebook: the functions below are
# decorated with @pipeline.task(...) and main() executes it via pipeline.run().
pipeline = Pipeline()

In [15]:
@pipeline.task()
def file_to_json():
    """
    Load the stories stored in 'hn_stories_2014.json'.

    Returns the value found under the top-level 'stories' key of the
    parsed JSON document, or an empty list when that key is absent.
    """
    # The context manager guarantees the file handle is closed, even if
    # parsing raises; a bare open(...).read() leaves it to the garbage
    # collector.
    with open('hn_stories_2014.json') as json_file:
        data = json.load(json_file)
    return data.get('stories', [])

In [16]:
@pipeline.task(depends_on=file_to_json)
def filter_stories(data):
    """
    Lazily yield the stories worth analysing.

    A story is kept only when its title does not start with 'Ask HN',
    it has more than 50 points and it has more than 1 comment.
    """
    for story in data:
        # Guard clauses, checked in the same order as the combined condition.
        if story['title'].startswith('Ask HN'):
            continue
        if not (story['points'] > 50):
            continue
        if not (story['num_comments'] > 1):
            continue
        yield story

In [17]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(data):
    """
    Turn story dicts into CSV rows and hand them to build_csv.

    Each row holds, in order: objectID, created_at (parsed into a
    datetime from the first 10 characters, i.e. 'YYYY-MM-DD'), url,
    points and title. The rows are written to a fresh in-memory
    io.StringIO buffer; the result of build_csv is returned as-is.
    """
    columns = ['objectID', 'created_at', 'url', 'points', 'title']
    rows = [
        (
            story['objectID'],
            # Only the date part of the timestamp is kept.
            datetime.strptime(story['created_at'][:10], '%Y-%m-%d'),
            story['url'],
            story['points'],
            story['title'],
        )
        for story in data
    ]
    buffer = io.StringIO()
    return build_csv(rows, header=columns, file=buffer)

In [18]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(file_obj):
    """
    Yield the title column of every data row in the CSV file object.

    json_to_csv passes a header to build_csv, so the first CSV row is
    expected to be the column names (NOTE(review): build_csv is defined
    elsewhere - confirm it writes the header as the first row). When the
    first row contains 'title' it is treated as the header: it is skipped
    and used to locate the title column by name. Otherwise the first row
    is treated as data and the fifth column (index 4) is used.
    """
    reader = csv.reader(file_obj)
    first_row = next(reader, None)
    if first_row is None:
        # Empty input: nothing to yield.
        return
    if 'title' in first_row:
        # Header row: locate the column by name and do not emit the
        # literal word 'title' as if it were a story title.
        title_idx = first_row.index('title')
    else:
        # No header present: fall back to the fixed column position.
        title_idx = 4
        yield first_row[title_idx]
    for row in reader:
        yield row[title_idx]

In [19]:
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """
    Generator of normalised titles.

    Every character listed in string.punctuation is removed from each
    title, and the remaining text is lowercased.
    """
    # Translation table that maps each punctuation character to "deleted".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for raw_title in titles:
        yield raw_title.translate(strip_punctuation).lower()

In [20]:
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    """
    Count word occurrences across all titles.

    Titles are split on single spaces; empty tokens and tokens found in
    stop_words are ignored. Returns a dict mapping word -> count.
    """
    counts = {}
    for title in titles:
        keywords = (
            token
            for token in title.split(' ')
            # Consecutive spaces produce empty tokens; drop them with the stop words.
            if token and token not in stop_words
        )
        for keyword in keywords:
            counts[keyword] = counts.get(keyword, 0) + 1
    return counts

In [21]:
@pipeline.task(depends_on=build_keyword_dictionary)
def sort_keywords(keyword_dict):
    """
    Returns the 100 most frequent keywords as (word, count) tuples,
    ordered from most to least frequent.
    """
    ranked = list(keyword_dict.items())
    # Stable sort: words with equal counts keep their dictionary order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:100]

In [35]:
def main():
    """
    Run the pipeline and print its final result.

    The value stored under the last key of the mapping returned by
    pipeline.run() is iterated as (word, count) pairs, one per printed
    line (presumably the output of sort_keywords - confirm against the
    Pipeline implementation).
    """
    outputs = pipeline.run()
    last_key = list(outputs.keys())[-1]
    ranked_keywords = outputs[last_key]
    print('Top 100 keywords of Hacker News posts in 2014:\n')
    for keyword, count in ranked_keywords:
        line = '{}: {}'.format(keyword, count)
        print(line)

In [36]:
# Execute the pipeline and print the keyword frequencies.
main()

Top 100 keywords of Hacker News posts in 2014:

new: 185
google: 167
bitcoin: 101
open: 92
programming: 90
web: 88
data: 85
video: 79
python: 75
code: 72
facebook: 71
released: 71
using: 70
2013: 65
javascript: 65
free: 64
source: 64
game: 63
internet: 62
microsoft: 59
c: 59
linux: 58
app: 57
pdf: 55
work: 54
language: 54
software: 52
2014: 52
startup: 51
apple: 50
use: 50
make: 50
time: 48
yc: 48
security: 48
nsa: 45
github: 45
windows: 44
world: 41
way: 41
like: 41
1: 40
project: 40
computer: 40
heartbleed: 40
git: 37
users: 37
dont: 37
design: 37
ios: 37
developer: 36
os: 36
twitter: 36
ceo: 36
vs: 36
life: 36
big: 35
day: 35
android: 34
online: 34
years: 33
simple: 33
court: 33
guide: 32
learning: 32
mt: 32
api: 32
says: 32
apps: 32
browser: 32
server: 31
firefox: 31
fast: 31
gox: 31
problem: 31
mozilla: 31
engine: 31
site: 31
introducing: 30
amazon: 30
year: 30
support: 29
stop: 29
built: 29
better: 29
million: 29
people: 29
text: 29
3: 28
does: 28
tech: 28
development: 28
billion