### Top 10 keywords of latest Hacker News posts

    1. Gets data from the latest 1000 posts on Hacker News with at least 51 points and 2 comments.
    2. Removes any posts whose title starts with 'Ask HN'.
    3. Creates a case-insensitive keyword dictionary, excluding stop words and punctuation.
    

In [499]:
import csv
from datetime import datetime
import io
import json
from stop_words import stop_words
import string

from pipeline import Pipeline
from pipeline import build_csv

In [500]:
# Pipeline instance shared by the cells below: each step registers itself on it
# through the @pipeline.task decorator, and main() executes it via pipeline.run().
pipeline = Pipeline()

In [501]:
@pipeline.task()
def get_raw_data():
    """
    Returns list of dicts where each dict contains story data from hackernews.

    Requests result pages from the Algolia HN search API one at a time, for at
    most 50 pages, and stops as soon as the requested page index reaches the
    ``nbPages`` count reported in the response (this also covers the
    ``nbPages == 0`` case).

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    # Imported locally because the notebook's import cell does not import
    # `requests`; without this the call below raises NameError.
    import requests

    result = []
    url = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page={}'
    for page in range(50):
        # A timeout keeps one stalled connection from hanging the whole pipeline.
        response = requests.get(url.format(page), timeout=30)
        # Fail loudly on an HTTP error instead of tripping over the payload below.
        response.raise_for_status()
        data = response.json()
        # Page indexes start at 0, so an index >= nbPages is past the last page
        # and further requests would only be wasted round trips.
        if page >= data['nbPages']:
            break
        result.extend(data.get('hits', []))
    return result

In [502]:
@pipeline.task(depends_on=get_raw_data)
def filter_stories(data):
    """
    Generator of the stories in `data` whose 'title' does not begin with
    'Ask HN'. The prefix comparison is case-sensitive.
    """
    yield from (
        story for story in data
        if not story['title'].startswith('Ask HN')
    )

In [503]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(data):
    """
    Turns story dicts into row tuples and hands them to build_csv.

    Each row holds objectID, created_at, url, points and title, in that order.
    Only the first 10 characters of 'created_at' (parsed as YYYY-MM-DD) are
    kept, so the time of day is dropped. Returns whatever build_csv returns
    when given the rows, the column header and a fresh io.StringIO.
    """
    columns = ['objectID', 'created_at', 'url', 'points', 'title']
    records = [
        (
            story['objectID'],
            datetime.strptime(story['created_at'][:10], '%Y-%m-%d'),
            story['url'],
            story['points'],
            story['title'],
        )
        for story in data
    ]
    return build_csv(records, header=columns, file=io.StringIO())

In [504]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(file_obj):
    """
    Generator of story titles (fifth CSV column) read from `file_obj`.

    json_to_csv passes a column header to build_csv; assuming build_csv writes
    it as the first CSV row (confirm against build_csv), that row must not be
    reported as a story title. A first row whose title column is literally
    'title' is therefore treated as the header and skipped. Input without a
    header row is handled exactly as before.
    """
    title_column = 4
    reader = csv.reader(file_obj)
    first_row = next(reader, None)
    if first_row is None:
        # Empty input: nothing to yield.
        return
    if first_row[title_column] != 'title':
        # No header present, so the first row is real data.
        yield first_row[title_column]
    for row in reader:
        yield row[title_column]

In [505]:
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """
    Generator of cleaned titles: every character listed in string.punctuation
    is deleted, then the title is lowercased.
    """
    # One translation table deletes all of string.punctuation in a single pass.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    for raw_title in titles:
        yield raw_title.translate(drop_punctuation).lower()

In [506]:
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    """
    Returns a dict mapping each word to the number of times it occurs across
    `titles`. Titles are split on single spaces; empty tokens and words found
    in stop_words are not counted.
    """
    counts = {}
    for title in titles:
        for token in title.split(' '):
            # Consecutive spaces yield empty tokens; skip those and stop words.
            if token and token not in stop_words:
                counts[token] = counts.get(token, 0) + 1
    return counts

In [507]:
@pipeline.task(depends_on=build_keyword_dictionary)
def sort_keywords(keyword_dict):
    """
    Returns at most 10 (word, frequency) tuples, most frequent first.
    Words with equal frequency keep the order they have in `keyword_dict`.
    """
    ranked = sorted(keyword_dict.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]

In [510]:
def main():
    """
    Runs the pipeline and prints the (word, frequency) pairs stored under the
    last key of the result, one pair per line.
    """
    outputs = pipeline.run()
    # The value under the last key is taken as the pipeline's final output
    # (presumably that of sort_keywords, the last step in the chain).
    final_key = list(outputs.keys())[-1]
    top_keywords = outputs[final_key]
    print('Top keywords of latest Hacker News posts:\n')
    for keyword, count in top_keywords:
        print('{}: {}'.format(keyword, count))

In [511]:
# Execute the pipeline and print the keyword report shown below.
main()

Top keywords of latest Hacker News posts:

new: 27
google: 27
pdf: 25
data: 23
programming: 21
2018: 19
use: 16
facebook: 15
amazon: 15
2017: 15
