### Top 100 keywords of latest Hacker News posts

To do:
    Use Hacker New API to get the data instead of downloaded .json file

In [381]:
import csv
from datetime import datetime
import io
import json
from stop_words import stop_words
import string

from pipeline import Pipeline
from pipeline import build_csv

In [382]:
pipeline = Pipeline()

In [383]:
# @pipeline.task()
# def file_to_json():
#     result = []
#     data = open('hn_stories_2014.json').read()
#     data = json.loads(data)
#     if 'stories' in data.keys():
#         result = data['stories']
#     return result

In [449]:
start = datetime.now()

@pipeline.task()
def get_data():
    """
    Returns list of dicts where each dict contains story data from hackernews.
    Loops through url for maximum 50 times or until nbPages from request result is 0.
    """
    result = []
    url = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page={}'
    for page in range(50):
        page_url = url.format(page)
        response = requests.get(page_url)
        data = response.json()
        if data['nbPages'] == 0:
            break
        result += data.get('hits', [])
    return result

print(len(get_data()))
end = datetime.now()
(end - start).total_seconds()

http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=0
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=1
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=2
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=3
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=4
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=5
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=6
http://hn.algolia.com/api/v1/search_by_date?tags=story&

4.37835

In [438]:
result = get_data()

http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=0
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100&page=1


In [439]:
result

[{'created_at': '2019-07-28T02:40:01.000Z',
  'title': 'How I use the good parts of AWS',
  'url': 'https://twitter.com/dvassallo/status/1154516910265884672',
  'author': 'DVassallo',
  'points': 177,
  'story_text': None,
  'comment_text': None,
  'num_comments': 90,
  'story_id': None,
  'story_title': None,
  'story_url': None,
  'parent_id': None,
  'created_at_i': 1564281601,
  'relevancy_score': 8965,
  '_tags': ['story', 'author_DVassallo', 'story_20545561', 'front_page'],
  'objectID': '20545561',
  '_highlightResult': {'title': {'value': 'How I use the good parts of AWS',
    'matchLevel': 'none',
    'matchedWords': []},
   'url': {'value': 'https://twitter.com/dvassallo/status/1154516910265884672',
    'matchLevel': 'none',
    'matchedWords': []},
   'author': {'value': 'DVassallo',
    'matchLevel': 'none',
    'matchedWords': []}}},
 {'created_at': '2019-07-28T02:00:14.000Z',
  'title': 'Apple: No Macintosh Forks. But the iPad...',
  'url': 'https://mondaynote.com/apple-n

In [16]:
@pipeline.task(depends_on=file_to_json)
def filter_stories(data):
    for row in data:
        if (
            (not row['title'].startswith('Ask HN'))
            and 
            (row['points'] > 50)
            and
            (row['num_comments'] > 1)
           ):
            yield row

In [17]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(data):
    header = ['objectID', 'created_at', 'url', 'points', 'title']
    lines = []
    for row in data:
        lines.append(
            (
            row['objectID'],
            datetime.strptime(row['created_at'][:10], '%Y-%m-%d'),
            row['url'],
            row['points'],
            row['title'],
            )
        )
    return build_csv(
        lines,
        header=header,
        file=io.StringIO()
    )

In [18]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(file_obj):
    for row in csv.reader(file_obj):
        yield row[4]

In [19]:
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """
    Generator of titles (lowercased and punctuations removed)
    """
    for title in titles:
        for char in string.punctuation:
            title = title.replace(char, '')
        yield title.lower()

In [20]:
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    """
    Returns keyword dictionary excluding stop words
    """
    result = {}
    for title in titles:
        words = title.split(' ')
        for word in words:
            if not word or word in stop_words:
                continue
            if word not in result:
                result[word] = 0
            result[word] += 1
    return result

In [21]:
@pipeline.task(depends_on=build_keyword_dictionary)
def sort_keywords(keyword_dict):
    """
    Returns list of tuples sorted by frequency
    """
    popular_words = sorted(keyword_dict, key=lambda x: keyword_dict[x], reverse=True)[:100]
    return [(x, keyword_dict[x]) for x in popular_words]

In [35]:
def main():
    result = pipeline.run()
    keywords_dict = result[list(result.keys())[-1]]
    print('Top 100 keywords of Hacker News posts in 2014:\n')
    for word, freq in keywords_dict:
        print('{}: {}'.format(word, freq))

In [36]:
main()

Top 100 keywords of Hacker News posts in 2014:

new: 185
google: 167
bitcoin: 101
open: 92
programming: 90
web: 88
data: 85
video: 79
python: 75
code: 72
facebook: 71
released: 71
using: 70
2013: 65
javascript: 65
free: 64
source: 64
game: 63
internet: 62
microsoft: 59
c: 59
linux: 58
app: 57
pdf: 55
work: 54
language: 54
software: 52
2014: 52
startup: 51
apple: 50
use: 50
make: 50
time: 48
yc: 48
security: 48
nsa: 45
github: 45
windows: 44
world: 41
way: 41
like: 41
1: 40
project: 40
computer: 40
heartbleed: 40
git: 37
users: 37
dont: 37
design: 37
ios: 37
developer: 36
os: 36
twitter: 36
ceo: 36
vs: 36
life: 36
big: 35
day: 35
android: 34
online: 34
years: 33
simple: 33
court: 33
guide: 32
learning: 32
mt: 32
api: 32
says: 32
apps: 32
browser: 32
server: 31
firefox: 31
fast: 31
gox: 31
problem: 31
mozilla: 31
engine: 31
site: 31
introducing: 30
amazon: 30
year: 30
support: 29
stop: 29
built: 29
better: 29
million: 29
people: 29
text: 29
3: 28
does: 28
tech: 28
development: 28
billion

In [1]:
import requests

In [387]:
def get_data():
    result = []
    url = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&page={}'
    for page in range(50):
        page_url = url.format(page)
        print(page_url)
        response = requests.get(page_url)
        data = response.json().get('hits', [])
        result += data
    return result


In [388]:
result = get_data()

http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&page=0
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&page=1
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&page=2
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&page=3
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&page=4
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&page=5
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&page=6
http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1

In [416]:
url = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&numericFilters=created_at_i>2011-01-01,points>50,num_comments>1&maxFacetHits=100&hitsPerPage=100'

In [417]:
a = requests.get(url)

In [418]:
len(a.json()['hits'])

100

In [419]:
a.json().keys()

dict_keys(['hits', 'nbHits', 'page', 'nbPages', 'hitsPerPage', 'processingTimeMS', 'exhaustiveNbHits', 'query', 'params'])

In [364]:
a.json()['hitsPerPage']

20

In [365]:
a.json()['nbPages']

50

In [420]:
a.json()['hits'][0]

{'created_at': '2019-07-28T02:40:01.000Z',
 'title': 'How I use the good parts of AWS',
 'url': 'https://twitter.com/dvassallo/status/1154516910265884672',
 'author': 'DVassallo',
 'points': 174,
 'story_text': None,
 'comment_text': None,
 'num_comments': 89,
 'story_id': None,
 'story_title': None,
 'story_url': None,
 'parent_id': None,
 'created_at_i': 1564281601,
 'relevancy_score': 8965,
 '_tags': ['story', 'author_DVassallo', 'story_20545561', 'front_page'],
 'objectID': '20545561',
 '_highlightResult': {'title': {'value': 'How I use the good parts of AWS',
   'matchLevel': 'none',
   'matchedWords': []},
  'url': {'value': 'https://twitter.com/dvassallo/status/1154516910265884672',
   'matchLevel': 'none',
   'matchedWords': []},
  'author': {'value': 'DVassallo', 'matchLevel': 'none', 'matchedWords': []}}}

In [356]:
a.json()['hits'][0].keys()

dict_keys(['created_at', 'title', 'url', 'author', 'points', 'story_text', 'comment_text', 'num_comments', 'story_id', 'story_title', 'story_url', 'parent_id', 'created_at_i', 'relevancy_score', '_tags', 'objectID', '_highlightResult'])

In [421]:
for x in a.json()['hits']:
    print(x['objectID'], x['created_at'], sep='\t>>>\t')

20545561	>>>	2019-07-28T02:40:01.000Z
20545438	>>>	2019-07-28T02:00:14.000Z
20545276	>>>	2019-07-28T01:05:17.000Z
20545257	>>>	2019-07-28T00:59:41.000Z
20545067	>>>	2019-07-28T00:10:46.000Z
20544564	>>>	2019-07-27T22:12:26.000Z
20544395	>>>	2019-07-27T21:45:43.000Z
20544222	>>>	2019-07-27T21:12:18.000Z
20544076	>>>	2019-07-27T20:43:58.000Z
20544058	>>>	2019-07-27T20:39:53.000Z
20543646	>>>	2019-07-27T19:24:11.000Z
20543605	>>>	2019-07-27T19:16:51.000Z
20543495	>>>	2019-07-27T18:52:51.000Z
20543223	>>>	2019-07-27T18:02:03.000Z
20543077	>>>	2019-07-27T17:34:25.000Z
20542961	>>>	2019-07-27T17:12:49.000Z
20542915	>>>	2019-07-27T17:04:15.000Z
20542862	>>>	2019-07-27T16:53:59.000Z
20542738	>>>	2019-07-27T16:29:30.000Z
20542731	>>>	2019-07-27T16:28:07.000Z
20542606	>>>	2019-07-27T16:00:49.000Z
20542470	>>>	2019-07-27T15:34:14.000Z
20542337	>>>	2019-07-27T15:06:58.000Z
20542333	>>>	2019-07-27T15:05:39.000Z
20542269	>>>	2019-07-27T14:52:27.000Z
20542258	>>>	2019-07-27T14:47:49.000Z
20542225	>>>