In [1]:
from pipeline import Pipeline,build_csv
import json
import io
import datetime
import csv
import string
from stop_words import stop_words

In [2]:
pipeline=Pipeline()

In [3]:
@pipeline.task()
def open_json():
    with open('hn_stories_2014.json') as f:
        file=json.load(f)
        return (story for story in file['stories'])
@pipeline.task(depends_on=open_json)
def filter_stories(stories):
    return (x for x in list(filter(lambda x: x['points']>50 and x['num_comments']>1 and x['title'][:6]!='Ask HN',stories)))
@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    rows=[]
    for row in stories:
        rows.append([row['objectID'],
             datetime.datetime.strptime(row['created_at'], '%Y-%m-%dT%H:%M:%SZ'),
             row['url'],row['points'],row['title']])
    return build_csv(rows,header=['objectID','created_at','url','points','title'],file=io.StringIO())

@pipeline.task(depends_on=json_to_csv)
def extract_titles(file):
    reader=csv.reader(file)
    header = next(reader)
    idx = header.index('title')
    for row in reader:
        yield row[idx]
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    for title in titles:
        yield title.lower().replace(string.punctuation,'')
        
@pipeline.task(depends_on=clean_titles)
def build_keyword_dic(titles):
    dic={}
    for title in titles:
        words=title.split()
        for word in words:
            if word not in dic:
                dic[word]=0
            dic[word]+=1
    for word in stop_words:
        if word in dic:
            dic.pop(word)
    return dic
@pipeline.task(depends_on=build_keyword_dic)
def find_top_100(dic):
    return sorted(dic.items(),key=lambda item: item[1],reverse=True)[:100]

In [4]:
pipeline.run()

{<function __main__.open_json()>: <generator object open_json.<locals>.<genexpr> at 0x7f823a72dcf0>,
 <function __main__.filter_stories(stories)>: <generator object filter_stories.<locals>.<genexpr> at 0x7f82701cf0b0>,
 <function __main__.json_to_csv(stories)>: <_io.StringIO at 0x7f82702d8820>,
 <function __main__.extract_titles(file)>: <generator object extract_titles at 0x7f82701cf9e0>,
 <function __main__.clean_titles(titles)>: <generator object clean_titles at 0x7f82701cfcf0>,
 <function __main__.build_keyword_dic(titles)>: {'true': 5,
  'goodbye:': 1,
  '‘using': 1,
  'truecrypt': 4,
  'secure’': 1,
  'hire:': 1,
  'dedicated': 6,
  'young': 9,
  'man': 21,
  'syndrome': 2,
  'absolute': 1,
  'zero': 5,
  'joshua': 1,
  'norton,': 1,
  'emperor': 1,
  'united': 6,
  'states': 5,
  'soylent': 4,
  'revolution': 4,
  'pleasurable': 1,
  'git': 33,
  '2.0': 10,
  'getting': 14,
  'work': 43,
  'diversity': 2,
  'google': 151,
  'inferring': 1,
  'status': 9,
  'competence': 1,
  'sig

In [5]:
with open('hn_stories_2014.json') as f:
    file=json.load(f)
stories=file['stories']

In [6]:
stories[1]['created_at']

'2014-05-29T08:23:46Z'

In [7]:
datetime.datetime.strptime(stories[1]['created_at'], '%Y-%m-%dT%H:%M:%SZ')

datetime.datetime(2014, 5, 29, 8, 23, 46)

In [8]:
string='hi whats up'
string.split()

['hi', 'whats', 'up']

In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'