# Guided Project: Hacker News Pipeline

In [88]:
from pipeline import Pipeline, build_csv
import json
import io
from datetime import datetime
import csv
import string
from stop_words import stop_words

In [89]:
# Create the Pipeline object that the task functions below register with
# through the @pipeline.task(...) decorator.
pipeline = Pipeline()

In [90]:
@pipeline.task()
def file_to_json():
    """Load the Hacker News dump and return its list of stories.

    Reads ``hn_stories_2014.json`` from the current working directory and
    returns the value stored under the document's top-level ``'stories'`` key.

    Raises:
        FileNotFoundError: If the dump is not in the working directory.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the document has no ``'stories'`` key.
    """
    # JSON text is UTF-8; decode explicitly so the result does not depend on
    # the platform's default encoding.
    with open('hn_stories_2014.json', encoding='utf-8') as f:
        # json.load parses straight from the handle, so the handle name is
        # never rebound to the file's contents.
        stories_dict = json.load(f)
    return stories_dict['stories']

In [91]:
@pipeline.task(depends_on=file_to_json)
def filter_stories(stories_list):
    """Yield the popular stories that are not "Ask HN" posts.

    A story is kept when it has more than 50 points, more than 1 comment,
    and a title that does not start with ``'Ask HN'``.

    Args:
        stories_list: Iterable of story dicts with ``'points'``,
            ``'num_comments'`` and ``'title'`` keys.

    Yields:
        Each story dict that passes the filter, in input order.
    """
    for story in stories_list:
        if story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN'):
            yield story
    # Nothing is yielded after the loop: an extra trailing yield would repeat
    # the last story regardless of the filter and fail on empty input.



In [92]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(filtered_stories):
    """Turn story dicts into row tuples and pass them to ``build_csv``.

    Each row holds the story's objectID, its ``created_at`` value parsed into
    a ``datetime``, url, points and title, in that order.

    Args:
        filtered_stories: Iterable of story dicts.

    Returns:
        Whatever ``build_csv`` returns for these rows, the column header and
        a fresh ``io.StringIO`` target.
    """
    columns = ['objectID', 'created_at', 'url', 'points', 'title']
    rows = [
        (
            story['objectID'],
            # Timestamps look like 2014-01-31T12:34:56Z.
            datetime.strptime(story["created_at"], "%Y-%m-%dT%H:%M:%SZ"),
            story['url'],
            story['points'],
            story['title'],
        )
        for story in filtered_stories
    ]
    return build_csv(rows, header=columns, file=io.StringIO())
    

In [93]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    """Yield the title of every data row in a CSV file that has a header row.

    The first row is treated as the header and is never yielded. The title
    column is located by the header name ``'title'``; if the header has no
    such column, the last column is used.

    Args:
        csv_file: Iterable of CSV lines (for example an open text file or an
            ``io.StringIO``) positioned at the header row.

    Yields:
        The title field of each non-blank data row, in file order.
    """
    reader = csv.reader(csv_file)
    header = next(reader, None)
    if header is None:
        # Completely empty input: nothing to yield.
        return
    title_index = header.index('title') if 'title' in header else -1
    for row in reader:
        # csv.reader returns [] for blank lines; indexing them would fail.
        if row:
            yield row[title_index]
    

In [94]:
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    for title in titles:
        cleaned_title = ''.join(l for l in title if l not in string.punctuation)
        cleaned_title = cleaned_title.lower()
        yield cleaned_title

In [95]:
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(cleaned_titles):
    """Count how often each non-stop word occurs across the titles.

    Args:
        cleaned_titles: Iterable of title strings; each is split on
            whitespace into words.

    Returns:
        A dict mapping each word that is not in ``stop_words`` to the number
        of times it appears.
    """
    counts = {}
    for headline in cleaned_titles:
        for token in headline.split():
            if not token or token in stop_words:
                continue
            counts[token] = counts.get(token, 0) + 1
    return counts

In [96]:
@pipeline.task(depends_on=build_keyword_dictionary)
def top_words(freq_dict):
    """Return the 100 most frequent words as (word, count) pairs.

    Args:
        freq_dict: Mapping of word to occurrence count.

    Returns:
        A list of at most 100 ``(word, count)`` tuples, highest count first.
        Words with equal counts keep the mapping's iteration order.
    """
    def by_count(pair):
        return pair[1]

    ranked = list(freq_dict.items())
    ranked.sort(key=by_count, reverse=True)
    return ranked[:100]

In [97]:
# Manual chaining of the same steps (file_to_json takes no argument):
# top_words(build_keyword_dictionary(clean_titles(extract_titles(json_to_csv(filter_stories(file_to_json()))))))

In [98]:
# Run the pipeline; the returned object is indexed by task function to look
# up that task's output.
run_tasks = pipeline.run()

In [99]:
# Show the output recorded for the final task, top_words: (word, count) pairs
# ordered by descending count.
print(run_tasks[top_words])

[('new', 185),
 ('google', 167),
 ('bitcoin', 101),
 ('open', 92),
 ('programming', 90),
 ('web', 88),
 ('data', 85),
 ('video', 79),
 ('python', 76),
 ('code', 72),
 ('facebook', 71),
 ('released', 71),
 ('using', 70),
 ('2013', 65),
 ('javascript', 65),
 ('free', 64),
 ('source', 64),
 ('game', 63),
 ('internet', 62),
 ('microsoft', 59),
 ('c', 59),
 ('linux', 58),
 ('app', 57),
 ('pdf', 55),
 ('work', 54),
 ('language', 54),
 ('software', 52),
 ('2014', 52),
 ('startup', 51),
 ('apple', 50),
 ('use', 50),
 ('make', 50),
 ('time', 48),
 ('yc', 48),
 ('security', 48),
 ('nsa', 45),
 ('github', 45),
 ('windows', 44),
 ('1', 41),
 ('world', 41),
 ('way', 41),
 ('like', 41),
 ('project', 40),
 ('computer', 40),
 ('heartbleed', 40),
 ('git', 37),
 ('users', 37),
 ('dont', 37),
 ('design', 37),
 ('ios', 37),
 ('developer', 36),
 ('os', 36),
 ('twitter', 36),
 ('ceo', 36),
 ('vs', 36),
 ('life', 36),
 ('big', 35),
 ('day', 35),
 ('android', 34),
 ('online', 34),
 ('years', 33),
 ('simple', 