## Guided Project: Hacker News Pipeline
* **About:** We will build a Python pipeline that runs a sequence of basic natural language processing tasks on a dataset of Hacker News top stories from 2014.
* **Goal:** Find the top 100 keywords of Hacker News posts in 2014.

In [1]:
##########################
# Importing Pipeline class
##########################
from pipeline import Pipeline
# Single Pipeline instance shared by the cells below: every step registers
# itself on it through the @pipeline.task(...) decorator.
pipeline = Pipeline()

In [2]:
#######################
# Loading the JSON Data
#######################
import json
@pipeline.task()
def file_to_json():
    """Load the Hacker News stories from ``hn_stories_2014.json``.

    The file is looked up relative to the current working directory and must
    contain a JSON object with a top-level ``"stories"`` key.

    Returns:
        The value stored under ``"stories"`` (expected to be a list of dict
        objects, one per story).

    Raises:
        FileNotFoundError: If ``hn_stories_2014.json`` does not exist.
        KeyError: If the JSON document has no ``"stories"`` key.
    """
    # The context manager closes the handle even when parsing fails, and the
    # explicit encoding keeps the result independent of the platform default.
    with open("hn_stories_2014.json", encoding="utf-8") as file:
        # returning a list of dict objects
        return json.load(file)["stories"]

In [3]:
####################################
# Filtering the most popular stories
####################################
@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    """Lazily yield the popular stories that are not "Ask HN" posts.

    A story is kept when its title does not start with "ask hn" (compared
    case-insensitively), it has more than 50 points, and it has more than
    one comment.

    Args:
        stories: Iterable of dicts providing the ``"title"``, ``"points"``
            and ``"num_comments"`` keys.

    Yields:
        The matching story dicts, unchanged and in their original order.
    """
    for entry in stories:
        is_ask_hn = entry["title"].lower().startswith('ask hn')
        # Short-circuits in the order title -> points -> comments, so the
        # later keys are only read for stories that passed the earlier checks.
        if not is_ask_hn and entry["points"] > 50 and entry["num_comments"] > 1:
            yield entry

In [4]:
################
# Convert to CSV
################
from pipeline import build_csv
import io
from datetime import datetime
@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories_filtered):
    """Turn the filtered stories into CSV rows written to an in-memory file.

    Only the objectID, created_at, url, points and title fields are kept, in
    that column order. ``created_at`` is parsed into a ``datetime`` using the
    ``%Y-%m-%dT%H:%M:%SZ`` format before being handed over.

    Args:
        stories_filtered: Iterable of story dicts containing the five fields
            listed above.

    Returns:
        Whatever ``build_csv`` returns for the given rows, header and
        ``io.StringIO`` target.
        NOTE(review): presumably the populated file object -- confirm against
        pipeline.build_csv.
    """
    columns = ['objectID', 'created_at', 'url', 'points', 'title']

    def as_row(story):
        # keeping only the relevant fields, in header order
        return [
            story["objectID"],
            # Parsing the created_at field using datetime.datetime
            datetime.strptime(story["created_at"], "%Y-%m-%dT%H:%M:%SZ"),
            story["url"],
            story["points"],
            story["title"],
        ]

    # Rows stay lazy: each one is only built when build_csv consumes it.
    rows = (as_row(story) for story in stories_filtered)

    # returning the file object
    return build_csv(rows, header=columns, file=io.StringIO())

In [5]:
######################
# Extract Title Column
######################
import csv
@pipeline.task(depends_on=json_to_csv)
def extract_titles(file):
    """Read a CSV file object and lazily produce the title of every row.

    The first row is treated as the header and discarded. The title is taken
    from the fifth column (index 4) of each remaining row.

    Args:
        file: Readable text file object (or any iterable of CSV lines)
            positioned at the start of the CSV data.

    Returns:
        A generator of str objects, one title per data row. An empty input
        produces an empty generator.

    Raises:
        IndexError: While iterating, if a data row has fewer than five
            columns.
    """
    title_column = 4
    reader = csv.reader(file)

    # Skipping the header; the default keeps an empty file from leaking a
    # bare StopIteration out of this function.
    next(reader, None)

    # returning a generator of str objects
    return (row[title_column] for row in reader)

In [6]:
# Clean the Titles
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Lower-case each title and strip its ASCII punctuation.

    Args:
        titles: Iterable of str objects.

    Yields:
        One cleaned str per input title, in the same order: converted to
        lower case with every character of ``string.punctuation`` removed.
    """
    # Imported locally so this step does not depend on a later notebook cell
    # having run its own ``import string`` first.
    import string

    # One deletion table built up front; translate() then removes all
    # punctuation in a single pass instead of rebuilding the string
    # character by character.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    for title in titles:
        # returning a generator of str objects
        yield title.lower().translate(strip_punctuation)

In [7]:
# Create the Word Frequency Dictionary
from stop_words import stop_words
import string

@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(cleaned_titles):
    """Count how often each non-stop word occurs across the titles.

    Titles are split on whitespace. Every word is lower-cased before it is
    checked against ``stop_words`` and before it is counted, so the key that
    is tested, created and incremented is always the same one.

    Args:
        cleaned_titles: Iterable of str objects (titles).

    Returns:
        A dict mapping each lower-cased word to its number of occurrences.
        Keys appear in order of first occurrence.
    """
    keyword_dictionary = {}

    for title in cleaned_titles:
        # split() without arguments never yields empty strings, so no extra
        # emptiness check is needed.
        for word in title.split():
            # Normalise once: counting under a different key than the one
            # that was initialised would raise KeyError for mixed-case words.
            word = word.lower()
            if word in stop_words:
                continue
            keyword_dictionary[word] = keyword_dictionary.get(word, 0) + 1
    return keyword_dictionary

In [8]:
# Sort the Top Words
@pipeline.task(depends_on=build_keyword_dictionary)
def get_top_words(keyword_dictionary):
    """Return the 100 most frequent words, most used first.

    Args:
        keyword_dictionary: Dict mapping a word to its frequency.

    Returns:
        A list of at most 100 ``(word, frequency)`` tuples ordered by
        decreasing frequency. Words with equal frequency keep the order in
        which they appear in the dictionary.
    """
    # converting dictionary to a list of (word, frequency) tuples
    ranked = list(keyword_dictionary.items())

    # sorting in place from most used to least used; the sort is stable, so
    # ties stay in dictionary order
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:100]

In [9]:
# Running the pipeline.
# NOTE(review): run() presumably executes every registered task in dependency
# order -- confirm in pipeline.py.
completed = pipeline.run()
# The result is indexed by task function; print the get_top_words output.
print(completed[get_top_words])

[('new', 185), ('google', 167), ('bitcoin', 101), ('open', 92), ('programming', 90), ('web', 88), ('data', 85), ('video', 79), ('python', 76), ('code', 72), ('facebook', 71), ('released', 71), ('using', 70), ('2013', 65), ('javascript', 65), ('free', 64), ('source', 64), ('game', 63), ('internet', 62), ('microsoft', 59), ('c', 59), ('linux', 58), ('app', 57), ('pdf', 55), ('work', 54), ('language', 54), ('software', 52), ('2014', 52), ('startup', 51), ('apple', 50), ('use', 50), ('make', 50), ('time', 48), ('yc', 48), ('security', 48), ('nsa', 45), ('github', 45), ('windows', 44), ('1', 41), ('world', 41), ('way', 41), ('like', 41), ('project', 40), ('computer', 40), ('heartbleed', 40), ('git', 37), ('users', 37), ('dont', 37), ('design', 37), ('ios', 37), ('developer', 36), ('os', 36), ('twitter', 36), ('ceo', 36), ('vs', 36), ('life', 36), ('big', 35), ('day', 35), ('android', 34), ('online', 34), ('years', 33), ('simple', 33), ('court', 33), ('guide', 32), ('learning', 32), ('mt', 3

## Assessment
* Some of the most relevant keywords of Hacker News posts in 2014:
    * google
    * bitcoin
    * web
    * data
    * python
    * facebook
    * javascript