In [1]:
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def parse_hn_article(url):
    """Download a page and return the text of every element with class ``commtext``.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        list[str]: ``tag.text`` for each tag returned by
        ``find_all(class_='commtext')``, in the order ``find_all`` yields them.
        Empty when the page contains no such element.

    Any exception raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    # The context manager guarantees the HTTP response (and its socket) is
    # closed even when reading the body fails.
    with urllib.request.urlopen(url) as response:
        raw_byte_stream = response.read()
    parsed_html = BeautifulSoup.BeautifulSoup(raw_byte_stream, 'html.parser')
    return [tag.text for tag in parsed_html.find_all(class_='commtext')]

In [3]:
import re, string

# Precompiled character class matching any single character listed in
# string.punctuation; re.escape makes the regex metacharacters in that list literal.
# NOTE(review): no function visible in this notebook references `regex`.
regex = re.compile('[%s]' % re.escape(string.punctuation))

def has_alpha(s):
    """Report whether ``s`` contains at least one ASCII letter (a-z or A-Z).

    Returns:
        bool: True when a letter is found, False otherwise.
    """
    if re.search('[a-zA-Z]', s):
        return True
    return False

def has_punctuation(s):
    """Search ``s`` for the first character listed in ``string.punctuation``.

    Returns:
        The ``re.Match`` for the first punctuation character, or ``None`` when
        ``s`` contains none. Callers may use the result as a truth value.
    """
    # string.punctuation contains ``\`` immediately followed by ``]``. Inserted
    # raw into a character class, that pair is read as an escaped ``]`` and the
    # backslash itself is never matched. re.escape keeps every character literal.
    return re.search('[%s]' % re.escape(string.punctuation), s)

def has_number(s):
    """Search ``s`` for the first ASCII digit (0-9).

    Returns:
        The ``re.Match`` for the first digit, or ``None`` when ``s`` has none.
    """
    digit_match = re.search("[0-9]", s)
    return digit_match

def process_word(w, stemmer, swords):
    """Normalize one token to its stem, or return ``None`` if it is discarded.

    Args:
        w: Token to process; it is lower-cased before any check.
        stemmer: Object exposing ``stem(str) -> str``.
        swords: Container of stop words, tested with ``in``.

    Returns:
        The stemmed, lower-cased token, or ``None`` when the token:
          * is a stop word (before or after stemming),
          * contains no ASCII letter,
          * is a single character, or
          * is two characters long and contains punctuation or a digit.
    """
    wl = w.lower()
    if wl in swords:
        return None
    # Lengths are compared with ``==``. ``is`` tests object identity, which only
    # happens to work for small ints in CPython and triggers a SyntaxWarning
    # on Python 3.8+.
    if not has_alpha(wl) or len(wl) == 1:
        return None
    if (has_punctuation(wl) or has_number(wl)) and len(wl) == 2:
        return None
    stemmed = stemmer.stem(wl)
    # Stemming can collapse a word onto a stop word, so check again.
    if stemmed in swords:
        return None
    return stemmed


In [4]:
def create_freq_dict(comments) -> dict:
    """Count how often each retained stem occurs across all comments.

    The comments are joined with single spaces and tokenized with
    ``word_tokenize``. Each token goes through ``process_word`` with a
    ``PorterStemmer`` and the English stop-word set; tokens for which
    ``process_word`` returns ``None`` are ignored.

    Args:
        comments: Iterable of comment strings.

    Returns:
        dict mapping each stem to its number of occurrences.
    """
    english_stops = set(stopwords.words("english"))
    tokens = word_tokenize(' '.join(comments))
    porter = PorterStemmer()
    counts = {}
    for token in tokens:
        stem = process_word(token, porter, english_stops)
        if stem is not None:
            counts[stem] = counts.get(stem, 0) + 1
    return counts

In [5]:
def sentence_scores(sentences, freq_table) -> dict:
    """Score each sentence by the mean frequency of the table terms it contains.

    A term counts as present when it occurs as a substring of the lower-cased
    sentence (no tokenization is performed here).

    Args:
        sentences: Iterable of sentence strings.
        freq_table: Mapping of term -> frequency.

    Returns:
        dict mapping ``hash(sentence)`` to the average frequency of the terms
        found in that sentence. Sentences containing no term are omitted.
    """
    weights = dict()
    for sent in sentences:
        # Lower-case once per sentence instead of once per table entry.
        lowered = sent.lower()
        matched = [freq_table[term] for term in freq_table if term in lowered]
        if not matched:
            continue
        # The score is computed from scratch and assigned rather than
        # accumulated, so a repeated sentence gets the same score every time
        # instead of stacking new totals on top of its earlier average.
        weights[hash(sent)] = sum(matched) / len(matched)
    return weights

In [6]:
def average_sentence_score(weights) -> float:
    """Return the arithmetic mean of the values in ``weights``.

    Args:
        weights: Mapping whose values are numeric scores.

    Returns:
        float: The mean score, or ``0.0`` when ``weights`` is empty, so that
        input with no scored sentence does not raise ``ZeroDivisionError``.
    """
    if not weights:
        return 0.0
    return sum(weights.values()) / len(weights)

In [7]:
def generate_summary(sentences, weights, threshold, sent_count=20):
    """Join the highest-scoring sentences into a single summary string.

    Args:
        sentences: Iterable of sentence strings.
        weights: Mapping of ``hash(sentence)`` -> score.
        threshold: Minimum score (inclusive) a sentence needs to be eligible.
        sent_count: Maximum number of sentences in the summary.

    Returns:
        str: Eligible sentences ordered by descending score (ties keep their
        first-seen order), limited to ``sent_count`` and joined with spaces.
        Empty string when nothing qualifies.
    """
    # One entry per hash: the position is fixed by the first occurrence, while
    # the stored sentence and score are refreshed on later occurrences.
    eligible = {}
    for sentence in sentences:
        key = hash(sentence)
        if key not in weights:
            continue
        score = weights[key]
        if not score >= threshold:
            continue
        eligible[key] = (sentence, score)

    # sorted() is stable, also with reverse=True, so equal scores keep order.
    ranked = sorted(eligible.values(), key=lambda pair: pair[1], reverse=True)
    # Clamp at zero so a negative sent_count yields nothing instead of slicing
    # from the end.
    chosen = ranked[:max(0, sent_count)]
    return ' '.join(sentence for sentence, _ in chosen)

In [8]:
# Fetch the comment texts for Hacker News item 22552632 (performs a network request).
comments = parse_hn_article('https://news.ycombinator.com/item?id=22552632')

In [9]:
def summarize_hn_thread(comments, threshold_multiplier=2, sent_count=20):
    """Build an extractive summary from a list of comment strings.

    Steps: build a stem-frequency table over all comments, split every comment
    into sentences, score the sentences against the table, then keep the
    sentences whose score is at least ``threshold_multiplier`` times the
    average score.

    Args:
        comments: Iterable of comment strings.
        threshold_multiplier: Factor applied to the average sentence score to
            obtain the cut-off. Defaults to 2, the previously hard-coded value.
        sent_count: Maximum number of sentences in the summary. Defaults to 20.

    Returns:
        str: The selected sentences joined by spaces, or ``''`` when no
        sentence received a score.
    """
    frequency_table = create_freq_dict(comments)
    sents = [sent for comment in comments for sent in sent_tokenize(comment)]
    scores = sentence_scores(sents, frequency_table)
    if not scores:
        # Nothing was scored (e.g. no comments), so there is no average to
        # compute and nothing to select.
        return ''
    threshold = average_sentence_score(scores)
    return generate_summary(sents, scores, threshold_multiplier * threshold, sent_count)

# Summarize the comments currently bound to `comments` and keep the result for display.
summ = summarize_hn_thread(comments)

In [10]:
# Bare last expression: the notebook renders the summary string as the cell output.
summ

"You can't. I'm guessing that isn't you. You can't sell what you don't have. No, it wasn't secrecy that killed them. Nobody wanted it. None. I have both a Magic Leap One and a HoloLens 1. Their problem hasn't been timing. i don't know what your company is. That you don't see it hyped on TV doesn't mean it is not there. I don't want a voice telling me what to do. The goal isn't to own a product. Don't write off VR just yet. It uses magnetic tracking and it just doesn't do well. Oh man, I didn't know he worked there... don't you need to be able to see what's actually going on and not be in VR land? I've got news for Magic Leap. General Magic, GetMagic.com, and now this (not that we didn't see it coming)... AR has no use case. > AR has no use case."

In [11]:
# Run the same pipeline on a second thread (item 22547283; performs a network request).
# NOTE(review): this rebinds `comments`, so re-running an earlier cell that reads
# `comments` after this one will operate on this thread's data instead.
comments = parse_hn_article('https://news.ycombinator.com/item?id=22547283')

# Last expression of the cell, so the returned summary is shown as the cell output.
summarize_hn_thread(comments)

'They couldn\'t do it. I just don\'t know very many. The US hasn\'t even started testing people. Don\'t defend this. I did just that. Much like in the US. Less than 50 people have the virus. The Chinese are just people, like the rest of us. That didn\'t work out too well. If you don\'t mind, Where did you find this information? Haven\'t seen much over at ORF.at. I don\'t even look at it anymore. If people need stuff, they’ll go out. It isn\'t an "online forum". It isn\'t hard to parse at all. Isn\'t this happening in many places? Japan didn\'t really test much. The fact that people outside of China don\'t know what WeChat is make sense. Regulation doesn\'t have to be binary. It likely will.'