## Text Summarizer

In [15]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dwight\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
from collections import Counter
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest

In [22]:
STOPWORDS = set(stopwords.words('english') + list(punctuation))
MIN_WORD_PROP, MAX_WORD_PROP = 0.1, 0.9

def compute_word_frequencies(word_sentences):
    words = [word for sentence in word_sentences 
                     for word in sentence 
                         if word not in STOPWORDS]
    counter = Counter(words)
    limit = float(max(counter.values()))
    word_frequencies = {word: freq/limit 
                                for word,freq in counter.items()}
    # Drop words if too common or too uncommon
    word_frequencies = {word: freq 
                            for word,freq in word_frequencies.items() 
                                if freq > MIN_WORD_PROP 
                                and freq < MAX_WORD_PROP}
    return word_frequencies

def sentence_score(word_sentence, word_frequencies):
    return sum([ word_frequencies.get(word,0) 
                    for word in word_sentence])

def summarize(text:str, num_sentences=3):
    """
    Summarize the text, by return the most relevant sentences
     :text the text to summarize
     :num_sentences the number of sentences to return
    """
    # Make the text lowercase
    text = text.lower()
    
    # Break text into sentences 
    sentences = sent_tokenize(text)
    
    # Break sentences into words
    word_sentences = [word_tokenize(sentence) 
                          for sentence in sentences]
    
    # Compute the word frequencies
    word_frequencies = compute_word_frequencies(word_sentences)
    
    # Calculate the scores for each of the sentences
    scores = [sentence_score(word_sentence, word_frequencies)
                     for word_sentence in word_sentences]
    sentence_scores = list(zip(sentences, scores))
    
    # Rank the sentences
    top_sentence_scores = nlargest(num_sentences, 
                                   sentence_scores,
                                   key=lambda t: t[1])
    
    # Return the top sentences
    return [t[0] for t in top_sentence_scores]

In [18]:
with open('data/PolarVortex.txt', 'r') as vortex_file:
    vortex_article = vortex_file.read()

In [19]:
vortex_article

'On the coldest day in two decades on his fifth-generation dairy farm, Chris Pollack grabbed a thick black hose from the barn and ventured into the subzero cold,\nwhere his beef cattle were chomping cud and waiting for water.\nThe power had briefly gone out the previous morning, long enough to freeze the line that automatically fills the animalsâ€™ heated water trough. Pollack was here to replace it.\n\n"Are you serious?" Pollack said, peering inside the black hose. "Thereâ€™s water frozen in the end already."\nHe lifted it up to a small space heater and waited for it to thaw.\nSuch is life in the Deep Freeze of 2019.\nThe past 48 hours in the American Midwest have been about endurance, as a breathtaking cold settled in over a massive stretch of the country. \nThe record-setting frigid temperatures, some of the coldest on the planet Thursday, have frozen the Great Lakes, taxed electrical and natural gas infrastructure,\n endangered livestock and tested the mettle of millions who are us

In [24]:
len(sent_tokenize(vortex_article))

12

In [20]:
summarize(vortex_article)

['in some areas thursday, temperatures dropped below minus-50 degrees, and the extreme weather was blamed for several deaths across the region,\n including people who appear to have frozen to death in milwaukee, detroit and rochester, minn.\nfrom minnesota to new york, the polar vortex again prompted school closures, mail service interruptions and thousands of flight cancellations, \nmany of them in and out of chicago, which appeared otherworldly in a coating of frost and ice.',
 'the record-setting frigid temperatures, some of the coldest on the planet thursday, have frozen the great lakes, taxed electrical and natural gas infrastructure,\n endangered livestock and tested the mettle of millions who are used to the cold but had never experienced anything like this.',
 'on the coldest day in two decades on his fifth-generation dairy farm, chris pollack grabbed a thick black hose from the barn and ventured into the subzero cold,\nwhere his beef cattle were chomping cud and waiting for wa

In [26]:
summarize(vortex_article, num_sentences=1)

['in some areas thursday, temperatures dropped below minus-50 degrees, and the extreme weather was blamed for several deaths across the region,\n including people who appear to have frozen to death in milwaukee, detroit and rochester, minn.\nfrom minnesota to new york, the polar vortex again prompted school closures, mail service interruptions and thousands of flight cancellations, \nmany of them in and out of chicago, which appeared otherworldly in a coating of frost and ice.']