### Abstract Extraction
* Find the most important words (word importance = word frequency)
* Compute a significance score for sentences based on words they contain (sentence importance = sum(word importance))
* Pick the top most significant sentences

### Downloading an Article

In [22]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

#downloading the page
articleURL = "https://phys.org/news/2017-09-astronomers-mystery-white-dwarf-mass.html"
page = urlopen(articleURL).read().decode('utf8', 'ignore')
soup = BeautifulSoup(page, 'lxml')
soup

<!DOCTYPE html>
<html>
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=1024" name="viewport"/>
<title>Astronomers resolve mystery of white dwarf's mass</title>
<meta content="Physics News, Science news, Technology News, Physics, Materials, Nanotech, Technology, Science" http-equiv="Keywords" name="Keywords"/>
<meta content="New observations of the white dwarf/red dwarf binary star 40 Eridani BC by astronomers at the U.S. Naval Observatory (USNO) have revealed new, definitive values for the orbital period and masses of the components of this ..." http-equiv="Description" name="Description"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="en-us" http-equiv="content-language"/>
<meta content="INDEX,FOLLOW" name="ROBOTS"/>
<meta content="noodp" name="ROBOTS"/>
<link href="https://phys.org/news/2017-09-astronomers-mystery-white-dwarf-mass.html" rel="canonical"/>
<link href="https://m.phys.org/news/2017-09-astronomers-my

In [23]:
# soup.find('article')

In [24]:
# soup.find('article').text

In [25]:
#get separate words from article
text = ' '.join(map(lambda p: p.text, soup.find_all('article')))

# removing special characters
text.encode('ascii', 'ignore')
text.replace("?", "")
text.replace('\n', '')

'        This figure illustrates the new orbital solution, plotted together with all published data in the Washington Double Star database as well as the heretofore unpublished data in the recent speckle measurements. In this figure, micrometric observations are indicated by green plus signs, photographic measures by purple asterisks, adaptive optics by blue filled circles, CCD measures by purple triangles and the four new speckle measures as blue stars. A dot-dash line indicates the line of nodes, and a curved arrow in the lower right corner indicates the direction of orbital motion. The scale, in arcseconds, is given on the left and bottom axis. Finally, the previous orbit calculation is shown as a dashed ellipse. Credit: U.S. Naval Observatory    New observations of the white dwarf/red dwarf binary star 40 Eridani BC by astronomers at the U.S. Naval Observatory (USNO) have revealed new, definitive values for the orbital period and masses of the components of this interesting stellar

In [26]:
import string

def getText(url):
    page = urlopen(url).read().decode('utf8', 'ignore')
    soup = BeautifulSoup(page, 'lxml')
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    text.encode('ascii', 'ignore')
    #text = text.translate(string.make ("\n\t\r", "   "))
    text = text.replace('\xa0', ' ')
    text = ' '.join(text.split())
    return text

In [27]:
text = getText(articleURL)

In [28]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sents = sent_tokenize(text)

In [29]:
word_sent = word_tokenize(text.lower())
_stopwords = set(stopwords.words('english') + list(punctuation))
word_sent = [word for word in word_sent if word not in _stopwords]
word_sent

['figure',
 'illustrates',
 'new',
 'orbital',
 'solution',
 'plotted',
 'together',
 'published',
 'data',
 'washington',
 'double',
 'star',
 'database',
 'well',
 'heretofore',
 'unpublished',
 'data',
 'recent',
 'speckle',
 'measurements',
 'figure',
 'micrometric',
 'observations',
 'indicated',
 'green',
 'plus',
 'signs',
 'photographic',
 'measures',
 'purple',
 'asterisks',
 'adaptive',
 'optics',
 'blue',
 'filled',
 'circles',
 'ccd',
 'measures',
 'purple',
 'triangles',
 'four',
 'new',
 'speckle',
 'measures',
 'blue',
 'stars',
 'dot-dash',
 'line',
 'indicates',
 'line',
 'nodes',
 'curved',
 'arrow',
 'lower',
 'right',
 'corner',
 'indicates',
 'direction',
 'orbital',
 'motion',
 'scale',
 'arcseconds',
 'given',
 'left',
 'bottom',
 'axis',
 'finally',
 'previous',
 'orbit',
 'calculation',
 'shown',
 'dashed',
 'ellipse',
 'credit',
 'u.s.',
 'naval',
 'observatory',
 'new',
 'observations',
 'white',
 'dwarf/red',
 'dwarf',
 'binary',
 'star',
 '40',
 'eridani',


### Construct a frequency distribution of words

In [30]:
from nltk.probability import FreqDist
freq = FreqDist(word_sent)

In [31]:
from heapq import nlargest
nlargest(10, freq, key = freq.get)

['star', 'dwarf', 'white', '2018', 'stars', 'new', '``', "''", '40', '...']

### Using default dicionary to rank sentences

In [32]:
from collections import defaultdict
ranking = defaultdict(int)

for i,sent in enumerate(sents):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] += freq[w]
            
sents_idx = nlargest(4, ranking, key = ranking.get)
sents_idx

[36, 35, 40, 44]

In [33]:
[sents[j] for j in sorted(sents_idx)]

["20 hours ago Sun's magnetic field Aug 30, 2018 Synchronous orbits Aug 30, 2018 More from Astronomy and Astrophysics Astrophysicist predicts detached, eclipsing white dwarfs to merge into exotic star August 18, 2017 A University of Oklahoma astrophysicist, Mukremin Kilic, and his team have discovered two detached, eclipsing double white dwarf binaries with orbital periods of 40 and 46 minutes, respectively.",
 'White dwarfs are the remnants ... A very rare discovery: Failed star orbits a dead star every 71 minutes June 9, 2017 An international team of astronomers using data from the rejuvenated Kepler space telescope have discovered a rare gem: A binary system consisting of a failed star, also known as a brown dwarf, and the remnant of a dead star ... Binary white dwarf stars May 4, 2011 (PhysOrg.com) -- When a star like our sun gets to be very old, after another seven billion years or so, it will no longer be able to sustain burning its nuclear fuel.',
 "The new findings, presented F