In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def get_word_frequencies(text):
    """
    Calculates the frequency of each word in the text
    """
    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words]
    freq = nltk.FreqDist(words)
    return freq

In [3]:
def get_sentence_scores(text, freq):
    """
    Calculates the score of each sentence in the text
    """
    sentences = sent_tokenize(text)
    scores = []
    for sentence in sentences:
        sentence_score = 0
        sentence_words = [word.lower() for word in word_tokenize(sentence) if word.isalpha()]
        for word in sentence_words:
            sentence_score += freq[word]
        sentence_score /= len(sentence_words)
        scores.append((sentence, sentence_score))
    return scores

In [4]:
def summarize(text, length):
    """
    Summarizes the text to the specified length using the SumBasic algorithm
    """
    freq = get_word_frequencies(text)
    summary = []
    while len(summary) < length:
        sentence_scores = get_sentence_scores(text, freq)
        top_sentence = max(sentence_scores, key=lambda x: x[1])[0]
        summary.append(top_sentence)
        # update frequency distribution by reducing frequency of words in the selected sentence
        for word in word_tokenize(top_sentence):
            if word.isalpha():
                freq[word.lower()] -= 1
    return ' '.join(summary)

In [5]:
text = """
Astronomers have used the James Webb Space Telescope to peer back in time to the early days of the universe — and they spotted something unexpected.
The space observatory revealed six massive galaxies that existed between 500 million and 700 million years after the big bang that created the universe. The discovery is completely upending existing theories about the origins of galaxies, according to a new study published Wednesday in the journal Nature.
“These objects are way more massive​ than anyone expected,” said study coauthor Joel Leja, assistant professor of astronomy and astrophysics at Penn State University, in a statement. “We expected only to find tiny, young, baby galaxies at this point in time, but we’ve discovered galaxies as mature as our own in what was previously understood to be the dawn of the universe.”
The telescope observes the universe in infrared light, which is invisible to the human eye, and is capable of detecting the faint light from ancient stars and galaxies. By peering into the distant universe, the observatory can essentially see back in time up to about 13.5 billion years ago. (Scientists have determined the universe is about 13.7 billion years old.)
"""

In [10]:
summary = summarize(text, 3)
print(summary)

(Scientists have determined the universe is about 13.7 billion years old.) The space observatory revealed six massive galaxies that existed between 500 million and 700 million years after the big bang that created the universe. “These objects are way more massive​ than anyone expected,” said study coauthor Joel Leja, assistant professor of astronomy and astrophysics at Penn State University, in a statement.
