Reference: H. P. Luhn, "The Automatic Creation of Literature Abstracts" (1958) — https://courses.ischool.berkeley.edu/i256/f06/papers/luhn58.pdf

In [1]:
import re
import string
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from termcolor import colored

In [2]:
# Fetch the NLTK resources this notebook relies on: 'punkt' for the tokenizers
# and 'stopwords' for the stop-word list loaded via stopwords.words() below.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Sample document to summarize: five paragraphs of English prose about Poland.
# NOTE(review): looks like it was pasted from an encyclopedia article — a stray
# footnote marker "[c]" is still present in the second paragraph.
text = """Poland, officially the Republic of Poland, is a country in Central Europe. It is divided into 16 administrative provinces called voivodeships, covering an area of 313,931 km2 (121,209 sq mi). Poland has a population of 38 million and is the fifth-most populous member state of the European Union. Warsaw is the nation's capital and largest metropolis. Other major cities include Kraków, Gdańsk, Wrocław, Katowice, Łódź, Poznań, Szczecin and Lublin.

Poland has a temperate transitional climate and its territory traverses the Central European Plain, extending from Baltic Sea in the north to Sudeten and Carpathian Mountains in the south. The longest Polish river is the Vistula, and Poland's highest point is Mount Rysy, situated in the Tatra mountain range of the Carpathians. The country is bordered by Lithuania and Russia to the northeast,[c] Belarus and Ukraine to the east, Slovakia and the Czech Republic to the south, and Germany to the west. It also shares maritime boundaries with Denmark and Sweden.

The history of human activity on Polish soil dates to c. 10,000 BC. Culturally diverse throughout late antiquity, the region became inhabited by tribal Polans who gave Poland its name in the early medieval period. The establishment of statehood in 966 coincided with a pagan ruler of the Polans converting to Christianity under the auspices of the Roman Church. The Kingdom of Poland emerged in 1025 and in 1569 cemented its longstanding association with Lithuania, thus forming the Polish–Lithuanian Commonwealth. It was one of the great powers of Europe at the time, with a uniquely liberal political system that adopted Europe's first modern constitution in 1791.

With the passing of a prosperous Polish Golden Age, the country was partitioned by neighbouring states at the end of the 18th century and regained its independence in 1918 as the Second Polish Republic. In September 1939, the invasion of Poland by Germany and the Soviet Union marked the beginning of World War II, which resulted in the Holocaust and millions of Polish casualties. As a member of the Communist Bloc in the global Cold War, the Polish People's Republic was a founding signatory of the Warsaw Pact. Through the emergence and contributions of the Solidarity movement, the communist government was dissolved and Poland re-established itself as a democratic state in 1989.

Poland is a parliamentary republic, with its bicameral legislature comprising the Sejm and the Senate. It is a developed market and a high income economy. Considered a middle power, Poland has the sixth largest economy in the European Union by GDP (nominal) and the fifth largest by GDP (PPP). It provides a very high standard of living, safety and economic freedom, as well as free university education and a universal health care system. The country has 17 UNESCO World Heritage Sites, 15 of which are cultural. Poland is a founding member state of the United Nations, as well as a member of the World Trade Organization, NATO, and the European Union (including the Schengen Area)."""

In [4]:
# Stop list = NLTK's English stop words plus every single punctuation character.
# NOTE: this rebinds the imported `punctuation` string to a list of its characters.
punctuation = list(punctuation)
stop = set(stopwords.words('english')) | set(punctuation)

In [5]:
def process_and_tokenize(text, stopwords_set):
    """Turn raw text into a list of stemmed content words.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    members of ``stopwords_set`` or that are not purely alphabetic (numbers,
    punctuation, hyphenated forms, ...) are discarded; the survivors are
    reduced to their Porter stems.

    Args:
        text: The string to process.
        stopwords_set: Container of lower-case tokens to drop.

    Returns:
        A list of stems, in the order the tokens appeared in ``text``.
    """
    porter = PorterStemmer()
    return [
        porter.stem(token)
        for token in word_tokenize(text.lower())
        if token not in stopwords_set and token.isalpha()
    ]

In [6]:
# Split the sample document into individual sentences.
sentences = nltk.sent_tokenize(text)

In [7]:
# Sanity check: preprocess only the first sentence and inspect the resulting stems.
tokenized = process_and_tokenize(sentences[0], stop)
print(tokenized)

['poland', 'offici', 'republ', 'poland', 'countri', 'central', 'europ']


In [8]:
# Preprocess the whole document in one pass to get the stems used for frequency counting.
tokenized_text = process_and_tokenize(text, stop)

In [9]:
# Count how many times each stem occurs in the document.
fdist = nltk.FreqDist(tokenized_text)

In [10]:
# Display the ten most frequent stems together with their counts.
fdist.most_common(10)

[('poland', 12),
 ('polish', 6),
 ('republ', 5),
 ('countri', 4),
 ('member', 4),
 ('state', 4),
 ('european', 4),
 ('union', 4),
 ('europ', 3),
 ('largest', 3)]

In [11]:
def calculate_sentence(processed_sentence, most_common_words, distance):
    """Score a sentence by its densest cluster of significant words.

    Every position in ``processed_sentence`` whose token is one of
    ``most_common_words`` is treated as significant. Neighbouring significant
    positions that are fewer than ``distance`` tokens apart are merged into a
    cluster, and each cluster is scored as::

        (number of significant words in the cluster) ** 2
        / (number of tokens spanned by the cluster)

    Args:
        processed_sentence: List of preprocessed tokens of one sentence.
        most_common_words: Iterable of tokens considered significant.
        distance: Two consecutive significant positions belong to the same
            cluster when their index difference is strictly less than this.

    Returns:
        The highest cluster score, or ``0`` when the sentence contains no
        significant word.
    """
    significant = set(most_common_words)

    # Record *every* occurrence of a significant word. Looking up only the
    # first occurrence (list.index) would treat repeats as filler and
    # under-score sentences that mention a key term more than once.
    positions = [
        position
        for position, token in enumerate(processed_sentence)
        if token in significant
    ]
    if not positions:
        return 0

    # positions is already ascending, so one pass over adjacent pairs is
    # enough to split it into clusters.
    clusters = [[positions[0]]]
    for previous, current in zip(positions, positions[1:]):
        if current - previous < distance:
            clusters[-1].append(current)
        else:
            clusters.append([current])

    highest_score = 0
    for cluster in clusters:
        span = cluster[-1] - cluster[0] + 1
        score = len(cluster) ** 2 / span
        if score > highest_score:
            highest_score = score

    return highest_score

In [12]:
def summarize(text, n_top_words, stopwords_set, distance, percent_of_most_important_sentences):
    """Pick the highest-scoring sentences of ``text``.

    The text is split into sentences, each sentence is preprocessed with
    ``process_and_tokenize``, and the ``n_top_words`` most frequent stems of
    the whole text are taken as the significant words. Every sentence is then
    scored with ``calculate_sentence`` and the best ones are selected.

    Args:
        text: Document to summarize.
        n_top_words: How many of the most frequent stems count as significant.
        stopwords_set: Tokens to ignore during preprocessing.
        distance: Cluster gap threshold forwarded to ``calculate_sentence``.
        percent_of_most_important_sentences: Share of sentences to keep, as a
            percentage (0-100) of the sentence count, rounded down.

    Returns:
        A list of sentence indexes (positions in ``nltk.sent_tokenize(text)``)
        ordered from highest to lowest score.
    """
    sentences = nltk.sent_tokenize(text)
    processed_sentences = [process_and_tokenize(s, stopwords_set) for s in sentences]

    words = [word for sent in processed_sentences for word in sent]
    fdist = nltk.FreqDist(words)
    most_common_words = [word for word, _count in fdist.most_common(n_top_words)]

    sentence_scores = [
        (index, calculate_sentence(sent, most_common_words, distance))
        for index, sent in enumerate(processed_sentences)
    ]

    # Multiply before dividing: percent/100 is often not exactly representable
    # (e.g. 29/100 * 100 == 28.999...), which would silently drop a sentence.
    # Clamp at 0 so a negative percentage cannot turn the slice below into
    # "everything except the last |n|".
    n = max(0, int(percent_of_most_important_sentences * len(sentences) // 100))

    # sorted() is stable, so sentences with equal scores keep document order.
    ranked = sorted(sentence_scores, key=lambda s: s[1], reverse=True)
    return [index for index, _score in ranked[:n]]
    

In [13]:
# Summarize with the 50 most frequent stems as significant words, a cluster
# gap threshold of 5, and keep the top 50% of sentences.
indexes = summarize(text, 50, stop, 5, 50)

# Print the document sentence by sentence: selected sentences in yellow,
# the rest in black. A set makes the per-sentence membership test O(1).
selected = set(indexes)
for position, sent in enumerate(nltk.sent_tokenize(text)):
    print(colored(sent, 'yellow' if position in selected else 'black'))

[33mPoland, officially the Republic of Poland, is a country in Central Europe.[0m
[33mIt is divided into 16 administrative provinces called voivodeships, covering an area of 313,931 km2 (121,209 sq mi).[0m
[33mPoland has a population of 38 million and is the fifth-most populous member state of the European Union.[0m
[33mWarsaw is the nation's capital and largest metropolis.[0m
[33mOther major cities include Kraków, Gdańsk, Wrocław, Katowice, Łódź, Poznań, Szczecin and Lublin.[0m
[33mPoland has a temperate transitional climate and its territory traverses the Central European Plain, extending from Baltic Sea in the north to Sudeten and Carpathian Mountains in the south.[0m
[30mThe longest Polish river is the Vistula, and Poland's highest point is Mount Rysy, situated in the Tatra mountain range of the Carpathians.[0m
[33mThe country is bordered by Lithuania and Russia to the northeast,[c] Belarus and Ukraine to the east, Slovakia and the Czech Republic to the south, and Ge