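Extractive text summarization with TextRank, following Mihalcea & Tarau, "TextRank: Bringing Order into Texts" (EMNLP 2004): https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf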

In [1]:
import re
import string
import nltk
import numpy as np
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from termcolor import colored
from scipy.spatial.distance import cosine
import networkx as nx

In [2]:
# Fetch the tokenizer models and the stop-word lists used below.
# NLTK 3.9+ loads the Punkt tokenizer from the 'punkt_tab' resource instead of
# the pickled 'punkt' one, so request both to stay runnable on old and new
# NLTK releases (already-present packages are simply reported as up-to-date).
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
text = """Poland, officially the Republic of Poland, is a country in Central Europe. It is divided into 16 administrative provinces called voivodeships, covering an area of 313,931 km2 (121,209 sq mi). Poland has a population of 38 million and is the fifth-most populous member state of the European Union. Warsaw is the nation's capital and largest metropolis. Other major cities include Kraków, Gdańsk, Wrocław, Katowice, Łódź, Poznań, Szczecin and Lublin.

Poland has a temperate transitional climate and its territory traverses the Central European Plain, extending from Baltic Sea in the north to Sudeten and Carpathian Mountains in the south. The longest Polish river is the Vistula, and Poland's highest point is Mount Rysy, situated in the Tatra mountain range of the Carpathians. The country is bordered by Lithuania and Russia to the northeast,[c] Belarus and Ukraine to the east, Slovakia and the Czech Republic to the south, and Germany to the west. It also shares maritime boundaries with Denmark and Sweden.

The history of human activity on Polish soil dates to c. 10,000 BC. Culturally diverse throughout late antiquity, the region became inhabited by tribal Polans who gave Poland its name in the early medieval period. The establishment of statehood in 966 coincided with a pagan ruler of the Polans converting to Christianity under the auspices of the Roman Church. The Kingdom of Poland emerged in 1025 and in 1569 cemented its longstanding association with Lithuania, thus forming the Polish–Lithuanian Commonwealth. It was one of the great powers of Europe at the time, with a uniquely liberal political system that adopted Europe's first modern constitution in 1791.

With the passing of a prosperous Polish Golden Age, the country was partitioned by neighbouring states at the end of the 18th century and regained its independence in 1918 as the Second Polish Republic. In September 1939, the invasion of Poland by Germany and the Soviet Union marked the beginning of World War II, which resulted in the Holocaust and millions of Polish casualties. As a member of the Communist Bloc in the global Cold War, the Polish People's Republic was a founding signatory of the Warsaw Pact. Through the emergence and contributions of the Solidarity movement, the communist government was dissolved and Poland re-established itself as a democratic state in 1989.

Poland is a parliamentary republic, with its bicameral legislature comprising the Sejm and the Senate. It is a developed market and a high income economy. Considered a middle power, Poland has the sixth largest economy in the European Union by GDP (nominal) and the fifth largest by GDP (PPP). It provides a very high standard of living, safety and economic freedom, as well as free university education and a universal health care system. The country has 17 UNESCO World Heritage Sites, 15 of which are cultural. Poland is a founding member state of the United Nations, as well as a member of the World Trade Organization, NATO, and the European Union (including the Schengen Area)."""

In [4]:
# Filter set handed to process_and_tokenize: NLTK's English stop words plus
# every character of string.punctuation.
english_stop_words = stopwords.words('english')
punctuation = list(punctuation)
stop = set(english_stop_words) | set(punctuation)

In [5]:
def process_and_tokenize(text, stopwords_set):
    """Turn a piece of text into a list of Porter stems.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only when it is purely alphabetic (``str.isalpha``) and is not a member of
    ``stopwords_set``; every kept token is then stemmed.

    Args:
        text: the string to process.
        stopwords_set: container of lower-case tokens to discard.

    Returns:
        list of stemmed tokens, in their original order (duplicates preserved).
    """
    porter = PorterStemmer()
    lowered_tokens = word_tokenize(text.lower())
    return [
        porter.stem(token)
        for token in lowered_tokens
        # isalpha() also drops numbers, punctuation and hyphenated tokens.
        if token.isalpha() and token not in stopwords_set
    ]

In [6]:
def cosine_similarity(sent_1_tokenized, sent_2_tokenized):
    """Return the cosine similarity of two tokenized sentences.

    Each sentence is represented as a term-count vector over the combined
    vocabulary of both sentences; the result is ``1 - cosine distance``.

    Args:
        sent_1_tokenized: list of tokens of the first sentence.
        sent_2_tokenized: list of tokens of the second sentence.

    Returns:
        The similarity as a float. It is 0.0 when the sentences share no
        token, and also when either sentence has no tokens at all.
    """
    # An empty sentence maps to an all-zero vector, for which the cosine
    # distance is undefined. Define the similarity as 0.0 so that no NaN can
    # end up in the similarity matrix.
    if not sent_1_tokenized or not sent_2_tokenized:
        return 0.0

    # Assign every distinct token a column once, instead of searching a list
    # with .index() for each token occurrence.
    column_of = {}
    for token in sent_1_tokenized + sent_2_tokenized:
        column_of.setdefault(token, len(column_of))

    vector_1 = [0] * len(column_of)
    vector_2 = [0] * len(column_of)
    for token in sent_1_tokenized:
        vector_1[column_of[token]] += 1
    for token in sent_2_tokenized:
        vector_2[column_of[token]] += 1

    return 1 - cosine(vector_1, vector_2)

In [7]:
# Split the article into sentences for the exploratory cells below.
sentences = nltk.sent_tokenize(text)

In [8]:
# Stemmed token lists of the first two sentences, used to try out cosine_similarity.
s1 = process_and_tokenize(sentences[0], stop)
s2 = process_and_tokenize(sentences[1], stop)

In [9]:
# Inspect which stems survived stop-word and non-alphabetic filtering.
print(s1)
print(s2)

['poland', 'offici', 'republ', 'poland', 'countri', 'central', 'europ']
['divid', 'administr', 'provinc', 'call', 'voivodeship', 'cover', 'area', 'sq', 'mi']


In [10]:
# Sentences 0 and 1 have no stem in common, so the expected similarity is 0.
cosine_similarity(s1, s2)

0.0

In [11]:
# Raw text of the first sentence.
sentences[0]

'Poland, officially the Republic of Poland, is a country in Central Europe.'

In [12]:
# Raw text of the third sentence, which is compared with the first one next.
sentences[2]

'Poland has a population of 38 million and is the fifth-most populous member state of the European Union.'

In [13]:
# Token list of sentence 0 (s1 is rebound here to the same value it had before).
s1 = process_and_tokenize(sentences[0], stop)
print(s1)

['poland', 'offici', 'republ', 'poland', 'countri', 'central', 'europ']


In [14]:
# s2 now holds the token list of sentence 2 (it previously held sentence 1).
s2 = process_and_tokenize(sentences[2], stop)
print(s2)

['poland', 'popul', 'million', 'popul', 'member', 'state', 'european', 'union']


In [15]:
# Sentences 0 and 2 share the stem 'poland', so the similarity is non-zero.
cosine_similarity(s1, s2)

0.21081851067789192

In [16]:
# Sanity check on two short hand-written sentences that overlap in the words
# "similar" and "sentence". Note that s1/s2 hold raw strings from here on.
s1 = "very similar sentence"
s2 = "similar sentence to previous"
tokens_1 = process_and_tokenize(s1, stop)
tokens_2 = process_and_tokenize(s2, stop)
cosine_similarity(tokens_1, tokens_2)

0.816496580927726

In [17]:
def create_numpy_matrix(sentences_tokenized):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences_tokenized: list of token lists, one per sentence.

    Returns:
        An ``(n, n)`` NumPy array where entry ``[row][col]`` is
        ``cosine_similarity`` of sentences ``row`` and ``col``. The diagonal is
        left at 0 so a sentence never counts as similar to itself.
    """
    n = len(sentences_tokenized)
    matrix = np.zeros((n, n))
    for row in range(n):
        # Cosine similarity is symmetric, so compute each unordered pair once
        # (upper triangle only) and mirror it, halving the number of calls.
        for col in range(row + 1, n):
            similarity = cosine_similarity(sentences_tokenized[row], sentences_tokenized[col])
            matrix[row, col] = similarity
            matrix[col, row] = similarity
    return matrix

In [18]:
# Stemmed token list for every sentence of the article.
sent_tokenized = [process_and_tokenize(s, stop) for s in nltk.sent_tokenize(text)]

In [19]:
# Dump all token lists for inspection (long output).
print(sent_tokenized)

[['poland', 'offici', 'republ', 'poland', 'countri', 'central', 'europ'], ['divid', 'administr', 'provinc', 'call', 'voivodeship', 'cover', 'area', 'sq', 'mi'], ['poland', 'popul', 'million', 'popul', 'member', 'state', 'european', 'union'], ['warsaw', 'nation', 'capit', 'largest', 'metropoli'], ['major', 'citi', 'includ', 'kraków', 'gdańsk', 'wrocław', 'katowic', 'łódź', 'poznań', 'szczecin', 'lublin'], ['poland', 'temper', 'transit', 'climat', 'territori', 'travers', 'central', 'european', 'plain', 'extend', 'baltic', 'sea', 'north', 'sudeten', 'carpathian', 'mountain', 'south'], ['longest', 'polish', 'river', 'vistula', 'poland', 'highest', 'point', 'mount', 'rysi', 'situat', 'tatra', 'mountain', 'rang', 'carpathian'], ['countri', 'border', 'lithuania', 'russia', 'northeast', 'c', 'belaru', 'ukrain', 'east', 'slovakia', 'czech', 'republ', 'south', 'germani', 'west'], ['also', 'share', 'maritim', 'boundari', 'denmark', 'sweden'], ['histori', 'human', 'activ', 'polish', 'soil', 'date'

In [20]:
# Pairwise cosine similarities between all sentences.
matrix = create_numpy_matrix(sent_tokenized)

In [21]:
# Square matrix: one row and one column per sentence.
matrix.shape

(24, 24)

In [22]:
# Show the full similarity matrix (zeros on the diagonal).
matrix

array([[0.        , 0.        , 0.21081851, 0.        , 0.        ,
        0.24253563, 0.17817416, 0.17213259, 0.        , 0.        ,
        0.16666667, 0.        , 0.21081851, 0.16666667, 0.15713484,
        0.16666667, 0.09245003, 0.21081851, 0.35355339, 0.        ,
        0.15294382, 0.        , 0.13608276, 0.15294382],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.07647191],
       [0.21081851, 0.        , 0.        , 0.        , 0.        ,
        0.153393  , 0.08451543, 0.        , 0.        , 0.        ,
        0.07905694, 0.        , 0.1       , 0.        , 0.0745356 ,
        0.23717082, 0.0877058 , 0.2       , 0.1118034 , 0.        ,
        0.21764288, 0.        , 0.        , 0.43528575],
       [0.   

In [23]:
def page_rank(numpy_matrix):
    """Order the nodes of a similarity matrix by their PageRank score.

    The matrix is turned into a graph with ``nx.from_numpy_array`` and scored
    with ``nx.pagerank``.

    Args:
        numpy_matrix: square NumPy array of pairwise similarities.

    Returns:
        list of node indices, highest-scoring node first.
    """
    scores = nx.pagerank(nx.from_numpy_array(numpy_matrix))
    # Sorting the keys by their score is equivalent to sorting the
    # (index, score) items and discarding the score afterwards.
    return sorted(scores, key=scores.get, reverse=True)

In [24]:
# Sentence indices ordered from most to least important.
print(page_rank(matrix))

[0, 23, 2, 15, 20, 18, 17, 6, 16, 14, 5, 12, 10, 7, 22, 9, 3, 21, 13, 19, 11, 1, 4, 8]


In [25]:
def summarize(text, stopwords_set, percent_of_most_important_sentences):
    """Pick the most important sentences of ``text``.

    The text is split into sentences, each sentence is tokenized with
    ``process_and_tokenize``, a pairwise similarity matrix is built and the
    sentences are ranked with ``page_rank``.

    Args:
        text: the document to summarize.
        stopwords_set: tokens to ignore when comparing sentences.
        percent_of_most_important_sentences: share of sentences to keep, in
            percent. Values below 0 select nothing and values above 100
            select every sentence.

    Returns:
        list of sentence indices (positions in ``nltk.sent_tokenize(text)``),
        ordered from most to least important.
    """
    sent_tokenized = [process_and_tokenize(s, stopwords_set) for s in nltk.sent_tokenize(text)]
    sentence_count = len(sent_tokenized)
    if sentence_count == 0:
        # Nothing to rank.
        return []

    matrix = create_numpy_matrix(sent_tokenized)
    ranking = page_rank(matrix)

    # Multiply before dividing: 29 / 100 * 100 evaluates to 28.999999999999996,
    # which int() would truncate to 28 instead of 29.
    n = int(percent_of_most_important_sentences * sentence_count / 100)
    # Clamp so a negative percentage cannot become a negative slice bound
    # (ranking[0:-k] would silently return most of the ranking).
    n = max(0, min(n, sentence_count))
    return ranking[:n]

In [26]:
# Indices of the top-ranked 50% of the sentences.
ids = summarize(text, stop, 50)

In [27]:
# Print the article sentence by sentence, highlighting the sentences that were
# selected for the summary in yellow and the remaining ones in black.
selected = set(ids)  # set membership instead of scanning the list per sentence
for idx, sent in enumerate(nltk.sent_tokenize(text)):
    color = 'yellow' if idx in selected else 'black'
    print(colored(sent, color))

[33mPoland, officially the Republic of Poland, is a country in Central Europe.[0m
[30mIt is divided into 16 administrative provinces called voivodeships, covering an area of 313,931 km2 (121,209 sq mi).[0m
[33mPoland has a population of 38 million and is the fifth-most populous member state of the European Union.[0m
[30mWarsaw is the nation's capital and largest metropolis.[0m
[30mOther major cities include Kraków, Gdańsk, Wrocław, Katowice, Łódź, Poznań, Szczecin and Lublin.[0m
[33mPoland has a temperate transitional climate and its territory traverses the Central European Plain, extending from Baltic Sea in the north to Sudeten and Carpathian Mountains in the south.[0m
[33mThe longest Polish river is the Vistula, and Poland's highest point is Mount Rysy, situated in the Tatra mountain range of the Carpathians.[0m
[30mThe country is bordered by Lithuania and Russia to the northeast,[c] Belarus and Ukraine to the east, Slovakia and the Czech Republic to the south, and Ge