In [36]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


def _create_frequency_table(text_string) -> dict:
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text_string)
    ps = PorterStemmer()
    freq_table = dict()
    for word in words:
        word = ps.stem(word)
        if word in stop_words:
            continue
        if word in freq_table:
            freq_table[word] += 1
        else:
            freq_table[word] = 1
    return freq_table


def _score_sentences(sentences, freq_table) -> dict:
    sentence_value = dict()
    for sentence in sentences:
        word_count_in_sentence = (len(word_tokenize(sentence)))
        for word_value in freq_table:
            if word_value in sentence.lower():
                if sentence[:10] in sentence_value:
                    sentence_value[sentence[:10]] += freq_table[word_value]
                else:
                    sentence_value[sentence[:10]] = freq_table[word_value]
        sentence_value[sentence[:10]] = sentence_value[sentence[:10]] // word_count_in_sentence
    return sentence_value


def _find_average_score(sentence_value) -> int:
    sum_values = 0
    for entry in sentence_value:
        sum_values += sentence_value[entry]
    # average value of a sentence from original text
    return int(sum_values / len(sentence_value))


def _generate_summary(sentences, sentence_value, threshold):
    sentence_count = 0
    summary_ = ''
    for sentence in sentences:
        if sentence[:10] in sentence_value and sentence_value[sentence[:10]] > threshold:
            summary_ += ' ' + sentence
            sentence_count += 1
    return summary_


def run_summarization(text):
    freq_table = _create_frequency_table(text)
    sentences = sent_tokenize(text)
    sentence_scores = _score_sentences(sentences, freq_table)
    threshold = _find_average_score(sentence_scores)
    return _generate_summary(sentences, sentence_scores, 1.5 * threshold)


original_paragraph = '''At first glance the idea seems perfectly plausible. Conveying
even simple messages requires that you make completely different observations depending
on your language. Imagine being asked to count some pens on a table. As an English
speaker, you only have to count them and give the number. But a Russian may need to
consider the gender and a Japanese speaker has to take into account their shape (long
and cylindrical) as well, and use the number word designated for items of that shape.
On the other hand, surely pens are just pens, no matter what your language compels you
to specify about them? Little linguistic peculiarities, though amusing, don’t change the
objective world we are describing. So how can they alter the way we think?'''

summary = run_summarization(original_paragraph).strip()

print(len(original_paragraph.split()), len(summary.split()), '\n')
print(summary)

125 60 

At first glance the idea seems perfectly plausible. Conveying
even simple messages requires that you make completely different observations depending
on your language. Imagine being asked to count some pens on a table. As an English
speaker, you only have to count them and give the number. Little linguistic peculiarities, though amusing, don’t change the
objective world we are describing.
