# Text summarization - Frequency based algorithm

# Preprocessing the texts

In [1]:
import re # relugar expression
import nltk # natural language toolkit
import string

In [2]:
# I added the word machine at the end of the last sentence
original_text = """Artificial intelligence is human like intelligence.
                   It is the study of intelligent artificial agents.
                   Science and engineering to produce intelligent machines.
                   Solve problems and have intelligence.
                   Related to intelligent behavior.
                   Developing of reasoning machines.
                   Learn from mistakes and successes.
                   Artificial intelligence is related to reasoning in everyday situations."""

In [3]:
original_text

'Artificial intelligence is human like intelligence.\n                   It is the study of intelligent artificial agents.\n                   Science and engineering to produce intelligent machines.\n                   Solve problems and have intelligence.\n                   Related to intelligent behavior.\n                   Developing of reasoning machines.\n                   Learn from mistakes and successes.\n                   Artificial intelligence is related to reasoning in everyday situations.'

In [4]:
original_text = re.sub(r'\s+', ' ', original_text)

In [5]:
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
len(stopwords)

179

In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
def preprocess(text):
  formatted_text = text.lower()
  tokens = []
  for token in nltk.word_tokenize(formatted_text):
    tokens.append(token)
  #print(tokens)
  tokens = [word for word in tokens if word not in stopwords and word not in string.punctuation]
  formatted_text = ' '.join(element for element in tokens)

  return formatted_text

In [12]:
formatted_text = preprocess(original_text)
formatted_text

'artificial intelligence human like intelligence study intelligent artificial agents science engineering produce intelligent machines solve problems intelligence related intelligent behavior developing reasoning machines learn mistakes successes artificial intelligence related reasoning everyday situations'

# Word frequency

In [13]:
word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
word_frequency

FreqDist({'intelligence': 4, 'artificial': 3, 'intelligent': 3, 'machines': 2, 'related': 2, 'reasoning': 2, 'human': 1, 'like': 1, 'study': 1, 'agents': 1, ...})

In [14]:
word_frequency['intelligence']

4

In [15]:
word_frequency.keys()

dict_keys(['artificial', 'intelligence', 'human', 'like', 'study', 'intelligent', 'agents', 'science', 'engineering', 'produce', 'machines', 'solve', 'problems', 'related', 'behavior', 'developing', 'reasoning', 'learn', 'mistakes', 'successes', 'everyday', 'situations'])

In [16]:
len(word_frequency.keys())

22

In [17]:
highest_frequency = max(word_frequency.values())
highest_frequency

4

In [18]:
for word in word_frequency.keys():
  #print(word)
  word_frequency[word] = (word_frequency[word] / highest_frequency)

In [19]:
word_frequency

FreqDist({'intelligence': 1.0, 'artificial': 0.75, 'intelligent': 0.75, 'machines': 0.5, 'related': 0.5, 'reasoning': 0.5, 'human': 0.25, 'like': 0.25, 'study': 0.25, 'agents': 0.25, ...})

# Sentence tokenization

In [20]:
'Phd John went home. He arrived early.'.split('.')

['Phd John went home', ' He arrived early', '']

In [21]:
'Ph.d John went home. He arrived early.'.split('.')

['Ph', 'd John went home', ' He arrived early', '']

In [22]:
nltk.sent_tokenize('Ph.d John went home. He arrived early.')

['Ph.d John went home.', 'He arrived early.']

In [23]:
sentence_list = nltk.sent_tokenize(original_text)
sentence_list

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

# Generate the summary (score for sentences)

In [24]:
word_frequency

FreqDist({'intelligence': 1.0, 'artificial': 0.75, 'intelligent': 0.75, 'machines': 0.5, 'related': 0.5, 'reasoning': 0.5, 'human': 0.25, 'like': 0.25, 'study': 0.25, 'agents': 0.25, ...})

In [25]:
score_sentences = {}
for sentence in sentence_list:
  #print(sentence)
  for word in nltk.word_tokenize(sentence.lower()):
    #print(word)
    if sentence not in score_sentences.keys():
      score_sentences[sentence] = word_frequency[word]
    else:
      score_sentences[sentence] += word_frequency[word]

In [26]:
score_sentences

{'Artificial intelligence is human like intelligence.': 3.25,
 'It is the study of intelligent artificial agents.': 2.0,
 'Science and engineering to produce intelligent machines.': 2.0,
 'Solve problems and have intelligence.': 1.5,
 'Related to intelligent behavior.': 1.5,
 'Developing of reasoning machines.': 1.25,
 'Learn from mistakes and successes.': 0.75,
 'Artificial intelligence is related to reasoning in everyday situations.': 3.25}

In [27]:
score_sentences['Solve problems and have intelligence.']

1.5

In [28]:
score_sentences.keys()

dict_keys(['Artificial intelligence is human like intelligence.', 'It is the study of intelligent artificial agents.', 'Science and engineering to produce intelligent machines.', 'Solve problems and have intelligence.', 'Related to intelligent behavior.', 'Developing of reasoning machines.', 'Learn from mistakes and successes.', 'Artificial intelligence is related to reasoning in everyday situations.'])

In [29]:
import heapq
best_sentences = heapq.nlargest(3, score_sentences, key = score_sentences.get)

In [30]:
best_sentences

['Artificial intelligence is human like intelligence.',
 'Artificial intelligence is related to reasoning in everyday situations.',
 'It is the study of intelligent artificial agents.']

In [31]:
summary = ' '.join(best_sentences)
summary

'Artificial intelligence is human like intelligence. Artificial intelligence is related to reasoning in everyday situations. It is the study of intelligent artificial agents.'

In [32]:
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

# Visualizing the summary in HTML

In [33]:
from IPython.core.display import HTML

In [34]:
text = ''
display(HTML(f'<h2>Summary</h2>'))
for sentence in sentence_list:
  #print(sentence)
  #text += sentence
  if sentence in best_sentences:
    text += ' ' + sentence.replace(sentence, f"<mark>{sentence}</mark>")
  else:
    text += ' ' + sentence

display(HTML(f"""{text}"""))

# Extracting texts from the Internet

In [35]:
!pip install goose3

Collecting goose3
  Downloading goose3-3.1.17-py3-none-any.whl (88 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/88.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m61.4/88.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.7/88.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect (from goose3)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting langdetect (from goose3)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyahocorasick (from goose3)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━

In [36]:
from goose3 import Goose

In [37]:
g = Goose()
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
article = g.extract(url)

In [38]:
article.infos

{'meta': {'description': '',
  'lang': 'en',
  'keywords': '',
  'favicon': '/static/apple-touch/wikipedia.png',
  'canonical': 'https://en.wikipedia.org/wiki/Automatic_summarization',
  'encoding': 'UTF-8'},
 'image': None,
 'domain': 'en.wikipedia.org',
 'title': 'Automatic summarization - Wikipedia',
 'cleaned_text': 'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content. Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.\n\nText summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.[1] On the other hand, visual content can be summarized using computer vision algorithms. Image summarization is the subject of ongoing research; existing approaches typically attempt to dis

In [39]:
article.title

'Automatic summarization - Wikipedia'

In [40]:
article.cleaned_text

'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content. Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.\n\nText summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.[1] On the other hand, visual content can be summarized using computer vision algorithms. Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection.[2][3][4] Video summarization algorithms identify and extract from the original video content the most important frames (key-frames), and/or the most important vi

In [41]:
len(article.cleaned_text)

35232

In [42]:
formatted_article = preprocess(article.cleaned_text)
formatted_article

"automatic summarization process shortening set data computationally create subset summary represents important relevant information within original content artificial intelligence algorithms commonly developed employed achieve specialized different types data text summarization usually implemented natural language processing methods designed locate informative sentences given document 1 hand visual content summarized using computer vision algorithms image summarization subject ongoing research existing approaches typically attempt display representative images given image collection generate video includes important content entire collection 2 3 4 video summarization algorithms identify extract original video content important frames key-frames and/or important video segments key-shots normally temporally ordered fashion 5 6 7 8 video summaries simply retain carefully selected subset original video frames therefore identical output video synopsis algorithms new video frames synthesize

In [43]:
len(formatted_article)

26932

In [44]:
def summarize(text, number_of_sentences, percentage = 0):
  original_text = text
  formatted_text = preprocess(original_text)

  word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
  highest_frequency = max(word_frequency.values())
  for word in word_frequency.keys():
    word_frequency[word] = (word_frequency[word] / highest_frequency)
  sentence_list = nltk.sent_tokenize(original_text)

  score_sentences = {}
  for sentence in sentence_list:
    for word in nltk.word_tokenize(sentence):
      if word in word_frequency.keys():
        if sentence not in score_sentences.keys():
          score_sentences[sentence] = word_frequency[word]
        else:
          score_sentences[sentence] += word_frequency[word]

  import heapq
  if percentage > 0:
    best_sentences = heapq.nlargest(int(len(sentence_list) * percentage), score_sentences, key=score_sentences.get)
  else:
    best_sentences = heapq.nlargest(number_of_sentences, score_sentences, key=score_sentences.get)

  return sentence_list, best_sentences, word_frequency, score_sentences

In [45]:
len(sentence_list)

8

In [46]:
(120 / len(sentence_list)) * 100

1500.0

In [47]:
sentence_list, best_sentences, word_frequency, score_sentences = summarize(article.cleaned_text, 100)

In [48]:
sentence_list

['Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.',
 'Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.',
 'Text summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.',
 '[1] On the other hand, visual content can be summarized using computer vision algorithms.',
 'Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection.',
 '[2][3][4] Video summarization algorithms identify and extract from the original video content the most important frames (key-frames), and/or t

In [49]:
best_sentences

['For example, in a text about machine learning, the unigram "learning" might co-occur with "machine", "supervised", "un-supervised", and "semi-supervised" in four different sentences.',
 'For example, if we rank unigrams and find that "advanced", "natural", "language", and "processing" all get high ranks, then we would look at the original text and see that these words appear consecutively and create a final keyphrase using all four together.',
 'Consider the example text from a news article:\n\nA keyphrase extractor might select "Army Corps of Engineers", "President Bush", "New Orleans", and "defective flood-control pumps" as keyphrases.',
 'The main difficulty in supervised extractive summarization is that the known summaries must be manually created by extracting sentences so the sentences in an original training document can be labeled as "in summary" or "not in summary".',
 'Similarly, if the text contains the phrase "supervised classification", then there would be an edge betwee

In [50]:
word_frequency

FreqDist({'``': 1.0, 'summarization': 0.7222222222222222, 'text': 0.4444444444444444, 'keyphrases': 0.2962962962962963, 'sentences': 0.28703703703703703, 'summary': 0.26851851851851855, 'summaries': 0.25, 'document': 0.24074074074074073, 'submodular': 0.23148148148148148, 'extraction': 0.2037037037037037, ...})

In [51]:
score_sentences

{'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.': 2.148148148148148,
 'Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.': 0.5092592592592592,
 'Text summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.': 1.7777777777777781,
 '[1] On the other hand, visual content can be summarized using computer vision algorithms.': 0.5925925925925926,
 'Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection.': 1.8703703703703707,
 '[2][3][4] Video summarization algorithms id

In [52]:
def visualize(title, sentence_list, best_sentences):
  from IPython.core.display import HTML
  text = ''

  display(HTML(f'<h1>Summary - {title}</h1>'))
  for sentence in sentence_list:
    if sentence in best_sentences:
      text += ' ' + str(sentence).replace(sentence, f"<mark>{sentence}</mark>")
    else:
      text += ' ' + sentence
  display(HTML(f""" {text} """))

In [53]:
visualize(article.title, sentence_list, best_sentences)

# Summarizing multiple texts

In [54]:
article_list = ['https://en.wikipedia.org/wiki/Automatic_summarization',
                'https://en.wikipedia.org/wiki/Natural_language_processing',
                'https://en.wikipedia.org/wiki/Lemmatisation']

In [55]:
for url in article_list:
  #print(article)
  g = Goose()
  article = g.extract(url)
  sentence_list, best_sentences, _, _ = summarize(article.cleaned_text, 100, percentage=0.5)
  #print(len(sentence_list), len(best_sentences))
  visualize(article.title, sentence_list, best_sentences)