# Text summarization - Cosine similarity

# Preparing the environment

In [1]:
import re
import nltk
import string
import numpy as np
import networkx as nx
from nltk.cluster.util import cosine_distance

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
def preprocess(text):
  formatted_text = text.lower()
  tokens = []
  for token in nltk.word_tokenize(formatted_text):
    tokens.append(token)
  tokens = [word for word in tokens if word not in stopwords and word not in string.punctuation]
  formatted_text = ' '.join(element for element in tokens)

  return formatted_text

In [5]:
original_text = """Artificial intelligence is human like intelligence. 
                   It is the study of intelligent artificial agents. 
                   Science and engineering to produce intelligent machines. 
                   Solve problems and have intelligence. 
                   Related to intelligent behavior. 
                   Developing of reasoning machines. 
                   Learn from mistakes and successes. 
                   Artificial intelligence is related to reasoning in everyday situations."""
original_text = re.sub(r'\s+', ' ', original_text)
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

# Function to calculate similarity between sentences

- Link: https://en.wikipedia.org/wiki/Cosine_similarity
- Step by step calculations: https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/

In [6]:
original_sentences = [sentence for sentence in nltk.sent_tokenize(original_text)]
original_sentences

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

In [7]:
formatted_sentences = [preprocess(original_sentence) for original_sentence in original_sentences]
formatted_sentences

['artificial intelligence human like intelligence',
 'study intelligent artificial agents',
 'science engineering produce intelligent machines',
 'solve problems intelligence',
 'related intelligent behavior',
 'developing reasoning machines',
 'learn mistakes successes',
 'artificial intelligence related reasoning everyday situations']

In [8]:
def calculate_sentence_similarity(sentence1, sentence2):
  words1 = [word for word in nltk.word_tokenize(sentence1)]
  words2 = [word for word in nltk.word_tokenize(sentence2)]
  #print(words1)
  #print(words2)

  all_words = list(set(words1 + words2))
  #print(all_words)

  vector1 = [0] * len(all_words)
  vector2 = [0] * len(all_words)
  #print(vector1)
  #print(vector2)

  for word in words1: # Bag of words
    #print(word)
    vector1[all_words.index(word)] += 1
  for word in words2:
    vector2[all_words.index(word)] += 1
  
  #print(vector1)
  #print(vector2)

  return 1 - cosine_distance(vector1, vector2)

In [9]:
calculate_sentence_similarity(formatted_sentences[0], formatted_sentences[1])

0.18898223650461365

In [10]:
test = ['human', 'study', 'intelligence', 'agents', 'intelligent', 'artificial', 'like']
test.index('agents')

3

# Function to create the similarity matrix

In [11]:
# The higher the value, the greater the similarity between the sentences
# The more words in common, the greater the similarity

In [12]:
def calculate_similarity_matrix(sentences):
  similarity_matrix = np.zeros((len(sentences), len(sentences)))
  #print(similarity_matrix)
  for i in range(len(sentences)):
    for j in range(len(sentences)):
      if i == j:
        continue
      similarity_matrix[i][j] = calculate_sentence_similarity(sentences[i], sentences[j])
  return similarity_matrix

In [13]:
calculate_similarity_matrix(formatted_sentences)

array([[0.        , 0.18898224, 0.        , 0.43643578, 0.        ,
        0.        , 0.        , 0.46291005],
       [0.18898224, 0.        , 0.2236068 , 0.        , 0.28867513,
        0.        , 0.        , 0.20412415],
       [0.        , 0.2236068 , 0.        , 0.        , 0.25819889,
        0.25819889, 0.        , 0.        ],
       [0.43643578, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.23570226],
       [0.        , 0.28867513, 0.25819889, 0.        , 0.        ,
        0.        , 0.        , 0.23570226],
       [0.        , 0.        , 0.25819889, 0.        , 0.        ,
        0.        , 0.        , 0.23570226],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.46291005, 0.20412415, 0.        , 0.23570226, 0.23570226,
        0.23570226, 0.        , 0.        ]])

# Function to summarize the texts

- Pagerank algorithm: https://en.wikipedia.org/wiki/PageRank


In [14]:
for i, score in enumerate(original_sentences):
  print(i, score)

0 Artificial intelligence is human like intelligence.
1 It is the study of intelligent artificial agents.
2 Science and engineering to produce intelligent machines.
3 Solve problems and have intelligence.
4 Related to intelligent behavior.
5 Developing of reasoning machines.
6 Learn from mistakes and successes.
7 Artificial intelligence is related to reasoning in everyday situations.


In [15]:
def summarize(text, number_of_sentences, percentage = 0):
  original_sentences = [sentence for sentence in nltk.sent_tokenize(text)]
  formatted_sentences = [preprocess(original_sentence) for original_sentence in original_sentences]
  similarity_matrix = calculate_similarity_matrix(formatted_sentences)
  #print(similarity_matrix)

  similarity_graph = nx.from_numpy_array(similarity_matrix)
  #print(similarity_graph.nodes)
  #print(similarity_graph.edges)

  scores = nx.pagerank(similarity_graph)
  #print(scores)
  ordered_scores = sorted(((scores[i], score) for i, score in enumerate(original_sentences)), reverse=True)
  #print(ordered_scores)

  if percentage > 0:
    number_of_sentences = int(len(formatted_sentences) * percentage)

  best_sentences = []
  for sentence in range(number_of_sentences):
    best_sentences.append(ordered_scores[sentence][1])
  
  return original_sentences, best_sentences, ordered_scores

In [16]:
original_sentences, best_sentences, scores = summarize(original_text, 3)

In [17]:
original_sentences

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

In [18]:
best_sentences

['Artificial intelligence is related to reasoning in everyday situations.',
 'Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.']

In [19]:
scores

[(0.21117425706958176,
  'Artificial intelligence is related to reasoning in everyday situations.'),
 (0.1673227400426021, 'Artificial intelligence is human like intelligence.'),
 (0.1455558879715914, 'It is the study of intelligent artificial agents.'),
 (0.12904008009949403, 'Related to intelligent behavior.'),
 (0.1275338306533556,
  'Science and engineering to produce intelligent machines.'),
 (0.10880185703391371, 'Solve problems and have intelligence.'),
 (0.08959232615044063, 'Developing of reasoning machines.'),
 (0.020979020979020983, 'Learn from mistakes and successes.')]

In [20]:
from IPython.core.display import HTML
def visualize(title, sentence_list, best_sentences):
  text = ''

  display(HTML(f'<h1>Summary - {title}</h1>'))
  for sentence in sentence_list:
    if sentence in best_sentences:
      text += ' ' + str(sentence).replace(sentence, f"<mark>{sentence}</mark>")
    else:
      text += ' ' + sentence
  display(HTML(f""" {text} """))

In [21]:
visualize('Artificial intelligence', original_sentences, best_sentences)

# Extracting texts from the Internet

In [22]:
!pip install goose3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting goose3
  Downloading goose3-3.1.14-py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setu

In [23]:
from goose3 import Goose
g = Goose()
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
article = g.extract(url)

In [24]:
article.cleaned_text

'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content. Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.\n\nText summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.[1] On the other hand, visual content can be summarized using computer vision algorithms. Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection.[2][3][4] Video summarization algorithms identify and extract from the original video content the most important frames (key-frames), and/or the most important vi

In [25]:
original_sentences, best_sentences, scores = summarize(article.cleaned_text, 120, 0.2)

In [26]:
(120 / len(original_sentences)) * 100

39.8671096345515

In [27]:
original_sentences

['Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.',
 'Artificial intelligence algorithms are commonly developed and employed to achieve this, specialized for different types of data.',
 'Text summarization is usually implemented by natural language processing methods, designed to locate the most informative sentences in a given document.',
 '[1] On the other hand, visual content can be summarized using computer vision algorithms.',
 'Image summarization is the subject of ongoing research; existing approaches typically attempt to display the most representative images from a given image collection, or generate a video that only includes the most important content from the entire collection.',
 '[2][3][4] Video summarization algorithms identify and extract from the original video content the most important frames (key-frames), and/or t

In [28]:
best_sentences

['The main difficulty in supervised extractive summarization is that the known summaries must be manually created by extracting sentences so the sentences in an original training document can be labeled as "in summary" or "not in summary".',
 '"Summarizing Conceptual Graphs for Automatic Summarization Task".',
 'Some unsupervised summarization approaches are based on finding a "centroid" sentence, which is the mean word vector of all the sentences in the document.',
 'Instead of trying to learn explicit features that characterize keyphrases, the TextRank algorithm[16] exploits the structure of the text itself to determine keyphrases that appear "central" to the text in the same way that PageRank selects important Web pages.',
 'For example, if we rank unigrams and find that "advanced", "natural", "language", and "processing" all get high ranks, then we would look at the original text and see that these words appear consecutively and create a final keyphrase using all four together.',
 

In [29]:
scores

[(0.008489637771710657,
  'The main difficulty in supervised extractive summarization is that the known summaries must be manually created by extracting sentences so the sentences in an original training document can be labeled as "in summary" or "not in summary".'),
 (0.007366461488311927,
  '"Summarizing Conceptual Graphs for Automatic Summarization Task".'),
 (0.007253865726120242,
  'Some unsupervised summarization approaches are based on finding a "centroid" sentence, which is the mean word vector of all the sentences in the document.'),
 (0.006892648140781568,
  'Instead of trying to learn explicit features that characterize keyphrases, the TextRank algorithm[16] exploits the structure of the text itself to determine keyphrases that appear "central" to the text in the same way that PageRank selects important Web pages.'),
 (0.006847047195300363,
  'For example, if we rank unigrams and find that "advanced", "natural", "language", and "processing" all get high ranks, then we would 

In [30]:
visualize(article.title, original_sentences, best_sentences)