In [152]:
#Steps to create a summarizer:
"""
1. Read the text
2. Tokenize the text
3. Remove stop words
4. Create a frequency table
5. Tokenize the sentences
6. Score the sentences
7. Build the summary
8. Output the summary

"""

import spacy



f = open("text.txt","r")
text = f.read()
f.close()

print(text)

The dog wants to breathe underwater so it will ask the fish how it can get gills.
The fish tells the dog that it cannot grow gills but it can use a scuba mask, scuba masks are good.


In [153]:
#Tokenize the text

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

# Read the text from the file
with open("text.txt", "r") as file:
        text = file.read()

# Process the text
doc = nlp(text)

# Tokenize the text
tokens = {}
for i,token in enumerate(doc):
        tokens[i] = token.text
    

print(tokens)

{0: 'The', 1: 'dog', 2: 'wants', 3: 'to', 4: 'breathe', 5: 'underwater', 6: 'so', 7: 'it', 8: 'will', 9: 'ask', 10: 'the', 11: 'fish', 12: 'how', 13: 'it', 14: 'can', 15: 'get', 16: 'gills', 17: '.', 18: '\n', 19: 'The', 20: 'fish', 21: 'tells', 22: 'the', 23: 'dog', 24: 'that', 25: 'it', 26: 'can', 27: 'not', 28: 'grow', 29: 'gills', 30: 'but', 31: 'it', 32: 'can', 33: 'use', 34: 'a', 35: 'scuba', 36: 'mask', 37: ',', 38: 'scuba', 39: 'masks', 40: 'are', 41: 'good', 42: '.'}


In [154]:
#Remove stop words and punctuation
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = []
punctuation = []

for i,token in enumerate(doc):
    if token.is_stop :
        #show the stop words
        stop_words.append(token.text)
        del tokens[i]
    elif token.is_punct:
        punctuation.append(token.text)
        del tokens[i]
print(stop_words)
print(punctuation)

print(tokens)






['The', 'to', 'so', 'it', 'will', 'the', 'how', 'it', 'can', 'get', 'The', 'the', 'that', 'it', 'can', 'not', 'but', 'it', 'can', 'a', 'are']
['.', ',', '.']
{1: 'dog', 2: 'wants', 4: 'breathe', 5: 'underwater', 9: 'ask', 11: 'fish', 16: 'gills', 18: '\n', 20: 'fish', 21: 'tells', 23: 'dog', 28: 'grow', 29: 'gills', 33: 'use', 35: 'scuba', 36: 'mask', 38: 'scuba', 39: 'masks', 41: 'good'}


In [155]:
#Create a frequency table
word_freq = {}

for word in tokens.values():
    if word not in word_freq:
        word_freq[word] = 1
    else:
        word_freq[word] += 1
print(word_freq)

max_freq = max(word_freq.values())

for word in word_freq.keys():
    word_freq[word] = word_freq[word]/max_freq
print(word_freq)

{'dog': 2, 'wants': 1, 'breathe': 1, 'underwater': 1, 'ask': 1, 'fish': 2, 'gills': 2, '\n': 1, 'tells': 1, 'grow': 1, 'use': 1, 'scuba': 2, 'mask': 1, 'masks': 1, 'good': 1}
{'dog': 1.0, 'wants': 0.5, 'breathe': 0.5, 'underwater': 0.5, 'ask': 0.5, 'fish': 1.0, 'gills': 1.0, '\n': 0.5, 'tells': 0.5, 'grow': 0.5, 'use': 0.5, 'scuba': 1.0, 'mask': 0.5, 'masks': 0.5, 'good': 0.5}


In [156]:
#Tokenize the sentences

sentences = []

for i,token in enumerate(doc.sents):
    sentences.append(token.text)
print(sentences)

['The dog wants to breathe underwater so it will ask the fish how it can get gills.\n', 'The fish tells the dog that it cannot grow gills but it can use a scuba mask, scuba masks are good.']


In [157]:
#Score the sentences

sentence_scores = {}
for sent in sentences:
    words = sent.split()
    for word in sent:
        if word.lower() in word_freq.keys():
            if sent not in sentence_scores.keys():
                sentence_scores[sent] = word_freq[word.lower()]
            else:
                sentence_scores[sent] += word_freq[word.lower()]
print(sentence_scores)

{'The dog wants to breathe underwater so it will ask the fish how it can get gills.\n': 0.5}
