### Importing required packages

In [42]:
# Load Pkgs
import spacy

In [43]:
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

#### Loading the stop words

In [44]:
# Build a List of Stopwords
stopwords = list(STOP_WORDS)

#### Coronavirus document

In [45]:
doc="""Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus.Most people infected with the COVID-19 virus will experience mild to moderate respiratory illness and recover without requiring special treatment.
Older people, and those with underlying medical problems like cardiovascular disease, diabetes, chronic respiratory disease, and cancer are more likely to develop serious illness.
The best way to prevent and slow down transmission is be well informed about the COVID-19 virus, the disease it causes and how it spreads. Protect yourself and others from infection by washing your hands or using an alcohol based rub frequently and not touching your face.
The COVID-19 virus spreads primarily through droplets of saliva or discharge from the nose when an infected person coughs or sneezes, so it’s important that you also practice respiratory etiquette (for example, by coughing into a flexed elbow).
At this time, there are no specific vaccines or treatments for COVID-19. However, there are many ongoing clinical trials evaluating potential treatments. WHO will continue to provide updated information as soon as clinical findings become available."""

In [46]:
nlp = spacy.load('en')

##### Calling the nlp object on a string of text will return a processed Doc

In [47]:
docx = nlp(doc)

In [48]:
# Tokenization of Text
mytokens=[]
for token in docx:
    mytokens.append(token.text)

In [49]:
word_frequencies = {}
for word in docx:
    if word.text not in stopwords:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1

In [50]:
# Maximum Word Frequency
maximum_frequency = max(word_frequencies.values())

In [51]:
for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequency)

In [52]:
# Sentence Tokens
sentence_list = [ sentence for sentence in docx.sents ]


In [53]:
for t in sentence_list:
    for w in t:
        w.text.lower()    

In [54]:
# Sentence Score via comparrng each word with sentence
sentence_scores = {}  
for sent in sentence_list:  
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

In [55]:
# Sentence Score Table
sentence_scores

{Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus.: 3.2222222222222223,
 Most people infected with the COVID-19 virus will experience mild to moderate respiratory illness and recover without requiring special treatment.: 3.5555555555555554,
 Older people, and those with underlying medical problems like cardiovascular disease, diabetes, chronic respiratory disease, and cancer are more likely to develop serious illness.: 8.444444444444443,
 The best way to prevent and slow down transmission is be well informed about the COVID-19 virus, the disease it causes and how it spreads.: 3.888888888888889,
 Protect yourself and others from infection by washing your hands or using an alcohol based rub frequently and not touching your face.: 2.4444444444444446,
 At this time, there are no specific vaccines or treatments for COVID-19.: 2.555555555555556,
 However, there are many ongoing clinical trials evaluating potential treatments.: 2.888888888888889

In [56]:
# Import Heapq 
from heapq import nlargest

In [57]:
summarized_sentences = nlargest(5, sentence_scores, key=sentence_scores.get)

In [58]:
l=[]
for w in summarized_sentences:
    l.append(w.text)

In [59]:
summary = ' '.join(l)

In [60]:
summary

'Older people, and those with underlying medical problems like cardiovascular disease, diabetes, chronic respiratory disease, and cancer are more likely to develop serious illness.\n The best way to prevent and slow down transmission is be well informed about the COVID-19 virus, the disease it causes and how it spreads. Most people infected with the COVID-19 virus will experience mild to moderate respiratory illness and recover without requiring special treatment.\n Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus. However, there are many ongoing clinical trials evaluating potential treatments.'

In [61]:
len(summary)

646

In [62]:
len(doc)

1191