In [1]:
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

In [2]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

def _score_sentences(tf_idf_matrix) -> dict:
    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

def _find_average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [3]:
filename="001.txt"
f = open((filename), "r")
text=f.read()
f.close()

In [4]:
text

'Success from two leading coronavirus vaccine programs likely means other frontrunners will also show strong protection against COVID-19, Bill Gates said Tuesday.\n\nThe fact that two coronavirus vaccines recently showed strong protection against COVID-19 bodes well for other leading programs led by AstraZeneca, Novavax, and Johnson & Johnson, Bill Gates said Tuesday.The billionaire Microsoft founder and philanthropist said it will be easier to boost manufacturing and distribute these other shots to the entire world, particularly developing nations.The vaccine space has seen a flurry of good news in recent days, marked by overwhelming success in late-stage trials by both Pfizer and Moderna. The studies showed both vaccines provided strong protection against the virus compared to a placebo. "With the very good news from Pfizer and Moderna, we think it\'s now likely that AstraZeneca, Novavax, and Johnson & Johnson will also likely show very strong efficacy," Gates told journalist Andrew 

# 1 Sentence Tokenize

In [5]:
sentences = sent_tokenize(text)
total_documents = len(sentences)

# 2 Create the Frequency matrix of the words in each sentence.

In [6]:
freq_matrix = _create_frequency_matrix(sentences)
print(freq_matrix)

{'Success from tw': {'success': 1, 'two': 1, 'lead': 1, 'coronaviru': 1, 'vaccin': 1, 'program': 1, 'like': 1, 'mean': 1, 'frontrunn': 1, 'also': 1, 'show': 1, 'strong': 1, 'protect': 1, 'covid-19': 1, ',': 1, 'bill': 1, 'gate': 1, 'said': 1, 'tuesday': 1, '.': 1}, 'The fact that t': {'fact': 1, 'two': 1, 'coronaviru': 1, 'vaccin': 2, 'recent': 2, 'show': 1, 'strong': 1, 'protect': 1, 'covid-19': 1, 'bode': 1, 'well': 1, 'lead': 1, 'program': 1, 'led': 1, 'astrazeneca': 1, ',': 5, 'novavax': 1, 'johnson': 2, '&': 1, 'bill': 1, 'gate': 1, 'said': 2, 'tuesday.th': 1, 'billionair': 1, 'microsoft': 1, 'founder': 1, 'philanthropist': 1, 'easier': 1, 'boost': 1, 'manufactur': 1, 'distribut': 1, 'shot': 1, 'entir': 1, 'world': 1, 'particularli': 1, 'develop': 1, 'nations.th': 1, 'space': 1, 'ha': 1, 'seen': 1, 'flurri': 1, 'good': 1, 'news': 1, 'day': 1, 'mark': 1, 'overwhelm': 1, 'success': 1, 'late-stag': 1, 'trial': 1, 'pfizer': 1, 'moderna': 1, '.': 1}, 'The studies sho': {'studi': 1, 'sh

# 3 Calculate TermFrequency and generate a matrix

In [7]:
tf_matrix = _create_tf_matrix(freq_matrix)
print(tf_matrix)

{'Success from tw': {'success': 0.05, 'two': 0.05, 'lead': 0.05, 'coronaviru': 0.05, 'vaccin': 0.05, 'program': 0.05, 'like': 0.05, 'mean': 0.05, 'frontrunn': 0.05, 'also': 0.05, 'show': 0.05, 'strong': 0.05, 'protect': 0.05, 'covid-19': 0.05, ',': 0.05, 'bill': 0.05, 'gate': 0.05, 'said': 0.05, 'tuesday': 0.05, '.': 0.05}, 'The fact that t': {'fact': 0.019230769230769232, 'two': 0.019230769230769232, 'coronaviru': 0.019230769230769232, 'vaccin': 0.038461538461538464, 'recent': 0.038461538461538464, 'show': 0.019230769230769232, 'strong': 0.019230769230769232, 'protect': 0.019230769230769232, 'covid-19': 0.019230769230769232, 'bode': 0.019230769230769232, 'well': 0.019230769230769232, 'lead': 0.019230769230769232, 'program': 0.019230769230769232, 'led': 0.019230769230769232, 'astrazeneca': 0.019230769230769232, ',': 0.09615384615384616, 'novavax': 0.019230769230769232, 'johnson': 0.038461538461538464, '&': 0.019230769230769232, 'bill': 0.019230769230769232, 'gate': 0.019230769230769232

# 4 Creating table for documents per words

In [8]:
count_doc_per_words = _create_documents_per_words(freq_matrix)
print(count_doc_per_words)

{'success': 3, 'two': 2, 'lead': 3, 'coronaviru': 3, 'vaccin': 6, 'program': 2, 'like': 2, 'mean': 1, 'frontrunn': 1, 'also': 2, 'show': 5, 'strong': 4, 'protect': 4, 'covid-19': 3, ',': 6, 'bill': 2, 'gate': 5, 'said': 2, 'tuesday': 1, '.': 10, 'fact': 1, 'recent': 1, 'bode': 1, 'well': 1, 'led': 1, 'astrazeneca': 2, 'novavax': 2, 'johnson': 2, '&': 2, 'tuesday.th': 1, 'billionair': 1, 'microsoft': 1, 'founder': 1, 'philanthropist': 1, 'easier': 1, 'boost': 1, 'manufactur': 2, 'distribut': 2, 'shot': 3, 'entir': 1, 'world': 2, 'particularli': 1, 'develop': 1, 'nations.th': 1, 'space': 1, 'ha': 2, 'seen': 1, 'flurri': 1, 'good': 2, 'news': 2, 'day': 1, 'mark': 1, 'overwhelm': 1, 'late-stag': 2, 'trial': 3, 'pfizer': 2, 'moderna': 2, 'studi': 1, 'provid': 1, 'viru': 1, 'compar': 1, 'placebo': 2, '``': 1, 'veri': 1, 'think': 1, "'s": 2, 'efficaci': 1, "''": 1, 'told': 1, 'journalist': 1, 'andrew': 1, 'ross': 1, 'sorkin': 1, "n't": 1, 'delv': 1, 'scientif': 2, 'rational': 1, 'behind': 1, 

# 5 Calculate IDF and generate a matrix

In [9]:
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
print(idf_matrix)

{'Success from tw': {'success': 0.5228787452803376, 'two': 0.6989700043360189, 'lead': 0.5228787452803376, 'coronaviru': 0.5228787452803376, 'vaccin': 0.2218487496163564, 'program': 0.6989700043360189, 'like': 0.6989700043360189, 'mean': 1.0, 'frontrunn': 1.0, 'also': 0.6989700043360189, 'show': 0.3010299956639812, 'strong': 0.3979400086720376, 'protect': 0.3979400086720376, 'covid-19': 0.5228787452803376, ',': 0.2218487496163564, 'bill': 0.6989700043360189, 'gate': 0.3010299956639812, 'said': 0.6989700043360189, 'tuesday': 1.0, '.': 0.0}, 'The fact that t': {'fact': 1.0, 'two': 0.6989700043360189, 'coronaviru': 0.5228787452803376, 'vaccin': 0.2218487496163564, 'recent': 1.0, 'show': 0.3010299956639812, 'strong': 0.3979400086720376, 'protect': 0.3979400086720376, 'covid-19': 0.5228787452803376, 'bode': 1.0, 'well': 1.0, 'lead': 0.5228787452803376, 'program': 0.6989700043360189, 'led': 1.0, 'astrazeneca': 0.6989700043360189, ',': 0.2218487496163564, 'novavax': 0.6989700043360189, 'johns

# 6 Calculate TF-IDF and generate a matrix

In [10]:
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'Success from tw': {'success': 0.026143937264016884, 'two': 0.034948500216800946, 'lead': 0.026143937264016884, 'coronaviru': 0.026143937264016884, 'vaccin': 0.01109243748081782, 'program': 0.034948500216800946, 'like': 0.034948500216800946, 'mean': 0.05, 'frontrunn': 0.05, 'also': 0.034948500216800946, 'show': 0.01505149978319906, 'strong': 0.01989700043360188, 'protect': 0.01989700043360188, 'covid-19': 0.026143937264016884, ',': 0.01109243748081782, 'bill': 0.034948500216800946, 'gate': 0.01505149978319906, 'said': 0.034948500216800946, 'tuesday': 0.05, '.': 0.0}, 'The fact that t': {'fact': 0.019230769230769232, 'two': 0.013441730852615748, 'coronaviru': 0.010055360486160339, 'vaccin': 0.008532644216013708, 'recent': 0.038461538461538464, 'show': 0.005789038378153485, 'strong': 0.007652692474462262, 'protect': 0.007652692474462262, 'covid-19': 0.010055360486160339, 'bode': 0.019230769230769232, 'well': 0.019230769230769232, 'lead': 0.010055360486160339, 'program': 0.01344173085261

# 7 Important Algorithm: score the sentences

In [11]:
sentence_scores = _score_sentences(tf_idf_matrix)
print(sentence_scores)

{'Success from tw': 0.027817431287605544, 'The fact that t': 0.015668854959339104, 'The studies sho': 0.06017728766960432, '"With the very ': 0.032505924637327904, 'While Gates did': 0.06048431212790743, 'All the leading': 0.07366143511744506, 'Early-stage cli': 0.03847296200569854, 'But the only wa': 0.04388265237646744, 'The scientific ': 0.05184885362045636, 'Gates noted tha': 0.05409862428147178}


# 8 Find the threshold

In [12]:
threshold = _find_average_score(sentence_scores)
print(threshold)

0.04586183380833235


# 9 Generate the summary

In [13]:
summary = _generate_summary(sentences, sentence_scores, 1 * threshold)
print(summary)

 The studies showed both vaccines provided strong protection against the virus compared to a placebo. While Gates didn't delve into the scientific rationale behind that prediction, many scientists hold the same hope. All the leading vaccine candidates target the same part of the coronavirus in the spike protein. The scientific success has turned the top challenges surrounding a COVID-19 vaccine to the manufacturing and distribution front. Gates noted that the world will be supply constrained for 2021, but these additional vaccines will prove valuable on that front.
