In [1]:
# Text Summarization Based on a Threshold Value

# Step 1: Get the corpus data and sentence-tokenize it
#    ex : sentences = ['woman education catch term refers state tertiary health education girl woman',
#                      'million girl school globe majority developing country'....etc]

# Step 2: Calculate the TF-IDF information for each sentence

# Step 3: Calculate the score for each individual sentence: sum of the TF-IDF values of the words in the sentence / total number of words in the sentence

# Step 4: Find the threshold = sum of the sentence scores / total number of sentences

# Step 5: Select and concatenate the sentences into the summary when a sentence's score is at least (1.3 * threshold)

In [30]:
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, PorterStemmer

def _create_frequency_matrix(sentences):
    """Count non-stop-word occurrences for every sentence.

    Each sentence is tokenized with ``word_tokenize``, tokens are lower-cased,
    and English stop words are dropped before counting.

    :param sentences: iterable of sentence strings.
    :return: dict mapping the first 15 characters of each sentence to a
        ``{word: count}`` table for that sentence.
    """
    stop_words = set(stopwords.words("english"))
    frequency_matrix = {}

    for sent in sentences:
        freq_table = {}
        for token in word_tokenize(sent):
            token = token.lower()
            if token in stop_words:
                continue
            freq_table[token] = freq_table.get(token, 0) + 1

        # NOTE: the key is only a 15-character prefix, so two sentences that
        # start with the same 15 characters share one entry (the later table
        # overwrites the earlier one). _generate_summary looks scores up by
        # the same prefix, so the key format has to stay in sync with it.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [31]:

def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [32]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [33]:
import math

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table
        
    return idf_matrix

In [34]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [35]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [36]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [37]:
# Sample corpus: one string per sentence, used as input for every step below.
sentences = ['woman education catch term refers state tertiary health education girl woman',
             'million girl school globe majority developing country',
             'country world developing country take step improve condition education woman play role nation development',
             'consider society men stem support tree face element woman root',
             'root tree spreading branch sheltering protecting needy',
             'woman soul society society judged way woman treated',
             'educated man go make society educated woman go stay home make house occupant',
             'woman play role society mother wife sister care taker nurse etc',
             'need others understanding structure',
             'educated mother make child educated weigh education girl child boy',
             'history evidence society woman treated men educated grew',
             'mistake leave woman goal development achieved gender allowed opportunity education area',
             'education make woman become right raise voice exploitation violence',
             'society cannot progress woman weep',
             'weapon education carve path family']

In [38]:
# Count the non-stop words of every sentence; each table is keyed by the
# first 15 characters of its sentence.
freq_matrix = _create_frequency_matrix(sentences)

freq_matrix

{'woman education': {'woman': 2,
  'education': 2,
  'catch': 1,
  'term': 1,
  'refers': 1,
  'state': 1,
  'tertiary': 1,
  'health': 1,
  'girl': 1},
 'million girl sc': {'million': 1,
  'girl': 1,
  'school': 1,
  'globe': 1,
  'majority': 1,
  'developing': 1,
  'country': 1},
 'country world d': {'country': 2,
  'world': 1,
  'developing': 1,
  'take': 1,
  'step': 1,
  'improve': 1,
  'condition': 1,
  'education': 1,
  'woman': 1,
  'play': 1,
  'role': 1,
  'nation': 1,
  'development': 1},
 'consider societ': {'consider': 1,
  'society': 1,
  'men': 1,
  'stem': 1,
  'support': 1,
  'tree': 1,
  'face': 1,
  'element': 1,
  'woman': 1,
  'root': 1},
 'root tree sprea': {'root': 1,
  'tree': 1,
  'spreading': 1,
  'branch': 1,
  'sheltering': 1,
  'protecting': 1,
  'needy': 1},
 'woman soul soci': {'woman': 2,
  'soul': 1,
  'society': 2,
  'judged': 1,
  'way': 1,
  'treated': 1},
 'educated man go': {'educated': 2,
  'man': 1,
  'go': 2,
  'make': 2,
  'society': 1,
  'woma

In [39]:
# 3 Calculate the term frequency of every word and generate a matrix
# (each count is divided by the number of entries in that sentence's table)
tf_matrix = _create_tf_matrix(freq_matrix)

tf_matrix

{'woman education': {'woman': 0.2222222222222222,
  'education': 0.2222222222222222,
  'catch': 0.1111111111111111,
  'term': 0.1111111111111111,
  'refers': 0.1111111111111111,
  'state': 0.1111111111111111,
  'tertiary': 0.1111111111111111,
  'health': 0.1111111111111111,
  'girl': 0.1111111111111111},
 'million girl sc': {'million': 0.14285714285714285,
  'girl': 0.14285714285714285,
  'school': 0.14285714285714285,
  'globe': 0.14285714285714285,
  'majority': 0.14285714285714285,
  'developing': 0.14285714285714285,
  'country': 0.14285714285714285},
 'country world d': {'country': 0.15384615384615385,
  'world': 0.07692307692307693,
  'developing': 0.07692307692307693,
  'take': 0.07692307692307693,
  'step': 0.07692307692307693,
  'improve': 0.07692307692307693,
  'condition': 0.07692307692307693,
  'education': 0.07692307692307693,
  'woman': 0.07692307692307693,
  'play': 0.07692307692307693,
  'role': 0.07692307692307693,
  'nation': 0.07692307692307693,
  'development': 0.07

In [40]:
# 4 Create a table of documents per word: for each word, the number of
# sentence tables in freq_matrix that contain it
count_doc_per_words = _create_documents_per_words(freq_matrix)

count_doc_per_words

{'woman': 10,
 'education': 6,
 'catch': 1,
 'term': 1,
 'refers': 1,
 'state': 1,
 'tertiary': 1,
 'health': 1,
 'girl': 3,
 'million': 1,
 'school': 1,
 'globe': 1,
 'majority': 1,
 'developing': 2,
 'country': 2,
 'world': 1,
 'take': 1,
 'step': 1,
 'improve': 1,
 'condition': 1,
 'play': 2,
 'role': 2,
 'nation': 1,
 'development': 2,
 'consider': 1,
 'society': 6,
 'men': 2,
 'stem': 1,
 'support': 1,
 'tree': 2,
 'face': 1,
 'element': 1,
 'root': 2,
 'spreading': 1,
 'branch': 1,
 'sheltering': 1,
 'protecting': 1,
 'needy': 1,
 'soul': 1,
 'judged': 1,
 'way': 1,
 'treated': 2,
 'educated': 3,
 'man': 1,
 'go': 1,
 'make': 3,
 'stay': 1,
 'home': 1,
 'house': 1,
 'occupant': 1,
 'mother': 2,
 'wife': 1,
 'sister': 1,
 'care': 1,
 'taker': 1,
 'nurse': 1,
 'etc': 1,
 'need': 1,
 'others': 1,
 'understanding': 1,
 'structure': 1,
 'child': 1,
 'weigh': 1,
 'boy': 1,
 'history': 1,
 'evidence': 1,
 'grew': 1,
 'mistake': 1,
 'leave': 1,
 'goal': 1,
 'achieved': 1,
 'gender': 1,
 

In [41]:
import math

# 5 Calculate IDF and generate a matrix; every sentence counts as one document,
# and IDF(word) = log10(total_documents / number of sentences containing word)
total_documents = len(sentences)
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

idf_matrix

{'woman education': {'woman': 0.17609125905568124,
  'education': 0.3979400086720376,
  'catch': 1.1760912590556813,
  'term': 1.1760912590556813,
  'refers': 1.1760912590556813,
  'state': 1.1760912590556813,
  'tertiary': 1.1760912590556813,
  'health': 1.1760912590556813,
  'girl': 0.6989700043360189},
 'million girl sc': {'million': 1.1760912590556813,
  'girl': 0.6989700043360189,
  'school': 1.1760912590556813,
  'globe': 1.1760912590556813,
  'majority': 1.1760912590556813,
  'developing': 0.8750612633917001,
  'country': 0.8750612633917001},
 'country world d': {'country': 0.8750612633917001,
  'world': 1.1760912590556813,
  'developing': 0.8750612633917001,
  'take': 1.1760912590556813,
  'step': 1.1760912590556813,
  'improve': 1.1760912590556813,
  'condition': 1.1760912590556813,
  'education': 0.3979400086720376,
  'woman': 0.17609125905568124,
  'play': 0.8750612633917001,
  'role': 0.8750612633917001,
  'nation': 1.1760912590556813,
  'development': 0.8750612633917001},


In [42]:
# 6 Calculate TF-IDF (TF value * IDF value for every word) and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'woman education': {'woman': 0.0391313909012625, 'education': 0.08843111303823058, 'catch': 0.13067680656174235, 'term': 0.13067680656174235, 'refers': 0.13067680656174235, 'state': 0.13067680656174235, 'tertiary': 0.13067680656174235, 'health': 0.13067680656174235, 'girl': 0.0776633338151132}, 'million girl sc': {'million': 0.16801303700795447, 'girl': 0.0998528577622884, 'school': 0.16801303700795447, 'globe': 0.16801303700795447, 'majority': 0.16801303700795447, 'developing': 0.1250087519131, 'country': 0.1250087519131}, 'country world d': {'country': 0.13462480975256924, 'world': 0.09046855838889857, 'developing': 0.06731240487628462, 'take': 0.09046855838889857, 'step': 0.09046855838889857, 'improve': 0.09046855838889857, 'condition': 0.09046855838889857, 'education': 0.030610769897849048, 'woman': 0.013545481465821635, 'play': 0.06731240487628462, 'role': 0.06731240487628462, 'nation': 0.09046855838889857, 'development': 0.06731240487628462}, 'consider societ': {'consider': 0.11

In [43]:
# 7 Score every sentence from the TF-IDF values of its words
sentence_scores = _score_sentences(tf_idf_matrix)

In [44]:
# Inspect the TF-IDF table of the first sentence (looked up by its 15-character key)
tf_idf_matrix['woman education']

{'woman': 0.0391313909012625,
 'education': 0.08843111303823058,
 'catch': 0.13067680656174235,
 'term': 0.13067680656174235,
 'refers': 0.13067680656174235,
 'state': 0.13067680656174235,
 'tertiary': 0.13067680656174235,
 'health': 0.13067680656174235,
 'girl': 0.0776633338151132}

In [45]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [46]:
# Show the input sentences again for reference
sentences

['woman education catch term refers state tertiary health education girl woman',
 'million girl school globe majority developing country',
 'country world developing country take step improve condition education woman play role nation development',
 'consider society men stem support tree face element woman root',
 'root tree spreading branch sheltering protecting needy',
 'woman soul society society judged way woman treated',
 'educated man go make society educated woman go stay home make house occupant',
 'woman play role society mother wife sister care taker nurse etc',
 'need others understanding structure',
 'educated mother make child educated weigh education girl child boy',
 'history evidence society woman treated men educated grew',
 'mistake leave woman goal development achieved gender allowed opportunity education area',
 'education make woman become right raise voice exploitation violence',
 'society cannot progress woman weep',
 'weapon education carve path family']

In [47]:
# Sentence score per sentence

# Sentence score = (sum of the TF-IDF values of each word in the sentence) / total words in the sentence

sentence_scores

{'woman education': 0.10992074190278446,
 'million girl sc': 0.14598892994575804,
 'country world d': 0.07621861776575153,
 'consider societ': 0.09079671353181226,
 'root tree sprea': 0.1557260984094246,
 'woman soul soci': 0.15420548822261615,
 'educated man go': 0.11602550098461564,
 'woman play role': 0.0847583686961728,
 'need others und': 0.29402281476392034,
 'educated mother': 0.13708197383797716,
 'history evidenc': 0.10236558712522159,
 'mistake leave w': 0.08973407110384192,
 'education make ': 0.10283393612836818,
 'society cannot ': 0.1828883616149426,
 'weapon educatio': 0.20409220179579055}

In [48]:
# Threshold = sum of all the sentence scores / total number of sentences

threshold = _find_average_score(sentence_scores)

threshold

0.13644396038859988

In [49]:
# Check each sentence and its sentence score; if the score is greater than or equal to the threshold (1.3 * threshold), select the sentence

def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [50]:
# Build the summary, keeping only sentences that score at least 1.3 times the average score
_generate_summary(sentences,sentence_scores,1.3* threshold)

' need others understanding structure society cannot progress woman weep weapon education carve path family'