In [85]:
# Read text file
file = 'C:\MyWork\MyLearning\Career Growth\ML\Files\DataSet\SampleText.txt'

file = open(file , 'r')
text = file.read()

# Print the data
text

'Women education is a catch all term which refers to the state of primary, secondary, tertiary and health education in girls and women. There are 65 Million girls out of school across the globe; majority of them are in the developing and underdeveloped countries. All the countries of the world, especially the developing and underdeveloped countries must take necessary steps to improve their condition of female education; as women can play a vital role in the nation’s development.\nIf we consider society as tree, then men are like its strong main stem which supports the tree to face the elements and women are like its roots; most important of them all. The stronger the roots are the bigger and stronger the tree will be spreading its branches; sheltering and protecting the needy.\nWomen are the soul of a society; a society can well be judged by the way its women are treated. An educated man goes out to make the society better, while an educated woman; whether she goes out or stays at hom

In [86]:
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(text)
total_documents = len(sentences)

sentences

['Women education is a catch all term which refers to the state of primary, secondary, tertiary and health education in girls and women.',
 'There are 65 Million girls out of school across the globe; majority of them are in the developing and underdeveloped countries.',
 'All the countries of the world, especially the developing and underdeveloped countries must take necessary steps to improve their condition of female education; as women can play a vital role in the nation’s development.',
 'If we consider society as tree, then men are like its strong main stem which supports the tree to face the elements and women are like its roots; most important of them all.',
 'The stronger the roots are the bigger and stronger the tree will be spreading its branches; sheltering and protecting the needy.',
 'Women are the soul of a society; a society can well be judged by the way its women are treated.',
 'An educated man goes out to make the society better, while an educated woman; whether she g

In [87]:
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, PorterStemmer

def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            #word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [88]:
_create_frequency_matrix(sentences)

{'Women education': {'women': 2,
  'education': 2,
  'catch': 1,
  'term': 1,
  'refers': 1,
  'state': 1,
  'primary': 1,
  ',': 2,
  'secondary': 1,
  'tertiary': 1,
  'health': 1,
  'girls': 1,
  '.': 1},
 'There are 65 Mi': {'65': 1,
  'million': 1,
  'girls': 1,
  'school': 1,
  'across': 1,
  'globe': 1,
  ';': 1,
  'majority': 1,
  'developing': 1,
  'underdeveloped': 1,
  'countries': 1,
  '.': 1},
 'All the countri': {'countries': 2,
  'world': 1,
  ',': 1,
  'especially': 1,
  'developing': 1,
  'underdeveloped': 1,
  'must': 1,
  'take': 1,
  'necessary': 1,
  'steps': 1,
  'improve': 1,
  'condition': 1,
  'female': 1,
  'education': 1,
  ';': 1,
  'women': 1,
  'play': 1,
  'vital': 1,
  'role': 1,
  'nation': 1,
  '’': 1,
  'development': 1,
  '.': 1},
 'If we consider ': {'consider': 1,
  'society': 1,
  'tree': 2,
  ',': 1,
  'men': 1,
  'like': 2,
  'strong': 1,
  'main': 1,
  'stem': 1,
  'supports': 1,
  'face': 1,
  'elements': 1,
  'women': 1,
  'roots': 1,
  ';': 

In [89]:

def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [90]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [91]:
import math

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table
        
    return idf_matrix

In [92]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [93]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [211]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [94]:
freq_matrix = _create_frequency_matrix(sentences)

freq_matrix

{'Women education': {'women': 2,
  'education': 2,
  'catch': 1,
  'term': 1,
  'refers': 1,
  'state': 1,
  'primary': 1,
  ',': 2,
  'secondary': 1,
  'tertiary': 1,
  'health': 1,
  'girls': 1,
  '.': 1},
 'There are 65 Mi': {'65': 1,
  'million': 1,
  'girls': 1,
  'school': 1,
  'across': 1,
  'globe': 1,
  ';': 1,
  'majority': 1,
  'developing': 1,
  'underdeveloped': 1,
  'countries': 1,
  '.': 1},
 'All the countri': {'countries': 2,
  'world': 1,
  ',': 1,
  'especially': 1,
  'developing': 1,
  'underdeveloped': 1,
  'must': 1,
  'take': 1,
  'necessary': 1,
  'steps': 1,
  'improve': 1,
  'condition': 1,
  'female': 1,
  'education': 1,
  ';': 1,
  'women': 1,
  'play': 1,
  'vital': 1,
  'role': 1,
  'nation': 1,
  '’': 1,
  'development': 1,
  '.': 1},
 'If we consider ': {'consider': 1,
  'society': 1,
  'tree': 2,
  ',': 1,
  'men': 1,
  'like': 2,
  'strong': 1,
  'main': 1,
  'stem': 1,
  'supports': 1,
  'face': 1,
  'elements': 1,
  'women': 1,
  'roots': 1,
  ';': 

In [95]:
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = _create_tf_matrix(freq_matrix)
print(len(tf_matrix))

15


In [96]:
# 4 creating table for documents per words
count_doc_per_words = _create_documents_per_words(freq_matrix)

count_doc_per_words

{'women': 9,
 'education': 6,
 'catch': 1,
 'term': 1,
 'refers': 1,
 'state': 1,
 'primary': 1,
 ',': 8,
 'secondary': 1,
 'tertiary': 1,
 'health': 1,
 'girls': 2,
 '.': 15,
 '65': 1,
 'million': 1,
 'school': 1,
 'across': 1,
 'globe': 1,
 ';': 8,
 'majority': 1,
 'developing': 2,
 'underdeveloped': 2,
 'countries': 2,
 'world': 1,
 'especially': 1,
 'must': 1,
 'take': 1,
 'necessary': 1,
 'steps': 1,
 'improve': 1,
 'condition': 1,
 'female': 1,
 'play': 2,
 'vital': 1,
 'role': 1,
 'nation': 1,
 '’': 1,
 'development': 2,
 'consider': 1,
 'society': 4,
 'tree': 2,
 'men': 2,
 'like': 1,
 'strong': 1,
 'main': 1,
 'stem': 1,
 'supports': 1,
 'face': 1,
 'elements': 1,
 'roots': 2,
 'important': 1,
 'stronger': 1,
 'bigger': 1,
 'spreading': 1,
 'branches': 1,
 'sheltering': 1,
 'protecting': 1,
 'needy': 1,
 'soul': 1,
 'well': 3,
 'judged': 1,
 'way': 1,
 'treated': 2,
 'educated': 3,
 'man': 1,
 'goes': 1,
 'make': 2,
 'better': 2,
 'woman': 1,
 'whether': 1,
 'stays': 1,
 'home

In [97]:
import math
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

idf_matrix

{'Women education': {'women': 0.2218487496163564,
  'education': 0.3979400086720376,
  'catch': 1.1760912590556813,
  'term': 1.1760912590556813,
  'refers': 1.1760912590556813,
  'state': 1.1760912590556813,
  'primary': 1.1760912590556813,
  ',': 0.27300127206373764,
  'secondary': 1.1760912590556813,
  'tertiary': 1.1760912590556813,
  'health': 1.1760912590556813,
  'girls': 0.8750612633917001,
  '.': 0.0},
 'There are 65 Mi': {'65': 1.1760912590556813,
  'million': 1.1760912590556813,
  'girls': 0.8750612633917001,
  'school': 1.1760912590556813,
  'across': 1.1760912590556813,
  'globe': 1.1760912590556813,
  ';': 0.27300127206373764,
  'majority': 1.1760912590556813,
  'developing': 0.8750612633917001,
  'underdeveloped': 0.8750612633917001,
  'countries': 0.8750612633917001,
  '.': 0.0},
 'All the countri': {'countries': 0.8750612633917001,
  'world': 1.1760912590556813,
  ',': 0.27300127206373764,
  'especially': 1.1760912590556813,
  'developing': 0.8750612633917001,
  'under

In [98]:
# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'Women education': {'women': 0.03413057686405483, 'education': 0.061221539795698096, 'catch': 0.09046855838889857, 'term': 0.09046855838889857, 'refers': 0.09046855838889857, 'state': 0.09046855838889857, 'primary': 0.09046855838889857, ',': 0.04200019570211348, 'secondary': 0.09046855838889857, 'tertiary': 0.09046855838889857, 'health': 0.09046855838889857, 'girls': 0.06731240487628462, '.': 0.0}, 'There are 65 Mi': {'65': 0.09800760492130678, 'million': 0.09800760492130678, 'girls': 0.07292177194930834, 'school': 0.09800760492130678, 'across': 0.09800760492130678, 'globe': 0.09800760492130678, ';': 0.02275010600531147, 'majority': 0.09800760492130678, 'developing': 0.07292177194930834, 'underdeveloped': 0.07292177194930834, 'countries': 0.07292177194930834, '.': 0.0}, 'All the countri': {'countries': 0.07609228377319131, 'world': 0.05113440256763832, ',': 0.011869620524510332, 'especially': 0.05113440256763832, 'developing': 0.03804614188659566, 'underdeveloped': 0.03804614188659566

In [99]:
tf_idf_matrix['Women education']

{'women': 0.03413057686405483,
 'education': 0.061221539795698096,
 'catch': 0.09046855838889857,
 'term': 0.09046855838889857,
 'refers': 0.09046855838889857,
 'state': 0.09046855838889857,
 'primary': 0.09046855838889857,
 ',': 0.04200019570211348,
 'secondary': 0.09046855838889857,
 'tertiary': 0.09046855838889857,
 'health': 0.09046855838889857,
 'girls': 0.06731240487628462,
 '.': 0.0}

In [244]:
sentences = ['woman education catch term refers state tertiary health education girl woman',
 'million girl school globe majority developing country',
 'country world developing country take step improve condition education woman play role nation development',
 'consider society men stem support tree face element woman root',
 'root tree spreading branch sheltering protecting needy',
 'woman soul society society judged way woman treated',
 'educated man go make society educated woman go stay home make house occupant',
 'woman play role society mother wife sister care taker nurse etc',
 'need others understanding structure',
 'educated mother make child educated weigh education girl child boy',
 'history evidence society woman treated men educated grew',
 'mistake leave woman goal development achieved gender allowed opportunity education area',
 'education make woman become right raise voice exploitation violence',
 'society cannot progress woman weep',
 'weapon education carve path family']

In [245]:
freq_matrix = _create_frequency_matrix(sentences)

freq_matrix

{'woman education': {'woman': 2,
  'education': 2,
  'catch': 1,
  'term': 1,
  'refers': 1,
  'state': 1,
  'tertiary': 1,
  'health': 1,
  'girl': 1},
 'million girl sc': {'million': 1,
  'girl': 1,
  'school': 1,
  'globe': 1,
  'majority': 1,
  'developing': 1,
  'country': 1},
 'country world d': {'country': 2,
  'world': 1,
  'developing': 1,
  'take': 1,
  'step': 1,
  'improve': 1,
  'condition': 1,
  'education': 1,
  'woman': 1,
  'play': 1,
  'role': 1,
  'nation': 1,
  'development': 1},
 'consider societ': {'consider': 1,
  'society': 1,
  'men': 1,
  'stem': 1,
  'support': 1,
  'tree': 1,
  'face': 1,
  'element': 1,
  'woman': 1,
  'root': 1},
 'root tree sprea': {'root': 1,
  'tree': 1,
  'spreading': 1,
  'branch': 1,
  'sheltering': 1,
  'protecting': 1,
  'needy': 1},
 'woman soul soci': {'woman': 2,
  'soul': 1,
  'society': 2,
  'judged': 1,
  'way': 1,
  'treated': 1},
 'educated man go': {'educated': 2,
  'man': 1,
  'go': 2,
  'make': 2,
  'society': 1,
  'woma

In [246]:
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = _create_tf_matrix(freq_matrix)

In [247]:
# 4 creating table for documents per words
count_doc_per_words = _create_documents_per_words(freq_matrix)

count_doc_per_words

{'woman': 10,
 'education': 6,
 'catch': 1,
 'term': 1,
 'refers': 1,
 'state': 1,
 'tertiary': 1,
 'health': 1,
 'girl': 3,
 'million': 1,
 'school': 1,
 'globe': 1,
 'majority': 1,
 'developing': 2,
 'country': 2,
 'world': 1,
 'take': 1,
 'step': 1,
 'improve': 1,
 'condition': 1,
 'play': 2,
 'role': 2,
 'nation': 1,
 'development': 2,
 'consider': 1,
 'society': 6,
 'men': 2,
 'stem': 1,
 'support': 1,
 'tree': 2,
 'face': 1,
 'element': 1,
 'root': 2,
 'spreading': 1,
 'branch': 1,
 'sheltering': 1,
 'protecting': 1,
 'needy': 1,
 'soul': 1,
 'judged': 1,
 'way': 1,
 'treated': 2,
 'educated': 3,
 'man': 1,
 'go': 1,
 'make': 3,
 'stay': 1,
 'home': 1,
 'house': 1,
 'occupant': 1,
 'mother': 2,
 'wife': 1,
 'sister': 1,
 'care': 1,
 'taker': 1,
 'nurse': 1,
 'etc': 1,
 'need': 1,
 'others': 1,
 'understanding': 1,
 'structure': 1,
 'child': 1,
 'weigh': 1,
 'boy': 1,
 'history': 1,
 'evidence': 1,
 'grew': 1,
 'mistake': 1,
 'leave': 1,
 'goal': 1,
 'achieved': 1,
 'gender': 1,
 

In [248]:
import math
idf_matrix = _create_idaf_matrix(freq_matrix, count_doc_per_words, total_documents)

idf_matrix

{'woman education': {'woman': 0.17609125905568124,
  'education': 0.3979400086720376,
  'catch': 1.1760912590556813,
  'term': 1.1760912590556813,
  'refers': 1.1760912590556813,
  'state': 1.1760912590556813,
  'tertiary': 1.1760912590556813,
  'health': 1.1760912590556813,
  'girl': 0.6989700043360189},
 'million girl sc': {'million': 1.1760912590556813,
  'girl': 0.6989700043360189,
  'school': 1.1760912590556813,
  'globe': 1.1760912590556813,
  'majority': 1.1760912590556813,
  'developing': 0.8750612633917001,
  'country': 0.8750612633917001},
 'country world d': {'country': 0.8750612633917001,
  'world': 1.1760912590556813,
  'developing': 0.8750612633917001,
  'take': 1.1760912590556813,
  'step': 1.1760912590556813,
  'improve': 1.1760912590556813,
  'condition': 1.1760912590556813,
  'education': 0.3979400086720376,
  'woman': 0.17609125905568124,
  'play': 0.8750612633917001,
  'role': 0.8750612633917001,
  'nation': 1.1760912590556813,
  'development': 0.8750612633917001},


In [249]:
# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'woman education': {'woman': 0.0391313909012625, 'education': 0.08843111303823058, 'catch': 0.13067680656174235, 'term': 0.13067680656174235, 'refers': 0.13067680656174235, 'state': 0.13067680656174235, 'tertiary': 0.13067680656174235, 'health': 0.13067680656174235, 'girl': 0.0776633338151132}, 'million girl sc': {'million': 0.16801303700795447, 'girl': 0.0998528577622884, 'school': 0.16801303700795447, 'globe': 0.16801303700795447, 'majority': 0.16801303700795447, 'developing': 0.1250087519131, 'country': 0.1250087519131}, 'country world d': {'country': 0.13462480975256924, 'world': 0.09046855838889857, 'developing': 0.06731240487628462, 'take': 0.09046855838889857, 'step': 0.09046855838889857, 'improve': 0.09046855838889857, 'condition': 0.09046855838889857, 'education': 0.030610769897849048, 'woman': 0.013545481465821635, 'play': 0.06731240487628462, 'role': 0.06731240487628462, 'nation': 0.09046855838889857, 'development': 0.06731240487628462}, 'consider societ': {'consider': 0.11

In [250]:
sentence_scores = _score_sentences(tf_idf_matrix)

sentence_scores

{'woman education': 0.10992074190278446,
 'million girl sc': 0.14598892994575804,
 'country world d': 0.07621861776575153,
 'consider societ': 0.09079671353181226,
 'root tree sprea': 0.1557260984094246,
 'woman soul soci': 0.15420548822261615,
 'educated man go': 0.11602550098461564,
 'woman play role': 0.0847583686961728,
 'need others und': 0.29402281476392034,
 'educated mother': 0.13708197383797716,
 'history evidenc': 0.10236558712522159,
 'mistake leave w': 0.08973407110384192,
 'education make ': 0.10283393612836818,
 'society cannot ': 0.1828883616149426,
 'weapon educatio': 0.20409220179579055}

In [251]:
threshold = _find_average_score(sentence_scores)

threshold

0.13644396038859988

In [252]:
senetnce = 'woman education'

print("Sentence:",sentences[0])

print("\n")

print("tf_idf Score for words in sentence:")
print(tf_idf_matrix[senetnce])

tfidf_words_in_Sentences = list(tf_idf_matrix[senetnce].values())

number_of_words_sentences = len(tfidf_words_in_Sentences)

sentence_score = sum(tfidf_words_in_Sentences) / number_of_words_sentences

print("\n")

print("Number of words in the sentences:",number_of_words_sentences)

print("\n")
print("sentence_score:",sentence_score)

Sentence: woman education catch term refers state tertiary health education girl woman


tf_idf Score for words in sentence:
{'woman': 0.0391313909012625, 'education': 0.08843111303823058, 'catch': 0.13067680656174235, 'term': 0.13067680656174235, 'refers': 0.13067680656174235, 'state': 0.13067680656174235, 'tertiary': 0.13067680656174235, 'health': 0.13067680656174235, 'girl': 0.0776633338151132}


Number of words in the sentences: 9


sentence_score: 0.10992074190278446


In [253]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [254]:
sentences

['woman education catch term refers state tertiary health education girl woman',
 'million girl school globe majority developing country',
 'country world developing country take step improve condition education woman play role nation development',
 'consider society men stem support tree face element woman root',
 'root tree spreading branch sheltering protecting needy',
 'woman soul society society judged way woman treated',
 'educated man go make society educated woman go stay home make house occupant',
 'woman play role society mother wife sister care taker nurse etc',
 'need others understanding structure',
 'educated mother make child educated weigh education girl child boy',
 'history evidence society woman treated men educated grew',
 'mistake leave woman goal development achieved gender allowed opportunity education area',
 'education make woman become right raise voice exploitation violence',
 'society cannot progress woman weep',
 'weapon education carve path family']

In [255]:
sentence_scores

{'woman education': 0.10992074190278446,
 'million girl sc': 0.14598892994575804,
 'country world d': 0.07621861776575153,
 'consider societ': 0.09079671353181226,
 'root tree sprea': 0.1557260984094246,
 'woman soul soci': 0.15420548822261615,
 'educated man go': 0.11602550098461564,
 'woman play role': 0.0847583686961728,
 'need others und': 0.29402281476392034,
 'educated mother': 0.13708197383797716,
 'history evidenc': 0.10236558712522159,
 'mistake leave w': 0.08973407110384192,
 'education make ': 0.10283393612836818,
 'society cannot ': 0.1828883616149426,
 'weapon educatio': 0.20409220179579055}

In [256]:
threshold

0.13644396038859988

In [257]:
sentences

['woman education catch term refers state tertiary health education girl woman',
 'million girl school globe majority developing country',
 'country world developing country take step improve condition education woman play role nation development',
 'consider society men stem support tree face element woman root',
 'root tree spreading branch sheltering protecting needy',
 'woman soul society society judged way woman treated',
 'educated man go make society educated woman go stay home make house occupant',
 'woman play role society mother wife sister care taker nurse etc',
 'need others understanding structure',
 'educated mother make child educated weigh education girl child boy',
 'history evidence society woman treated men educated grew',
 'mistake leave woman goal development achieved gender allowed opportunity education area',
 'education make woman become right raise voice exploitation violence',
 'society cannot progress woman weep',
 'weapon education carve path family']

In [268]:
sentence_scores['woman education']

0.10992074190278446

In [288]:
summary = ''

for s in sentences:
    print(s)
    sentence_scores[s[:15]] >= 1.3 * threshold
    summary += " " + s
    print("generate Summary:")
    print(summary)

woman education catch term refers state tertiary health education girl woman
generate Summary:
 woman education catch term refers state tertiary health education girl woman
million girl school globe majority developing country
generate Summary:
 woman education catch term refers state tertiary health education girl woman million girl school globe majority developing country
country world developing country take step improve condition education woman play role nation development
generate Summary:
 woman education catch term refers state tertiary health education girl woman million girl school globe majority developing country country world developing country take step improve condition education woman play role nation development
consider society men stem support tree face element woman root
generate Summary:
 woman education catch term refers state tertiary health education girl woman million girl school globe majority developing country country world developing country take step impro

In [284]:

    print(sentence[:15])
    if sentence[:15] in sentence_scores[sentence[:15]] >= (1.3 * threshold):
        
        summary += " " + sentence

woman education


TypeError: argument of type 'float' is not iterable

In [259]:
sentence_scores[:15]

TypeError: unhashable type: 'slice'

In [199]:
senetnce = 'million girl sc'

print("Sentence:",sentences[1])

print("\n")

print("tf_idf Score for words in sentence:")
print(tf_idf_matrix[senetnce])

tfidf_words_in_Sentences = list(tf_idf_matrix[senetnce].values())

number_of_words_sentences = len(tfidf_words_in_Sentences)

sentence_score = sum(tfidf_words_in_Sentences) / number_of_words_sentences

print("\n")

print("Number of words in the sentences:",number_of_words_sentences)

print("\n")
print("sentence_score:",sentence_score)

Sentence: million girl school globe majority developing country


tf_idf Score for words in sentence:
{'million': 0.16801303700795447, 'girl': 0.0998528577622884, 'school': 0.16801303700795447, 'globe': 0.16801303700795447, 'majority': 0.16801303700795447, 'developing': 0.1250087519131, 'country': 0.1250087519131}


Number of words in the sentences: 7


sentence_score: 0.14598892994575804


In [200]:
# tf_idf_matrix['country world d']

senetnce = 'country world d'

print("Sentence:",sentences[2])

print("\n")

print("tf_idf Score for words in sentence:")
print(tf_idf_matrix[senetnce])

tfidf_words_in_Sentences = list(tf_idf_matrix[senetnce].values())

number_of_words_sentences = len(tfidf_words_in_Sentences)

sentence_score = sum(tfidf_words_in_Sentences) / number_of_words_sentences

print("\n")

print("Number of words in the sentences:",number_of_words_sentences)

print("\n")
print("sentence_score:",sentence_score)

Sentence: country world developing country take step improve condition education woman play role nation development


tf_idf Score for words in sentence:
{'country': 0.13462480975256924, 'world': 0.09046855838889857, 'developing': 0.06731240487628462, 'take': 0.09046855838889857, 'step': 0.09046855838889857, 'improve': 0.09046855838889857, 'condition': 0.09046855838889857, 'education': 0.030610769897849048, 'woman': 0.013545481465821635, 'play': 0.06731240487628462, 'role': 0.06731240487628462, 'nation': 0.09046855838889857, 'development': 0.06731240487628462}


Number of words in the sentences: 13


sentence_score: 0.07621861776575153


In [204]:
senetnce = 'consider societ'

print("Sentence:",sentences[3])

print("\n")

print("tf_idf Score for words in sentence:")
print(tf_idf_matrix[senetnce])

tfidf_words_in_Sentences = list(tf_idf_matrix[senetnce].values())

number_of_words_sentences = len(tfidf_words_in_Sentences)

sentence_score = sum(tfidf_words_in_Sentences) / number_of_words_sentences

print("\n")

print("Number of words in the sentences:",number_of_words_sentences)

print("\n")
print("sentence_score:",sentence_score)

Sentence: consider society men stem support tree face element woman root


tf_idf Score for words in sentence:
{'consider': 0.11760912590556814, 'society': 0.03979400086720376, 'men': 0.08750612633917002, 'stem': 0.11760912590556814, 'support': 0.11760912590556814, 'tree': 0.08750612633917002, 'face': 0.11760912590556814, 'element': 0.11760912590556814, 'woman': 0.017609125905568124, 'root': 0.08750612633917002}


Number of words in the sentences: 10


sentence_score: 0.09079671353181226


In [205]:
senetnce = 'root tree sprea'

print("Sentence:",sentences[4])

print("\n")

print("tf_idf Score for words in sentence:")
print(tf_idf_matrix[senetnce])

tfidf_words_in_Sentences = list(tf_idf_matrix[senetnce].values())

number_of_words_sentences = len(tfidf_words_in_Sentences)

sentence_score = sum(tfidf_words_in_Sentences) / number_of_words_sentences

print("\n")

print("Number of words in the sentences:",number_of_words_sentences)

print("\n")
print("sentence_score:",sentence_score)

Sentence: root tree spreading branch sheltering protecting needy


tf_idf Score for words in sentence:
{'root': 0.1250087519131, 'tree': 0.1250087519131, 'spreading': 0.16801303700795447, 'branch': 0.16801303700795447, 'sheltering': 0.16801303700795447, 'protecting': 0.16801303700795447, 'needy': 0.16801303700795447}


Number of words in the sentences: 7


sentence_score: 0.1557260984094246


In [207]:
senetnce = 'woman soul soci'

print("Sentence:",sentences[5])

print("\n")

print("tf_idf Score for words in sentence:")
print(tf_idf_matrix[senetnce])

tfidf_words_in_Sentences = list(tf_idf_matrix[senetnce].values())

number_of_words_sentences = len(tfidf_words_in_Sentences)

sentence_score = sum(tfidf_words_in_Sentences) / number_of_words_sentences

print("\n")

print("Number of words in the sentences:",number_of_words_sentences)

print("\n")
print("sentence_score:",sentence_score)

Sentence: woman soul society society judged way woman treated


tf_idf Score for words in sentence:
{'woman': 0.058697086351893746, 'soul': 0.19601520984261356, 'society': 0.13264666955734586, 'judged': 0.19601520984261356, 'way': 0.19601520984261356, 'treated': 0.14584354389861667}


Number of words in the sentences: 6


sentence_score: 0.15420548822261615


In [208]:
senetnce = 'educated man go'

print("Sentence:",sentences[6])

print("\n")

print("tf_idf Score for words in sentence:")
print(tf_idf_matrix[senetnce])

tfidf_words_in_Sentences = list(tf_idf_matrix[senetnce].values())

number_of_words_sentences = len(tfidf_words_in_Sentences)

sentence_score = sum(tfidf_words_in_Sentences) / number_of_words_sentences

print("\n")

print("Number of words in the sentences:",number_of_words_sentences)

print("\n")
print("sentence_score:",sentence_score)

Sentence: educated man go make society educated woman go stay home make house occupant


tf_idf Score for words in sentence:
{'educated': 0.13979400086720378, 'man': 0.11760912590556814, 'go': 0.2352182518111363, 'make': 0.13979400086720378, 'society': 0.03979400086720376, 'woman': 0.017609125905568124, 'stay': 0.11760912590556814, 'home': 0.11760912590556814, 'house': 0.11760912590556814, 'occupant': 0.11760912590556814}


Number of words in the sentences: 10


sentence_score: 0.11602550098461564


In [127]:
tf_idf_matrix['woman play role']

{'woman education': {'woman': 0.0391313909012625,
  'education': 0.08843111303823058,
  'catch': 0.13067680656174235,
  'term': 0.13067680656174235,
  'refers': 0.13067680656174235,
  'state': 0.13067680656174235,
  'tertiary': 0.13067680656174235,
  'health': 0.13067680656174235,
  'girl': 0.0776633338151132},
 'million girl sc': {'million': 0.16801303700795447,
  'girl': 0.0998528577622884,
  'school': 0.16801303700795447,
  'globe': 0.16801303700795447,
  'majority': 0.16801303700795447,
  'developing': 0.1250087519131,
  'country': 0.1250087519131},
 'country world d': {'country': 0.13462480975256924,
  'world': 0.09046855838889857,
  'developing': 0.06731240487628462,
  'take': 0.09046855838889857,
  'step': 0.09046855838889857,
  'improve': 0.09046855838889857,
  'condition': 0.09046855838889857,
  'education': 0.030610769897849048,
  'woman': 0.013545481465821635,
  'play': 0.06731240487628462,
  'role': 0.06731240487628462,
  'nation': 0.09046855838889857,
  'development': 0.06

In [128]:
tf_idf_matrix['need others und']

{'need': 0.29402281476392034,
 'others': 0.29402281476392034,
 'understanding': 0.29402281476392034,
 'structure': 0.29402281476392034}

In [131]:
tf_idf_matrix['educated mother']

{'educated': 0.17474250108400471,
 'mother': 0.10938265792396251,
 'make': 0.08737125054200236,
 'child': 0.29402281476392034,
 'weigh': 0.14701140738196017,
 'education': 0.0497425010840047,
 'girl': 0.08737125054200236,
 'boy': 0.14701140738196017}

In [130]:
tf_idf_matrix['history evidenc']

{'history': 0.14701140738196017,
 'evidence': 0.14701140738196017,
 'society': 0.0497425010840047,
 'woman': 0.022011407381960155,
 'treated': 0.10938265792396251,
 'men': 0.10938265792396251,
 'educated': 0.08737125054200236,
 'grew': 0.14701140738196017}

In [142]:
tf_idf_matrix['mistake leave w']

{'mistake': 0.10691738718688013,
 'leave': 0.10691738718688013,
 'woman': 0.016008296277789203,
 'goal': 0.10691738718688013,
 'development': 0.07955102394470001,
 'achieved': 0.10691738718688013,
 'gender': 0.10691738718688013,
 'allowed': 0.10691738718688013,
 'opportunity': 0.10691738718688013,
 'education': 0.03617636442473069,
 'area': 0.10691738718688013}

In [143]:
tf_idf_matrix['education make ']

{'education': 0.04421555651911529,
 'make': 0.0776633338151132,
 'woman': 0.01956569545063125,
 'become': 0.13067680656174235,
 'right': 0.13067680656174235,
 'raise': 0.13067680656174235,
 'voice': 0.13067680656174235,
 'exploitation': 0.13067680656174235,
 'violence': 0.13067680656174235}

In [144]:
tf_idf_matrix['society cannot ']

{'society': 0.0994850021680094,
 'progress': 0.29402281476392034,
 'woman': 0.04402281476392031,
 'weep': 0.29402281476392034}

In [145]:
tf_idf_matrix['weapon educatio']

{'weapon': 0.2352182518111363,
 'education': 0.07958800173440753,
 'carve': 0.2352182518111363,
 'path': 0.2352182518111363,
 'family': 0.2352182518111363}

In [195]:
senetnce = 'woman education'

print("Sentence:",sentences[0])

print("\n")

print("tf_idf Score for words in sentence:")
print(tf_idf_matrix[senetnce])

tfidf_words_in_Sentences = list(tf_idf_matrix[senetnce].values())

number_of_words_sentences = len(tfidf_words_in_Sentences)

sentence_score = sum(tfidf_words_in_Sentences) / number_of_words_sentences

print("\n")

print("Number of words in the sentences:",number_of_words_sentences)

print("\n")
print("sentence_score:",sentence_score)

Sentence: woman education catch term refers state tertiary health education girl woman


tf_idf Score for words in sentence:
{'woman': 0.0391313909012625, 'education': 0.08843111303823058, 'catch': 0.13067680656174235, 'term': 0.13067680656174235, 'refers': 0.13067680656174235, 'state': 0.13067680656174235, 'tertiary': 0.13067680656174235, 'health': 0.13067680656174235, 'girl': 0.0776633338151132}


Number of words in the sentences: 9


sentence_score: 0.10992074190278446


In [194]:
# _score_sentences = 

# Total sum of ididf value for each word and total divided by number of words in the sentence


sum([x for x in xx]) / 9

0.10992074190278446

In [172]:
number_of_words_sentences

[0.0391313909012625,
 0.08843111303823058,
 0.13067680656174235,
 0.13067680656174235,
 0.13067680656174235,
 0.13067680656174235,
 0.13067680656174235,
 0.13067680656174235,
 0.0776633338151132]

In [171]:
sum(tfidf_words_in_Sentences) / 9

0.10992074190278446