In [1]:
# !pip install nltk

In [2]:
# pip install customtkinter

In [3]:
# pip install wordcloud

In [4]:
from customtkinter import *
from PIL import Image

In [5]:
import nltk

# Fetch the tokenizer models and the stop-word list used by the summarizer.
# NLTK >= 3.8.2 loads the Punkt sentence tokenizer from the 'punkt_tab'
# resource instead of the pickled 'punkt' one, so both are requested to keep
# sent_tokenize()/word_tokenize() working on old and new installations alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\RITIK
[nltk_data]     NAYAK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\RITIK
[nltk_data]     NAYAK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

In [7]:
def _create_frequency_matrix(sentences):
    """
    Count the stemmed, non-stop-word tokens of every sentence.

    :param sentences: iterable of sentence strings
    :return: dict mapping the first 15 characters of each sentence to a
             ``{stemmed_word: occurrences}`` table
    :rtype: dict
    """
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        for word in word_tokenize(sent):
            word = word.lower()

            # Filter on the surface form first: the stop-word list holds
            # unstemmed words, and stemming turns e.g. "this" into "thi",
            # which would no longer match and would leak into the table.
            if word in stopWords:
                continue

            word = ps.stem(word)
            # Keep the post-stemming check too, so every token that used to
            # be dropped (its stem is itself a stop word) is still dropped.
            if word in stopWords:
                continue

            freq_table[word] = freq_table.get(word, 0) + 1

        # NOTE: the key is only the sentence's first 15 characters, and
        # _generate_summary looks sentences up by the same prefix. Two
        # sentences sharing a 15-character prefix overwrite each other here.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [8]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [9]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [10]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [11]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [12]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [13]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))*0.8

    return average

In [14]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [15]:
def summ(input_string, threshold_factor=1.3):
    """
    Produce an extractive summary of ``input_string`` using TF-IDF scoring.

    Each sentence is treated as a document. Sentences whose mean TF-IDF
    score reaches ``threshold_factor`` times the value returned by
    ``_find_average_score`` are kept, in their original order.

    :param input_string: the text to summarise
    :param threshold_factor: multiplier applied to the average-score
                             threshold (defaults to 1.3, the value that was
                             previously hard-coded)
    :return: the summary text ('' when the input contains no sentences)
    :rtype: str
    """
    # 1 Sentence Tokenize (NLTK function); every sentence counts as a document
    sentences = sent_tokenize(input_string)
    total_documents = len(sentences)

    if total_documents == 0:
        # Empty or whitespace-only input: nothing to score, and averaging
        # over zero sentences would otherwise divide by zero.
        return ''

    # 2 Create the Frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)

    # 3 Calculate TermFrequency and generate a matrix.
    # Term frequency (TF) is how often a word appears in a document,
    # divided by how many words there are in the document.
    tf_matrix = _create_tf_matrix(freq_matrix)

    # 4 creating table for documents per words
    count_doc_per_words = _create_documents_per_words(freq_matrix)

    # 5 Calculate IDF and generate a matrix.
    # Inverse document frequency (IDF) is how unique or rare a word is.
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

    # 6 Calculate TF-IDF and generate a matrix
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

    # 7 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(tf_idf_matrix)

    # 8 Find the threshold
    threshold = _find_average_score(sentence_scores)

    # 9 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, threshold_factor * threshold)

    return summary

In [16]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [17]:
def process_input():
    """
    Button callback: summarise the text in ``text_box1`` into ``text_box2``
    and show a word cloud of the input.

    Blank input clears the summary box and returns without summarising or
    plotting. If the word cloud cannot be built from the text, the summary
    is still shown and the plot is skipped.
    """
    # "end-1c" leaves out the newline Tk appends after the last character.
    input_text = text_box1.get("1.0", "end-1c")

    if not input_text.strip():
        # Nothing to summarise; WordCloud would also reject an empty text.
        text_box2.delete("1.0", "end")
        return

    processed_text = summ(input_text)
    text_box2.delete("1.0", "end")
    text_box2.insert("1.0", processed_text)

    # Create and generate the word cloud
    try:
        wordcloud = WordCloud().generate(input_text)
    except ValueError:
        # Raised by WordCloud when the text contains no plottable words
        # (for example only stop words); keep the summary, skip the plot.
        return

    # Display the word cloud
    plt.figure(figsize=(8, 6))
    plt.imshow(wordcloud, interpolation='nearest')
    plt.axis('off')
    plt.title("Word Cloud Visualization")
    plt.show()



In [18]:
# text = "The Middle East is on the brink of a devastating full-scale conflict as Iran and Israel engage in a dangerous game of retaliation. On Saturday, April 13, Iran launched a massive drone and missile attack on Israel, following an Israeli strike on its embassy compound in Syria. Iranian Foreign Minister Hossein Amirabdollahian claimed that Iran gave neighbouring countries and Israel's ally, the United States, 72 hours' notice before the attack. However, a senior official in US President Joe Biden's administration denied this, stating that Washington did not receive any such warning."

In [19]:
# summ("The Middle East is on the brink of a devastating full-scale conflict as Iran and Israel engage in a dangerous game of retaliation. On Saturday, April 13, Iran launched a massive drone and missile attack on Israel, following an Israeli strike on its embassy compound in Syria. Iranian Foreign Minister Hossein Amirabdollahian claimed that Iran gave neighbouring countries and Israel's ally, the United States, 72 hours' notice before the attack. However, a senior official in US President Joe Biden's administration denied this, stating that Washington did not receive any such warning.")

In [21]:
# ---------------------------------------------------------------------------
# GUI: a fixed-size 600x650 window with a decorative image on the left and,
# on the right, an input text box, a summary text box and a button that runs
# process_input().
# ---------------------------------------------------------------------------
app = CTk()
app.geometry("600x650")
# Passes 0 for both arguments; presumably this disables resizing in both
# directions (standard Tk behaviour) - confirm for customtkinter.
app.resizable(0,0)

# Image assets are opened via relative paths, so "side-img.png" and
# "email-icon.png" must be in the working directory.
side_img_data = Image.open("side-img.png")
txt_icon_data = Image.open("email-icon.png")

# The same picture is used for both the light and the dark appearance mode.
side_img = CTkImage(dark_image=side_img_data, light_image=side_img_data, size=(150, 650))
txt_icon = CTkImage(dark_image=txt_icon_data, light_image=txt_icon_data, size=(20,20))

# Left column: the side image (a label with empty text).
CTkLabel(master=app, text="", image=side_img).pack(expand=True, side="left")

# Right column: white frame holding every other widget.
frame = CTkFrame(master=app, width=480, height=650, fg_color="#ffffff")
# Presumably keeps the frame at its configured width/height instead of
# resizing to fit its children (standard Tk pack_propagate behaviour).
frame.pack_propagate(0)
frame.pack(expand=True, side="right")

# Title and author caption.
CTkLabel(master=frame, text="Welcome To Text Summarizer!", text_color="#601E88", anchor="w", justify="left", font=("Arial Bold", 24)).pack(anchor="w", pady=(50, 5), padx=(25, 0))
CTkLabel(master=frame, text="Created by Ritik Nayak", text_color="#7E7E7E", anchor="w", justify="left", font=("Arial Bold", 12)).pack(anchor="w", padx=(25, 0))

# Input area: text_box1 is read by process_input().
CTkLabel(master=frame, text="  Text:", text_color="#601E88", anchor="w", justify="left", font=("Arial Bold", 14), image=txt_icon, compound="left").pack(anchor="w", pady=(38, 0), padx=(25, 0))
text_box1 = CTkTextbox(master=frame, width=400, height=200, fg_color="#EEEEEE", border_color="#601E88", border_width=1, text_color="#000000")
text_box1.pack(anchor="w", padx=(25, 0))

# Output area: text_box2 is overwritten by process_input() with the summary.
CTkLabel(master=frame, text="  Summary:", text_color="#601E88", anchor="w", justify="left", font=("Arial Bold", 14), compound="left").pack(anchor="w", pady=(21, 0), padx=(25, 0))
text_box2 = CTkTextbox(master=frame, width=400, height=100, fg_color="#EEEEEE", border_color="#601E88", border_width=1, text_color="#000000")
text_box2.pack(anchor="w", padx=(25, 0))

# Clicking the button calls process_input().
CTkButton(master=frame, text="click here!", fg_color="#601E88", hover_color="#E44982", font=("Arial Bold", 12), text_color="#ffffff", width=225, command = process_input).pack(anchor="center", pady=(40, 0), padx=(25, 0))


# Start the Tk event loop.
app.mainloop()