# Importing Libraries

In [1]:
from tkinter import *
from tkinter.font import Font
from tkinter.ttk import Combobox
from tkinter import filedialog
from PIL import ImageTk,Image
from tkinter import messagebox
import openpyxl as op
import os
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from nltk.tokenize import RegexpTokenizer
from tkinter import scrolledtext
import PyPDF2
from sklearn.decomposition import LatentDirichletAllocation

# Load spaCy's large English pipeline once at start-up; summary_window() calls
# it to split the cleaned text into sentences.
nlp = spacy.load('en_core_web_lg')

# Tkinter window

In [2]:
# Main application window: at least 1366x750 pixels, titled 'Text Summarisation'.
root = Tk()
root.minsize(width=1366, height=750)
root.title('Text Summarisation')

# Bold 10-point font passed to every LabelFrame built in this notebook.
frame_font = Font(family='OpenSymbol', weight='bold', size=10)

# Integer Tk variable (not referenced by any other cell shown here).
var = IntVar()

# Background Image

In [3]:
# Background image for the main window.
# The path is specific to one machine, so the application must still start
# (with a plain canvas) when the file is missing or cannot be decoded.
BACKGROUND_IMAGE_PATH = r"D:\Pictures\841139.jpg"

# The canvas fills the whole root window and sits behind the frames.
canvas = Canvas(root)
canvas.place(relwidth=1, relheight=1)

try:
    img = Image.open(BACKGROUND_IMAGE_PATH)
except OSError:
    # Covers a missing file as well as a file Pillow cannot identify.
    img = new_img = photo = None
else:
    # Scale to the minimum window size configured on the root window.
    new_img = img.resize((1366, 750))
    photo = ImageTk.PhotoImage(new_img)
    canvas.create_image(0, 0, image=photo, anchor=NW)



1

# Function to select a PDF and display its file name
* This function is the callback of the "Upload PDF" button: it opens a file dialog for choosing a PDF file
* After a file is chosen, its path is stored and the file name is displayed as a label just above the "Upload PDF" button
* "count" is a flag (0 or 1) that tells the rest of the program whether a PDF has been uploaded
* If "Upload PDF" is clicked more than once, the previous file-name label is destroyed, which prevents labels from being drawn on top of one another

In [4]:
# 0 until a PDF has been chosen, 1 afterwards; read by get_summary() and clear().
count = 0


def pdf_file():
    """Let the user pick a PDF and show its file name above the upload button.

    Callback of the "Upload PDF" button. On success it updates three globals:
    ``frame2_filename_path`` (full path of the chosen file), ``file_label``
    (the Label showing the base name inside ``frame3``) and ``count`` (set to 1).

    If the dialog is cancelled nothing changes: the previous selection and its
    label stay in place, and ``count`` is not set, so get_summary() never tries
    to open an empty path.
    """
    global file_label, count, frame2_filename_path

    selected_path = filedialog.askopenfilename(
        initialdir='D:/',
        title="Select PDF",
        filetypes=(("PDF files", "*.pdf"), ("All files", "*.*")),
    )
    if not selected_path:
        # Dialog was dismissed without choosing a file.
        return

    if count == 1:
        # Remove the label of the previous selection so labels do not overlap.
        file_label.destroy()

    frame2_filename_path = selected_path
    file_label = Label(frame3, text='\t' + os.path.basename(frame2_filename_path))
    file_label.grid(row=0, column=1, sticky=W)
    count = 1
    

# Function to display the potential topics and the summary 
## Topic modeling
* The text data, entered either as text or as a PDF, is converted to a Document-Term Matrix (DTM)
* The DTM is then passed to LDA to retrieve the 10 most important topic words from the text data


## Summary
* We remove all punctuation by keeping only alphanumeric characters (`\w+`) and full stops (needed to mark the end of sentences) using RegexpTokenizer, and join the result into a new string
* We then apply TfidfVectorizer to build the DTM, which also removes the stop words
* Using the tf-idf value of every word, we score each sentence by summing the tf-idf values of the words it contains, and keep only the highest-scoring sentences for the summary (35% of the total number of sentences)
* Finally, we arrange the selected sentences in the order in which they appear in the original text

In [5]:
def _top_topic_words(data, n_top_words=10):
    """Return the ``n_top_words`` highest-weighted terms of a one-topic LDA fit on ``data``.

    Terms are returned in ascending order of weight. Raises ValueError when
    ``data`` contains no usable words (empty text or stop words only).
    """
    tfidf = TfidfVectorizer(stop_words='english')
    doc_term_matrix = tfidf.fit_transform([data])
    lda = LatentDirichletAllocation(n_components=1, random_state=42)
    lda.fit(doc_term_matrix)
    top_word_indices = lda.components_[0].argsort()[-n_top_words:]
    # Invert the fitted vocabulary (term -> column) rather than calling
    # get_feature_names(), which newer scikit-learn releases no longer provide.
    index_to_term = {int(index): term for term, index in tfidf.vocabulary_.items()}
    return [index_to_term[int(index)] for index in top_word_indices]


def _extractive_summary(data, summary_ratio=0.35):
    """Return the highest-scoring sentences of ``data`` in their original order.

    A sentence's score is the sum of the tf-idf weights of its words; about
    ``summary_ratio`` of the sentences (at least one) are kept. Raises
    ValueError when ``data`` contains no usable words.
    """
    from heapq import nlargest

    # Keep only word characters and trailing full stops, then re-join with spaces.
    tokenizer = RegexpTokenizer(r'\w+\.*')
    doc = nlp(' '.join(tokenizer.tokenize(data)))

    tfidf = TfidfVectorizer(stop_words='english')
    # Densify once; there is a single document, hence a single row.
    weights = tfidf.fit_transform([doc.text]).toarray()[0]
    vocabulary = tfidf.vocabulary_  # term -> column index into `weights`

    sentences = list(doc.sents)
    sentence_scores = {}  # sentence position -> score, only for sentences with a known word
    for position, sent in enumerate(sentences):
        # The vectorizer lowercases its vocabulary, so compare lowercased tokens.
        columns = [vocabulary[token.text.lower()]
                   for token in sent
                   if token.text.lower() in vocabulary]
        if columns:
            sentence_scores[position] = sum(weights[column] for column in columns)

    # At least one sentence, otherwise short texts would yield an empty summary.
    select_length = max(1, int(len(sentences) * summary_ratio))
    chosen = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return [sentences[position].text for position in sorted(chosen)]


def summary_window(data, summary_ratio=0.35, n_top_words=10):
    """Open a window showing the potential topics and an extractive summary of ``data``.

    Parameters
    ----------
    data : str
        Raw text typed by the user or extracted from a PDF.
    summary_ratio : float, optional
        Fraction of sentences kept in the summary (default 0.35).
    n_top_words : int, optional
        Number of topic words displayed; expected to be positive (default 10).

    If the text cannot be processed (for example it is empty or consists only
    of stop words) an error dialog is shown and no window is opened.
    """
    try:
        topic_words = _top_topic_words(data, n_top_words)
        summary_sentences = _extractive_summary(data, summary_ratio)
    except ValueError as error:
        messagebox.showerror("Error", "Could not summarise the text: " + str(error))
        return

    topic_window = Toplevel(root)
    topic_window.geometry("1366x750")

    # 1st frame: potential topics ----------------------------------------------
    topic_frame = LabelFrame(topic_window, font=frame_font, padx=272, pady=20)
    topic_frame.grid(row=0, column=0)
    txt_dta = Label(topic_frame, text='Potential Topics: ', font=("Times", 20), fg='red')
    txt_dta.grid(row=0, column=0, sticky=W)
    topic = Label(topic_frame, text=' '.join(topic_words), font=("Times", 17))
    topic.grid(row=0, column=1, sticky=W)

    # 2nd frame: summary -------------------------------------------------------
    summary_frame = LabelFrame(topic_window, font=frame_font, padx=217, pady=20)
    summary_frame.grid(row=1, column=0)
    sum_lbl = Label(summary_frame, text='Summary:', font=("Times", 20), fg='red')
    sum_lbl.grid(row=0, column=0, sticky=W, ipadx=1)

    summary_data = scrolledtext.ScrolledText(summary_frame, width=100, font=("Times New Roman", 14))
    for sentence in summary_sentences:
        summary_data.insert(INSERT, '* ' + sentence + '\n\n')
    summary_data.grid(row=1, column=0)

    # No topic_window.mainloop() here: this runs inside a button callback of the
    # root window, whose own mainloop() call already dispatches events for
    # Toplevel windows.
    
    
    

# Function to retrieve text data from the text box

In [6]:
def text_summary():
    """Summarise the current contents of the main window's text box."""
    summary_window(text.get("1.0", END))
    


# Function to read text from the PDF

In [7]:
def pdf_summary():
    """Extract the text of every page of the uploaded PDF and summarise it.

    Reads the file at the global ``frame2_filename_path`` (set by pdf_file()),
    concatenates the text of all pages and passes it to summary_window().
    The file is closed before the summary window is built. If the file cannot
    be opened or read, an error dialog is shown instead.
    """
    try:
        with open(frame2_filename_path, 'rb') as pdf_stream:
            if hasattr(PyPDF2, 'PdfReader'):
                # Newer PyPDF2 API; the legacy names below no longer work there.
                pdf_reader = PyPDF2.PdfReader(pdf_stream)
                page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
            else:
                # Legacy PyPDF2 API.
                pdf_reader = PyPDF2.PdfFileReader(pdf_stream)
                page_texts = [pdf_reader.getPage(page_number).extractText() or ''
                              for page_number in range(pdf_reader.numPages)]
    except OSError as error:
        messagebox.showerror("Error", "Could not open the PDF file: " + str(error))
        return

    summary_window(''.join(page_texts))

# Function to get summary 

In [8]:
def get_summary():
    """Summarise the typed text if there is any, otherwise the uploaded PDF.

    Callback of the "Get Summary" button. Typed text takes priority; if the
    text box holds nothing but whitespace and a PDF has been uploaded
    (``count > 0``) the PDF is summarised; otherwise an error dialog asks for
    input.
    """
    # text.get("1.0", END) has length 1 even for an empty box, so test the
    # stripped content instead; this also rejects whitespace-only input.
    typed_text = text.get("1.0", END)
    if typed_text.strip():
        text_summary()
    elif count > 0:
        pdf_summary()
    else:
        messagebox.showerror("Error", "Please enter Text or Upload a PDF")
        
        

# Function to clear the text box and PDF 

In [9]:
def clear():
    """Reset both inputs: forget the uploaded PDF and empty the text box.

    Callback of the "Clear" button. Besides removing the file-name label it
    resets ``count`` and ``frame2_filename_path``, so that a later
    "Get Summary" does not summarise a PDF the user has already cleared.
    """
    global count, frame2_filename_path

    if count > 0:
        file_label.destroy()
        count = 0
        frame2_filename_path = ''

    # Deleting from an already empty text box is harmless, so no length check.
    text.delete("1.0", END)

# Tkinter main window

In [10]:
# Frame 1: free-text input ------------------------------------------------------
frame1 = LabelFrame(root, padx=65, pady=15, font=frame_font)
frame1.grid(column=0, row=0)

# Caption in the left column, four-line text box in the right column.
txt_data = Label(frame1, text='Enter text')
txt_data.grid(column=0, row=0, sticky=W)

text = Text(frame1, width=25, height=4)
text.grid(column=1, row=0)

In [11]:
# Frame 2: "OR" separator between the two input methods -------------------------
frame2 = LabelFrame(root, padx=184, pady=15, font=frame_font)
frame2.grid(column=0, row=1)

or_lbl = Label(frame2, text='OR')
or_lbl.grid(column=0, row=0)

In [12]:
# Frame 3: PDF upload -----------------------------------------------------------
frame3 = LabelFrame(root, padx=78, pady=15, font=frame_font)
frame3.grid(column=0, row=2)

# The button sits in row 1; pdf_file() places the file-name label in row 0.
get_file = Button(frame3, command=pdf_file, text='Upload PDF')
get_file.grid(column=1, row=1, padx=20, pady=10, ipadx=60)

In [13]:
# Frame 4: action buttons -------------------------------------------------------
frame4 = LabelFrame(root, padx=13, pady=15, font=frame_font)
frame4.grid(column=0, row=3)

# "Get Summary" runs get_summary(); "Clear" runs clear().
get_sum = Button(frame4, command=get_summary, text='Get Summary')
get_sum.grid(column=1, row=0, padx=56, pady=10, ipadx=5)

clear_pdf = Button(frame4, command=clear, text='Clear')
clear_pdf.grid(column=2, row=0, padx=56, pady=10, ipadx=5)

In [14]:
# Start the Tk event loop for the main window built in the cells above.
root.mainloop()