# Selecting A Text File

In [None]:
#using the Tkinter graphical user interface (GUI) to select text file
from tkinter import filedialog
from tkinter import *
import os

# Create the Tk root that owns the dialog and hide it straight away, so that
# only the file picker is shown rather than an additional empty window.
root = Tk()
root.withdraw()
cwd = os.getcwd()  # the dialog starts in the current working directory
# The selected path is stored on the root object as `root.filename`.
# NOTE(review): a cancelled dialog is expected to yield an empty value here.
root.filename = filedialog.askopenfilename(
    initialdir=cwd,
    title="Select file",
    filetypes=(("all files", "*.*"), ("plain text", "*.txt"), ('pdf', '*.pdf')),
)


# Extracting Text From The File

In [None]:
#extracting text from the file using Apache Tika
import tika
from tika import unpack
# Tika returns a dict; the extracted text is read from its "content" entry.
parsed = unpack.from_file(root.filename)
# NOTE(review): "content" is presumably missing or None when Tika finds no
# text (e.g. an image-only PDF) -- confirm against the installed tika version.
# Falling back to an empty string keeps `text` a str in every case.
text = parsed.get("content") or ""   #text extracted from the file, as a continuous string
if not text.strip():
    # Make the empty result visible instead of continuing silently.
    print("Warning: no text could be extracted from", root.filename)


# Processing The Text Using SpaCy

In [None]:
import spacy
# Load spaCy's small English pipeline and run it over the extracted text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Build three parallel lists, one item per recognised entity, all in the
# order of doc.ents:
#   ent_text     - the entity's own text
#   ent_label    - the entity type as a string, e.g. PERSON, GPE
#   ent_sentence - the sentence containing the entity, with every newline
#                  replaced by a space so that it reads as a single line
ent_text = [ent.text for ent in doc.ents]
ent_label = [ent.label_ for ent in doc.ents]
ent_sentence = [ent.sent.text.replace('\n', ' ') for ent in doc.ents]

# Sanity check: all four counts should be equal.
len(ent_label), len(ent_text), len(ent_sentence), len(doc.ents)

# Exporting Extracted Information To A File

In [None]:
#Collecting the information in a dataframe
import pandas as pd
# One row per entity; the key order of the mapping fixes the column order
# (Entity, Type, Context).
entity_columns = {'Entity': ent_text, 'Type': ent_label, 'Context': ent_sentence}
df = pd.DataFrame(data=entity_columns)
df.head()  # preview the first rows

In [None]:
#Separating the rows by entity type. This gives a list of (label, label_dataframe) tuples.
entities = list(df.groupby('Type'))

#Exporting the entity info into an excel file, with a sheet for each type:
save_dir = filedialog.askdirectory(title='Select destination directory')  # a GUI to select destination directory
if not save_dir:
    # With an empty selection (presumably what a cancelled dialog yields),
    # appending '/text-mining.xlsx' would target the filesystem root, so stop
    # with a clear message instead.
    raise RuntimeError('No destination directory selected; nothing was exported.')

with pd.ExcelWriter(os.path.join(save_dir, 'text-mining.xlsx')) as writer:
    for label, label_df in entities:
        label_df.to_excel(writer, sheet_name=label, index=False)
    if not entities:
        # No entities were found. NOTE(review): some writer engines appear to
        # refuse to save a workbook without any sheet, so write the (empty)
        # table with its column headers to keep the export valid.
        df.to_excel(writer, sheet_name='Entities', index=False)
