In [1]:
import tika
from tika import unpack
import spacy
import pandas as pd

# 1. Extracting text 

In [2]:
# Send the PDF to the Tika server at localhost:9998 via tika-python and
# unpack the response into a dict.
parsed = unpack.from_file(
    'Sample_document.pdf',
    'http://localhost:9998/',
)

# The "content" entry holds the text pulled out of the document.
text = parsed["content"]

# Quick sanity check: show only the opening 211 characters.
print(text[0:211])


Science

Science (from the Latin word scientia, meaning
"knowledge")[1] is a systematic enterprise that builds and
organizes knowledge in the form of testable explanations
and predictions about the universe.[2]


# 2. Processing the text using spaCy

In [3]:
# Load spaCy's 'small' English model.
nlp = spacy.load("en_core_web_sm")

# Run the spaCy NLP pipeline over the extracted text.
doc = nlp(text)

# Build three parallel lists, one item per entity found in the document:
#   ent_text     - the entity's text
#   ent_label    - the type of entity (PERSON, ORG, GPE, etc.)
#   ent_sentence - the sentence containing the entity, with newlines
#                  replaced by spaces, kept as context
ent_text = [span.text for span in doc.ents]
ent_label = [span.label_ for span in doc.ents]
ent_sentence = [span.sent.text.replace('\n', ' ') for span in doc.ents]

# All four counts should agree: one text, label and sentence per entity.
len(ent_label), len(ent_text), len(ent_sentence), len(doc.ents)

(2774, 2774, 2774, 2774)

# 3. Exporting results to an Excel file

In [4]:
# Gather the entity information into a dataframe: one row per entity,
# one column each for its text, its type, and its surrounding sentence.
df = pd.DataFrame(
    data={
        'Entity': ent_text,
        'Type': ent_label,
        'Context': ent_sentence,
    }
)

# Display the first few rows.
df.head()

Unnamed: 0,Entity,Type,Context
0,Latin,NORP,Science Science (from the Latin word scienti...
1,Egypt,GPE,The earliest roots of science can be traced to...
2,Mesopotamia,PRODUCT,The earliest roots of science can be traced to...
3,Greek,NORP,"Their contributions to mathematics, astronomy,..."
4,Empire,GPE,"After the fall of the Western Roman Empire, kn..."


In [5]:
# Split the dataframe by entity type into a list of (type, dataframe) tuples.
entities = [group for group in df.groupby('Type')]

# Write a single workbook with one worksheet per entity type; the sheet is
# named after the type and the dataframe index is left out.
with pd.ExcelWriter('text-mining.xlsx') as writer:
    for sheet_name, rows in entities:
        rows.to_excel(writer, sheet_name=sheet_name, index=False)