In [1]:
from tika import unpack
import spacy
import pandas as pd

# 1. Extracting text 

In [2]:
# Extracting text using tika-python
parsed = unpack.from_file('Sample_document.pdf', 'http://localhost:9998/')
text = parsed["content"]
print(text[:211])


Science

Science (from the Latin word scientia, meaning
"knowledge")[1] is a systematic enterprise that builds and
organizes knowledge in the form of testable explanations
and predictions about the universe.[2]


# 2. Processing the text using spaCy

In [3]:
# Loading the spaCy 'small' model
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)  # applying the spaCy NLP pipeline

# Obtaining entity information
ent_text = []
ent_labels = []
ent_sentences = []
for entity in doc.ents:
    ent_text.append(entity.text)  # the entity's text
    ent_labels.append(entity.label_)  # the type of entity (PERSON, ORG, GPE, etc)
    ent_sentences.append(entity.sent.text.replace('\n',' '))  # some context of the entity

len(ent_labels),len(ent_text),len(ent_sentences),len(doc.ents)

(2696, 2696, 2696, 2696)

# 3. Exporting results to an excel file

In [4]:
#Saving the information as a dataframe
text_data = pd.DataFrame({'Entity': ent_text,'Type': ent_labels,'Context':ent_sentences})
text_data.head()

Unnamed: 0,Entity,Type,Context
0,Latin,NORP,Science Science (from the Latin word scienti...
1,universe.[2][3][4,PRODUCT,"""knowledge"")[1] is a systematic enterprise tha..."
2,Ancient Egypt\n,PERSON,The earliest roots of science can be traced to...
3,Mesopotamia,LOC,The earliest roots of science can be traced to...
4,around 3500 to 3000,CARDINAL,The earliest roots of science can be traced to...


In [5]:
# Getting brief descriptions of each entity type

labels = text_data.Type.unique()
ent_descriptions = [spacy.explain(_label) for _label in labels]
Descriptions = pd.DataFrame({"Type": labels, "Description": ent_descriptions})
Descriptions

Unnamed: 0,Type,Description
0,NORP,Nationalities or religious or political groups
1,PRODUCT,"Objects, vehicles, foods, etc. (not services)"
2,PERSON,"People, including fictional"
3,LOC,"Non-GPE locations, mountain ranges, bodies of ..."
4,CARDINAL,Numerals that do not fall under another type
5,GPE,"Countries, cities, states"
6,DATE,Absolute or relative dates or periods
7,ORG,"Companies, agencies, institutions, etc."
8,ORDINAL,"""first"", ""second"", etc."
9,WORK_OF_ART,"Titles of books, songs, etc."


In [6]:
with pd.ExcelWriter('text-mining.xlsx') as writer:
    Descriptions.to_excel(writer, sheet_name="DEFINITIONS", index=False)
    for ent_name, data in text_data.groupby('Type'):
        data[["Entity","Context"]].to_excel(writer, sheet_name=ent_name, 
                                            index=False)