## Extract Meta Information from Documents

### Import packages and class

In [1]:
from os import listdir
from os.path import isfile, join
from tabulate import tabulate
from tika import parser 
import pprint

import meta_information_class

### Read the PDF filenames from the data directory and choose a file

In [2]:
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# names are sorted to make the index-based selection below reproducible.
filenames = sorted(f for f in listdir('data') if isfile(join('data', f)))

# Index (into the sorted list of filenames) of the document to analyse.
file_idx = 3
file = 'data/' + filenames[file_idx]
file

'data/04_SOP-Business-Plan.pdf'

### Apply file parser

In [3]:
# Run the Tika parser on the selected file; the result is indexed by key below.
raw = parser.from_file(file)
# Keep the value stored under "content" (presumably the extracted plain text;
# confirm against the tika documentation, and check whether it can be None).
text = raw["content"]

### Define keyword lists

In [4]:
# Keywords handed to the extractor as the "names" list.
keywords_names_list = [
    "procedure",
    "process",
    "sop",
    "policy",
    "manual",
    "step",
]

# Keywords handed to the extractor as the "documents" list.
keywords_documents_list = [
    "purchase",
    "order",
    "form",
    "request",
    "invoice",
    "documents",
    "document",
    "documentation",
]

# Keywords handed to the extractor as the "date" list. The original order is
# kept in case the extractor is order-sensitive.
keywords_date_list = [
    "issued date",
    "issue-date",
    "effective-date",
    "implementation-date",
    "updated",
    "adopted",
    "revised",
    "review date",
    "revision date",
    "version",
    "last revision",
    "issued",
    "effective date",
    "date",
]

### Extract meta information from document

In [5]:
# Build the extractor for the selected file. The three keyword lists are
# passed positionally in the order: names, dates, documents.
information = meta_information_class.extract_meta_information(
    file,
    keywords_names_list,
    keywords_date_list,
    keywords_documents_list,
)

# Two representations of the extraction result, consumed by the output
# cells further down the notebook.
information_dict = information.create_dict()
information_df = information.create_df()

### Output as dictionary

In [6]:
# Pretty-print the result of information.create_dict().
pprint.pprint(information_dict)

{'date': [('date', '2014-02-28')],
 'documents (bigrams)': [('projection form', 12),
                         ('form attachment', 5),
                         ('plan form', 3)],
 'documents (trigrams)': [('flow projection form', 4),
                          ('business plan form', 3),
                          ('cost projection form', 3)],
 'linked processes': ['8. RELATED DOCUMENTS ',
                      '8.1. Sanitation Business Plan Form ',
                      '8.2. Market Analysis Form ',
                      '8.3. Market Segmentation Form ',
                      '8.4. Competitor Analysis Form ',
                      '8.5. Promotion Form ',
                      '8.6. Cost of Goods Manufactured Form ',
                      '8.7. Production Capacity Form ',
                      '8.8. Sales Projection Form ',
                      '8.9. Operational Cost Projection Form ',
                      '8.10. Cash Flow Projection Form ',
                      '8.11. Profit and Loss P

### Output as dataframe

In [7]:
# Render the result of information.create_df() as a 'psql'-style text table,
# using its keys as column headers and omitting the index column.
print(tabulate(information_df, headers='keys', tablefmt='psql', showindex=False))

+--------------------+---------------------------------------+---------+
| meta information   | value                                 |   count |
|--------------------+---------------------------------------+---------|
| name               | STANDARD OPERATING PROCEDURE          |       4 |
| date               | 2014-02-28                            |         |
| documents (2-gram) | projection form                       |      12 |
| documents (2-gram) | form attachment                       |       5 |
| documents (2-gram) | plan form                             |       3 |
| documents (3-gram) | flow projection form                  |       4 |
| documents (3-gram) | business plan form                    |       3 |
| documents (3-gram) | cost projection form                  |       3 |
| linked processes   | 8. RELATED DOCUMENTS                  |         |
| linked processes   | 8.1. Sanitation Business Plan Form    |         |
| linked processes   | 8.2. Market Analysis Form   