# New approach, find lines with correct format (with thanks to David Kane at https://github.com/drkane)

Find correct sheets by finding those titled "Balance Sheet"

Exclude a bunch of false positives by filtering out "Statement of" and "Note to the financial statements"

Detect aligned elements by whitespace

Within the concatenated elements, detect values by regex

If needed, complete a multi-line label by finding the correct aggregated sentence in the OCR blocks / paragraphs ("pars")

In [1]:
import numpy as np
import pandas as pd
import os
import re
import importlib

import xbrl_image_parser as xip

## 0.  Example usage, calling the master function from the library
The rest of the notebook goes through details of what it's doing...

In [6]:
# Load one company's compiled OCR output and hand it to the library's master
# function; the trailing bare name displays the table it returns.
test = pd.read_csv("./working/ocr_output_compiled/00002404.csv")
results = xip.process_OCR_csv(test)
results

Unnamed: 0,CurrYr,LastYr,label,source
0,180,137,Intangible assets,Intangible assets 12 180 137
1,28271,28681,Tangible assets,"Tangible assets 13 28,271 28,681"
2,446,405,Stocks,Stocks 15 446 405
3,23174,11926,Debtors: amounts falling due within one year,Debtors: amounts falling due within one year 1...
4,10208,10565,Cash at bank and in hand,"Cash at bank and in hand 10,208 10,565"
5,"(10,936)","(7,527)",Creditors: amounts falling due within one year,Creditors: amounts falling due within one year...
6,22892,15369,Net current assets,"Net current assets 22,892 15,369"
7,51346,44190,Total assets less current liabilities,"Total assets less current liabilities 51,346 4..."
8,"(2,620)","(3,071)","""Creditors: amounts falling due after more tha...","than one year 18 (2,620) (3,071)"
9,(18),(18),Pensions,Pensions (18) (18)


## 1. Get the filenames of the example data for my convenience

In [2]:
# Get the base names (extension stripped) of every compiled OCR csv file in
# "./working/ocr_output_compiled".  Match on the actual extension rather than a
# ".csv" substring, and strip it with splitext so a name containing extra dots
# is not cut off at the first one.
# NOTE: os.listdir returns entries in arbitrary order, so the position of a
# given file in this list can differ between machines.
files = [os.path.splitext(filename)[0]
         for filename in os.listdir("./working/ocr_output_compiled")
         if filename.endswith(".csv")]

In [3]:
# Show each file's position next to its name
for position, name in enumerate(files):
    print(position, name)

0 00002404
1 868273
2 1983517
3 2765595
4 03293902
5 02959325
6 00542515
7 01539777
8 02714555
9 00030177
10 04802747
11 02266230
12 00983951
13 01002610
14 01804186
15 5508774
16 02430955
17 00053475
18 02245999
19 00553535
20 3387163
21 01337451
22 3459907
23 00178090
24 00468115
25 01369166
26 00782931
27 09457025
28 983951
29 01370175
30 06005142
31 04860660
32 2303730
33 02582534
34 00477955
35 04558828
36 06034603
37 3824626


## 2. Read in a csv file of data extracted from a PDF

In [4]:
# Choose which file to work on, by its position in `files`
index = 0

# Build the path to that file's compiled OCR csv and load it
csv_path = "./working/ocr_output_compiled/" + files[index] + ".csv"
test = pd.read_csv(csv_path)

In [None]:
# Convert the OCR'd text to numeric values where possible, kept in a new column
numeric_values = xip.convert_to_numeric(test['text'])
test['numerical'] = numeric_values

# Add geometric measurements (e.g. explicit bottom-bounding coordinates)
test = xip.make_measurements(test)

In [None]:
# Inspect the first page of the balance sheet.  Note that the first element is
# a bounding box for the entire page.
page_mask = test['csv_num'] == 14
columns_to_show = ['top', 'bottom', 'text', 'level', 'conf']
test.loc[page_mask, columns_to_show]

## 3. Identify balance sheet pages

In [None]:
# Ask the library which pages are balance sheets; the values returned are
# compared against the 'csv_num' column when the pages are extracted below.
csv_numbers = xip.find_balance_sheet_pages(test)
csv_numbers

## 4.  Find lines on the balance sheet

In [None]:
# Extract the balance-sheet lines from every detected page.  Collect the
# per-page frames in a list and concatenate once: DataFrame.append was removed
# in pandas 2.0, and growing a frame inside a loop re-copies it on every pass.
page_results = []

for csv_number in csv_numbers:
    page_df = test[test['csv_num']==csv_number]

    page_results.append( xip.extract_lines(page_df, xip.detect_lines(page_df)) )

# pd.concat raises on an empty list, so fall back to an empty frame when no
# balance sheet pages were found.
results = pd.concat(page_results) if page_results else pd.DataFrame()

In [None]:
# Display the lines extracted from the balance sheet pages
results

In [None]:
# Show which file is currently selected by `index`
print(files[index])

## 5. Quick-fixes for weird problems

These'll no doubt accumulate

In [None]:
# a)  Drop any row whose label is just "Note"/"Notes"; it's been read from the headers by accident
is_note_header = results['label'].isin(["Notes", "notes", "note", "Note"])
results = results[~is_note_header]