# New approach, find lines with correct format (with thanks to David Kane at https://github.com/drkane)

Find correct sheets by finding those titled "Balance Sheet"

Exclude a bunch of false positives by filtering out "Statement of" and "Note to the financial statements"

Detect aligned elements by whitespace

Within the concatenated elements of each line, detect values by regex

If needed, complete a multi-line label by finding the matching aggregated sentence in the blocks / paragraphs

In [1]:
import numpy as np
import pandas as pd
import sys
import os
import pytesseract                            # API for letting python interface with Google's tesseract OCR software
import re
import importlib

import xbrl_image_parser as xip



## 1. Get the filenames of the example data for my convenience

In [2]:
# List the base names (extension removed) of every csv file in "./working/ocr_output_compiled".
# endswith() avoids picking up names that merely contain ".csv" (e.g. "x.csv.bak"), and
# splitext() keeps any dots inside the base name so that files[i] + ".csv" is the real file name.
files = [os.path.splitext(filename)[0]
         for filename in os.listdir("./working/ocr_output_compiled")
         if filename.endswith(".csv")]

In [3]:
# Print every file name next to its position in `files`
for position, name in enumerate(files):
    print(position, name)

0 00002404
1 868273
2 1983517
3 2765595
4 03293902
5 02959325
6 00542515
7 01539777
8 02714555
9 00030177
10 04802747
11 02266230
12 00983951
13 01002610
14 01804186
15 5508774
16 02430955
17 00053475
18 02245999
19 00553535
20 3387163
21 01337451
22 3459907
23 00178090
24 00468115
25 01369166
26 00782931
27 09457025
28 983951
29 01370175
30 06005142
31 04860660
32 2303730
33 02582534
34 00477955
35 04558828
36 06034603
37 3824626


## 2. Read in a csv file of data extracted from a PDF

In [5]:
# Reload the helper module so that the current version of its code is used
importlib.reload(xip)

# Position (in `files`) of the document to work on
index = 0

# Load the compiled OCR output for the chosen document
test = pd.read_csv(f"./working/ocr_output_compiled/{files[index]}.csv")

In [9]:
# Create numeric vars where possible
# (stored in a new 'numerical' column, derived from the OCR 'text' column)
test['numerical'] = xip.convert_to_numeric(test['text'])

# Do some geometry (eg; calculate bottom-bounding coordinates explicitly)
# NOTE(review): presumably this is what adds the 'bottom' column used by the
# line-detection code further down - confirm in xbrl_image_parser
test = xip.make_measurements(test)

# Create a table of aggregated, multiline sentences
# NOTE(review): agg_text is not used by any cell visible here; it looks intended
# for completing multi-line labels later - confirm
agg_text = xip.aggregate_sentences_over_lines(test)

In [12]:
# Inspect the rows with csv_num 14 (the first page of the balance sheet).
# Worth noting: the first row is the bounding box of the whole page, not a word.
test.loc[test['csv_num'] == 14, ['top', 'bottom', 'text', 'level', 'conf']]

Unnamed: 0,top,bottom,text,level,conf
4614,0,3509,,1,-1
4615,303,342,,2,-1
4616,303,342,,3,-1
4617,303,342,,4,-1
4618,303,340,Southampton,5,93
4619,304,333,Isle,5,94
4620,304,333,of,5,96
4621,304,341,Wight,5,95
4622,304,333,and,5,96
4623,304,333,South,5,96


## 3. Identify balance sheet pages

## 4.  Find lines on the balance sheet

In [19]:
# First: the line detection algorithm.
# Lifted almost directly from David Kane's work.
def detect_lines(page_df, x_tolerance=0):
    """
    Detect lines of text in the csv of a page, returned by Tesseract.

    A line is a maximal run of consecutive vertical pixel positions, each of
    which falls inside the [top, bottom] span of at least one word.

    Parameters:
        page_df: DataFrame of one page. Reads the 'word_num', 'top' and
            'bottom' columns of the word rows (word_num > 0), and the
            'height' and 'width' values of the FIRST row as page dimensions.
        x_tolerance: currently unused; kept so existing callers keep working.

    Returns:
        A list of dicts with keys "left", "right", "top" and "bottom", one
        per detected line, ordered from the top of the page downwards.
        "left" is always 0 and "right" is always the page width.
    """
    # No rows at all means no page statistics and no lines
    if len(page_df) == 0:
        return []

    words_df = page_df[page_df['word_num'] > 0]
    page_stats = page_df.iloc[0, :]

    # Look the columns up once rather than on every pixel row
    tops = words_df['top']
    bottoms = words_df['bottom']

    # int() so that a height stored as a float (e.g. 3509.0) is still usable
    page_height = int(page_stats['height'])

    row_ranges = []
    range_start = None

    # Iterate through every vertical pixel position, top (0) to bottom (height)
    for i in range(page_height):
        occupied = bool(((bottoms >= i) & (tops <= i)).any())

        if occupied:
            # Entering (or continuing) a run of pixel rows aligned with words
            if range_start is None:
                range_start = i
        elif range_start is not None:
            # Just passed out of an occupied run: store it
            row_ranges.append((range_start, i - 1))
            range_start = None

    # A run that reaches the last pixel row never meets an empty row, so it
    # has to be stored here; otherwise the final line of the page is lost
    if range_start is not None:
        row_ranges.append((range_start, page_height - 1))

    # Create bounding boxes for convenience
    return [{"left": 0, "right": page_stats['width'], "top": top, "bottom": bottom}
            for top, bottom in row_ranges]

In [188]:
def extract_lines(page_df, lines):
    """
    Extract "label, value, value" rows from the detected lines of a page.

    For every line, the words lying entirely within the line's vertical
    bounds are joined with single spaces. A line is kept when its text ends
    in two numeric tokens (digits and commas, optionally with a leading
    minus and/or wrapped in brackets); everything before them, with digits
    removed, becomes the label.

    Parameters:
        page_df: DataFrame of one page. Reads the 'word_num', 'top',
            'bottom' and 'text' columns of the word rows (word_num > 0).
        lines: iterable of dicts with at least "top" and "bottom" keys, as
            returned by detect_lines.

    Returns:
        A DataFrame with columns "CurrYr", "LastYr", "label" and "source"
        (the full text of the line), one row per matching line, in the
        order of `lines`. Values are kept as strings. The frame is empty,
        but still has these columns, when nothing matches.
    """
    # Label, then the two right-most numbers on the line
    finance_regex = re.compile(r'(.*)\s+(\(?\-?[\,0-9]+\)?)\s+(\(?\-?[\,0-9]+\)?)$')

    words_df = page_df[page_df['word_num'] > 0]
    tops = words_df['top']
    bottoms = words_df['bottom']

    # Rows are collected in a list and the DataFrame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame each call
    records = []
    for line in lines:

        # Retrieve all text in line; missing text is skipped and non-string
        # text is converted, so that str.join cannot fail on it
        inline = (bottoms <= line['bottom']) & (tops >= line['top'])
        line_text = " ".join(words_df.loc[inline, 'text'].dropna().astype(str))

        match = finance_regex.match(line_text)
        if match is None:
            continue

        label, curr_yr, last_yr = match.groups()
        records.append({
            # Dropping digits removes things like note numbers from the label
            "label": re.sub("[0-9]", "", label).strip(),
            "CurrYr": curr_yr,
            "LastYr": last_yr,
            "source": line_text,
        })

    # Column order matches what older pandas produced when appending dicts
    return pd.DataFrame(records, columns=["CurrYr", "LastYr", "label", "source"])

In [189]:
# Detect the text lines on the page, then keep those that parse as a label
# followed by two values; the bare `lines` displays the resulting table.
# NOTE(review): page_df is not defined in any cell visible here - presumably
# the rows of `test` for a single balance sheet page; confirm.
lines = extract_lines(page_df, detect_lines(page_df))
lines

Unnamed: 0,CurrYr,LastYr,label,source
0,180,137,Intangible assets,Intangible assets 12 180 137
1,28271,28681,Tangible assets,"Tangible assets 13 28,271 28,681"
2,446,405,Stocks,Stocks 15 446 405
3,23174,11926,Debtors: amounts falling due within one year,Debtors: amounts falling due within one year 1...
4,10208,10565,Cash at bank and in hand,"Cash at bank and in hand 10,208 10,565"
5,"(10,936)","(7,527)",Creditors: amounts falling due within one year,Creditors: amounts falling due within one year...
6,22892,15369,Net current assets,"Net current assets 22,892 15,369"
7,51346,44190,Total assets less current liabilities,"Total assets less current liabilities 51,346 4..."
8,"(2,620)","(3,071)",than one year,"than one year 18 (2,620) (3,071)"
9,(18),(18),Pensions,Pensions (18) (18)


In [None]:
# Next steps:
# - Fill in the missing label text for multi-line labels (e.g. the label
#   truncated to "than one year" in the table above)