In [1]:
import os
import re
import pandas as pd
import json

In [2]:
def process_value(value):
    """Normalise one extracted financial value string.

    Parameters
    ----------
    value : str
        Raw token captured from a balance-sheet line, e.g. ``"1,541"``,
        ``"(1,209)"`` or ``"-"``.

    Returns
    -------
    str
        * Empty input is returned unchanged.
        * Input starting with ``"{"`` or ending with ``"}"`` is returned
          unchanged (presumably an already-serialised value -- TODO confirm
          with the caller that produces such strings).
        * Otherwise thousands separators (``,``) are removed, a leading
          ``"("`` turns every ``"("`` into ``"-"`` and a trailing ``")"``
          removes every ``")"``, so ``"(1,209)"`` becomes ``"-1209"``.
    """
    # Guard first: indexing value[0] on an empty string raises IndexError.
    if not value:
        return value
    if value[0] == "{" or value[-1] == "}":
        return value

    value = value.replace(",", "")
    # The string may have become empty if it consisted only of commas.
    if value:
        if value[0] == "(":
            value = value.replace("(", "-")
        if value[-1] == ")":
            value = value.replace(")", "")
    return value

In [15]:
def extract_financial_data(file_path, target_year="2019", verbose=True):
    """Extract balance-sheet labels and their ``target_year`` values from a text file.

    The file is processed in four stages:

    1. read every line and collapse whitespace runs to single spaces;
    2. keep only the lines believed to hold the financial table
       (see ``_select_financial_lines``);
    3. locate the year header line (``[Note|Notes] [YYYY] YYYY``) and keep
       the lines that follow it (see ``_split_at_year_header``);
    4. turn each remaining line into a ``label -> value`` entry
       (see ``_build_financial_record``).

    Parameters
    ----------
    file_path : str
        Path of the text file to parse.
    target_year : str, optional
        Year whose column is extracted. Defaults to ``"2019"``. When this
        year is not present in the header, every value is ``"nan"``.
    verbose : bool, optional
        When true (default) print the header line, the detected years and
        the candidate lines, followed by a blank line.

    Returns
    -------
    str
        JSON object (as a string) mapping label to value. ``"{}"`` when no
        year header is found.
    """
    line_array = _read_normalized_lines(file_path)
    financial_data = _select_financial_lines(line_array)
    str_arr, financial_data = _split_at_year_header(financial_data, verbose)

    if verbose:
        print(str_arr)
        print(financial_data)
        print()

    financial_record = _build_financial_record(financial_data, str_arr, target_year)
    return json.dumps(financial_record)


def _read_normalized_lines(file_path):
    """Return the lines of ``file_path`` with whitespace runs collapsed to one space."""
    # The context manager closes the handle even if reading raises.
    with open(file_path, "r") as f:
        return [" ".join(line.split()) for line in f]


def _select_financial_lines(line_array, max_lines=15):
    """Return the slice of ``line_array`` expected to contain the financial table."""
    # Most files have the financial data confined between two lines that end
    # with a "dd Month YYYY" string; the first such string is the delimiter.
    bounding_regex = re.compile(r'(\d(\d)?\s+)(\w+)\s+\d\d\d\d$')
    bounding_string = ''
    for line in line_array:
        match = bounding_regex.search(line)
        if match:
            bounding_string = match.group()
            break

    financial_data = line_array
    if bounding_string != '':
        # Positions are collected with enumerate: list.index() would return
        # the first position for every identical line and collapse the
        # start/end pair into an empty slice.
        bounding_index = [i for i, l in enumerate(line_array) if bounding_string in l]
        if len(bounding_index) == 1:
            # Only a starting delimiter: keep everything after it.
            financial_data = financial_data[bounding_index[0] + 1:]
        if len(bounding_index) > 1:
            # Start and end delimiters: keep what lies between the first two.
            financial_data = financial_data[bounding_index[0] + 1:bounding_index[1]]
        # A line mentioning "statement", "information" or "notes to the
        # accounts" also ends the financial data.
        for position, line in enumerate(financial_data):
            lowered = line.lower()
            if ("statement" in lowered or "information" in lowered
                    or "notes to the accounts" in lowered):
                financial_data = financial_data[:position]
                break

    # Only the first ``max_lines`` candidate lines are kept.
    return financial_data[:max_lines]


def _split_at_year_header(financial_data, verbose):
    """Find the ``[Note|Notes] [YYYY] YYYY`` header line.

    Returns ``(years, remaining_lines)``. ``years`` is ``[curr_year, prev_year]``
    or ``[curr_year]``; it is empty, and the lines are returned unchanged,
    when no header is found. On a match the header line and the line right
    after it are dropped (presumably a units/currency row -- TODO confirm).
    """
    header_regex = re.compile(r'((Note|Notes)\s)?((\d){4}\s)?(\d){4}$')
    for position, line in enumerate(financial_data):
        match = header_regex.search(line)
        if match:
            if verbose:
                print(line)
            str_arr = match.group().split()
            if str_arr[0] == "Notes" or str_arr[0] == "Note":
                str_arr.pop(0)
            return str_arr, financial_data[position + 2:]
    return [], financial_data


def _escape_currency(label):
    """Replace the currency byte sequences seen in the data with numeric HTML entities."""
    return label.replace("Â£","&#163").replace("Â$","&#36").replace("Â€","&#8364")


def _build_financial_record(financial_data, str_arr, target_year):
    """Map each line to ``label -> value`` using the column of ``target_year``."""
    financial_record = {}

    if len(str_arr) == 2:
        # Header is [curr_year, prev_year]; a line is one of:
        regex_1 = re.compile(r'(.*)\s(\(?[\,\.0-9]+\)?|-)\s(\(?[\,\.0-9]+\)?|-)$')  # 1) string Value1 Value2
        # NOTE(review): unlike the other value patterns this one does not
        # accept "." -- kept as in the original; confirm whether intended.
        regex_2 = re.compile(r'(\(?[\,0-9]+\)?|-)\s(\(?[\,0-9]+\)?|-)$')  # 2) Value1 Value2
        regex_3 = re.compile(r'(.*\s.*)$')  # 3) string
    elif len(str_arr) == 1:
        # Header is [curr_year]; a line is one of:
        regex_1 = re.compile(r'(.*)\s(\(?[\,\.0-9]+\)?|-)$')  # 1) string Value1
        regex_2 = re.compile(r'(\(?[\,\.0-9]+\)?|-)$')  # 2) Value1
        regex_3 = re.compile(r'(.*\s.*)$')  # 3) string
    else:
        # No year header was found: nothing can be extracted.
        return financial_record

    # Column position of the requested year, or -1 when it is absent.
    index = str_arr.index(target_year) if target_year in str_arr else -1

    for line in financial_data:
        label = 'nan'
        value = 'nan'

        match = regex_1.search(line)
        if match:
            label = _escape_currency(match.group(1))
            if index != -1:
                # Group 1 is the label, so value columns start at group 2.
                value = process_value(match.group(index + 2))
        else:
            match = regex_2.search(line)
            if match:
                if index != -1:
                    # No label group here, so value columns start at group 1.
                    value = process_value(match.group(index + 1))
            else:
                match = regex_3.search(line)
                if match:
                    label = _escape_currency(match.group(1))

        # Lines without a label share the key 'nan'; later ones overwrite earlier ones.
        financial_record[str(label)] = str(value)

    return financial_record

In [25]:
def extract_balance_sheets_data(dir_path, limit=100):
    """Run ``extract_financial_data`` on the files of a directory.

    Parameters
    ----------
    dir_path : str
        Directory whose entries are parsed.
    limit : int or None, optional
        Maximum number of files to process (default 100). ``None`` processes
        every file.

    Returns
    -------
    pandas.DataFrame
        One row per processed file with the columns ``"Filename"`` and
        ``"Extracted Values"`` (the JSON string returned by
        ``extract_financial_data``). Progress is also printed per file.
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # selection of the first ``limit`` files reproducible.
    file_names = sorted(os.listdir(dir_path))
    if limit is not None:
        file_names = file_names[:limit]

    # Rows are gathered in a list and converted once, rather than growing
    # a DataFrame inside the loop.
    records = []
    for file in file_names:
        print()
        print(file)
        extracted_dic = extract_financial_data(os.path.join(dir_path, file))
        print(extracted_dic)
        print()
        print("----------------------------------------------------------------------------------------------")
        records.append({"Filename": file, "Extracted Values": extracted_dic})

    return pd.DataFrame(records, columns=["Filename", "Extracted Values"])

In [26]:
#-----------------------------------------------------------------------------------------------------

In [27]:
# Build the dataset path with os.path.join instead of a literal backslash:
# "\H" is an invalid escape sequence in a normal string literal, and the
# joined path also works on non-Windows systems.
dir_path = os.path.join(".", "HCL ML Challenge Dataset")
extract_balance_sheets_data(dir_path)


X8XUG1JD.txt
Notes 2019 2018
['2019', '2018']
['Current Assets 312 1', 'Creditors: amounts falling due within one year (59) -', 'Net current assets (liabilities) 253 1', 'Total assets less current liabilities 253 1', 'Total net assets (liabilities) 253 1', 'Capital and reserves 253 1']

{"Current Assets": "312", "Creditors: amounts falling due within one year": "-59", "Net current assets (liabilities)": "253", "Total assets less current liabilities": "253", "Total net assets (liabilities)": "253", "Capital and reserves": "253"}

----------------------------------------------------------------------------------------------

X8XX000W.txt
2019 2018
['2019', '2018']
['Current assets', 'Cash at bank and in hand 1000 1000', 'Net assets 1000 1000', 'Issued share capital', '1000 Ordinary Shares 1 of Â£ each 1000 1000', 'Total Shareholder funds 1000 1000']

{"Current assets": "nan", "Cash at bank and in hand": "1000", "Net assets": "1000", "Issued share capital": "nan", "1000 Ordinary Shares 1

2020 2019
['2020', '2019']
['Called up share capital not paid 1 1', 'Net assets 1 1', 'Tssued share capital', '1 Ordinary Share 1 of Â£ each 1 1', 'Total Shareholder funds 1 1']

{"Called up share capital not paid": "1", "Net assets": "1", "Tssued share capital": "nan", "1 Ordinary Share 1 of &#163 each": "1", "Total Shareholder funds": "1"}

----------------------------------------------------------------------------------------------

X8XX0BU1.txt
2019 2018
['2019', '2018']
['Current assets', 'Cash at bank and in hand 100 100', 'Net assets 100 100', 'Issued share capital', '100 Ordinary Shares 1 of Â£ each 100 100', 'Total Shareholder funds 100 100']

{"Current assets": "nan", "Cash at bank and in hand": "100", "Net assets": "100", "Issued share capital": "nan", "100 Ordinary Shares 1 of &#163 each": "100", "Total Shareholder funds": "100"}

----------------------------------------------------------------------------------------------

X8XX0C57.txt
2020 2019
['2020', '2019']
['Called

2019 2018
['2019', '2018']
['Current assets', 'Cash at bank and in hand 1 1', 'Net assets 1 1', 'Issued share capital', '1 Ordinary Share 1 of Â£ each 1 1', 'Total Shareholder funds 1 1']

{"Current assets": "nan", "Cash at bank and in hand": "1", "Net assets": "1", "Issued share capital": "nan", "1 Ordinary Share 1 of &#163 each": "1", "Total Shareholder funds": "1"}

----------------------------------------------------------------------------------------------

X8XZIE17.txt
2019 2018
['2019', '2018']
['Called up share capital not paid 100 100', 'Net assets 100 100', 'Tssued share capital', '1 Ordinary Share of Â£ 100 each 100 100', 'Total Shareholder funds 100 100']

{"Called up share capital not paid": "100", "Net assets": "100", "Tssued share capital": "nan", "1 Ordinary Share of &#163 100 each": "100", "Total Shareholder funds": "100"}

----------------------------------------------------------------------------------------------

X8XZIEI1.txt
2019 2018
['2019', '2018']
['Fixed as

2020
['2020']
['Called up share capital not paid 1', 'Net assets 1', 'Tssued share capital', '1 Ordinary Share 1 of Â£ each 1', 'Total Shareholder funds 1']

{"Called up share capital not paid": "nan", "Net assets": "nan", "Tssued share capital": "nan", "1 Ordinary Share 1 of &#163 each": "nan", "Total Shareholder funds": "nan"}

----------------------------------------------------------------------------------------------

X8XZIN35.txt
2019 2018
['2019', '2018']
['Called up share capital not paid 2 2', 'Net assets 2 2', 'Tssued share capital', '2 Ordinary Shares 1 of Â£ each 2 2', 'Total Shareholder funds 2 2']

{"Called up share capital not paid": "2", "Net assets": "2", "Tssued share capital": "nan", "2 Ordinary Shares 1 of &#163 each": "2", "Total Shareholder funds": "2"}

----------------------------------------------------------------------------------------------

X8XZINN6.txt
2019 2018
['2019', '2018']
['Called up share capital not paid 1 1', 'Net assets 1 1', 'Tssued share cap

2019 2018
['2019', '2018']
['Called up share capital not paid 2 2', 'Net assets 2 2', 'Tssued share capital', '2 Ordinary Shares 1 of Â£ each 2 2', 'Total Shareholder funds 2 2']

{"Called up share capital not paid": "2", "Net assets": "2", "Tssued share capital": "nan", "2 Ordinary Shares 1 of &#163 each": "2", "Total Shareholder funds": "2"}

----------------------------------------------------------------------------------------------

X8XZIS4B.txt
Notes 2019
['2019']
['Fixed Assets 960', 'Current Assets 1,541', 'Creditors: amounts falling due within one year (1,209)', 'Net current assets (liabilities) 332', 'Total assets less current liabilities 1,292', 'Total net assets (liabilities) 1,292', 'Capital and reserves 1,292']

{"Fixed Assets": "960", "Current Assets": "1541", "Creditors: amounts falling due within one year": "-1209", "Net current assets (liabilities)": "332", "Total assets less current liabilities": "1292", "Total net assets (liabilities)": "1292", "Capital and reserve

2019 2018
['2019', '2018']
['Current assets', 'Cash at bank and in hand 2 2', 'Net assets 2 2', 'Issued share capital', '2 Ordinary Shares 1 of Â£ each 2 2', 'Total Shareholder funds 2 2']

{"Current assets": "nan", "Cash at bank and in hand": "2", "Net assets": "2", "Issued share capital": "nan", "2 Ordinary Shares 1 of &#163 each": "2", "Total Shareholder funds": "2"}

----------------------------------------------------------------------------------------------

X8Y28A2W.txt
Notes 2019 2018
['2019', '2018']
['Fixed Assets 11,779 13,397', 'Current Assets 43,205 18,499', 'Creditors: amounts falling due within one year (14,407) (22,226)', 'Net current assets (liabilities) 28,798 (3,727)', 'Total assets less current liabilities 40,577 9,670', 'Total net assets (liabilities) 40,577 9,670', 'Capital and reserves 40,577 9,670']

{"Fixed Assets": "11779", "Current Assets": "43205", "Creditors: amounts falling due within one year": "-14407", "Net current assets (liabilities)": "28798", "Tota