In [1]:
import pandas as pd
import json
from difflib import SequenceMatcher
import os

In [10]:
# Column names applied to the header-less parsed-PDF CSV files, in file order.
DATABASE_FIELDS = [
	'year', 'tif_number',
	'page_num', 'block_num', 'par_num', 'line_num', 'word_num',
	'left', 'top', 'width', 'height',
	'conf', 'text',
]

# CSV listing, per file, which pages were processed and whether they succeeded.
COMPLETION_STATUS_CSV = 'butter_json.csv'

# Directory expected to contain one '<year>_<tif_number>.csv' per parsed file.
PARSED_PDFS_DIR = '../parsed_pdfs'

In [3]:
# Read the per-file completion status (which pages succeeded or failed) from disk
completion_csv = pd.read_csv(filepath_or_buffer=COMPLETION_STATUS_CSV)

In [4]:
# Drop the 'id' column, which is not used by the rest of the notebook.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after 'id' is already gone is a no-op
# rather than a KeyError.
completion_csv = completion_csv.drop(columns=['id'], errors='ignore')

In [9]:
# Maps '<year>_<tif_number>' to a record holding the year, the tif number and
# the lists of successful / failed page identifiers (kept as strings).
page_status = {}

for _, status_row in completion_csv.iterrows():
	year = status_row['year']
	tif_number = status_row['tif_number']
	key = f"{year}_{tif_number}"

	# First time this file is seen: start with empty page lists
	if key not in page_status:
		page_status[key] = {
			'successful': [],
			'failed': [],
			'year': year,
			'tif_number': tif_number,
		}

	# A row is either the "successful" or the "failed" listing for its file
	bucket = 'successful' if status_row['successful'] == 1 else 'failed'

	# A non-string page_list (e.g. a missing value) leaves the bucket untouched
	pages = status_row['page_list']
	if isinstance(pages, str):
		page_status[key][bucket] = pages.split(',')
	

In [6]:
# Sanity check: every year_tif pair recorded in page_status should have a
# matching '<pair>.csv' inside PARSED_PDFS_DIR. Any pair without one is printed.
for pair in page_status:
	expected_csv = os.path.join(PARSED_PDFS_DIR, f'{pair}.csv')
	if os.path.isfile(expected_csv):
		continue
	print('AHHGHELKJOOOOOO NOOOOOO', pair)

In [7]:
# We want to loop through each year_tif pair and scan through all of the text in
# each document

# Realistically, though, we only need to look at the top 500px of each page.

# Perform the search query; when we get a match, query the rest of that page.

In [20]:
# Buzz words that suggest a word belongs to a financial heading
FINANCE_LIST = ['EXPENDITURES', 'balance', 'revenue', "SCHEDULE", 'combined']

# A query must be strictly more similar than this to a buzz word to match
SIMILARITY_THRESHOLD = 0.7

def is_finance(query):
	"""Return True if `query` fuzzily matches any word in FINANCE_LIST.

	The comparison is case-insensitive and uses difflib.SequenceMatcher; a
	match requires a similarity ratio strictly greater than
	SIMILARITY_THRESHOLD.

	Args:
		query: The candidate word. Anything that is not a `str` (for example
			the float NaN pandas uses for empty CSV cells, or None) never
			matches.

	Returns:
		bool: True on the first buzz word that is similar enough, else False.
	"""
	# Guard against non-text input instead of failing on `.lower()`
	if not isinstance(query, str):
		return False

	# Lower-case the query once rather than once per buzz word
	needle = query.lower()

	for buzz_word in FINANCE_LIST:
		similarity = SequenceMatcher(None, needle, buzz_word.lower()).ratio()

		if similarity > SIMILARITY_THRESHOLD:
			return True

	# If no match, return false
	return False

In [21]:
# Only words whose 'top' coordinate is below this many pixels (i.e. near the
# top of the page) are scanned for finance buzz words.
HEADER_MAX_TOP_PX = 500

# year_tif pair -> page numbers whose header area contains a finance buzz
# word, in order of first appearance. Kept so the results outlive the printout.
finance_pages = {}

for pair in page_status:

	# Load in the associated TIF csv
	csv_path = os.path.join(PARSED_PDFS_DIR, f'{pair}.csv')
	tif_text = pd.read_csv(csv_path, header=None, names=DATABASE_FIELDS)

	header_words = tif_text[tif_text['top'] < HEADER_MAX_TOP_PX]

	matched_pages = []
	seen_pages = set()  # O(1) membership test alongside the ordered list

	# Iterate over plain Python lists of the two needed columns; this avoids
	# building a Series per row, which is what makes iterrows() so slow.
	page_numbers = header_words['page_num'].tolist()
	words = header_words['text'].tolist()

	for page_num, word in zip(page_numbers, words):
		# One hit per page is enough; skip the fuzzy match for known pages
		if page_num in seen_pages:
			continue

		# Empty cells are read as NaN (a float), so only test real strings
		if isinstance(word, str) and is_finance(word):
			seen_pages.add(page_num)
			matched_pages.append(page_num)

	finance_pages[pair] = matched_pages
	print(pair, matched_pages)

1997_42 []
1997_1 []
1997_2 []
1997_3 []
1997_4 []
1997_5 []
1997_6 []
1997_7 []
1997_8 []
1997_9 []
1997_10 []
1997_11 []
1997_12 []
1997_15 []
1997_17 []
1997_19 []
1997_13 []
1997_20 []
1997_23 []
1997_25 []
1997_14 []
1997_22 []
1997_26 []
1997_27 []
1997_29 []
1997_28 []
1997_30 []
1997_32 []
1997_36 []
1997_37 []
1997_40 []
1997_39 []
1997_35 []
1997_41 []
1997_45 []
1997_46 []
1997_47 []
1997_43 []
1997_44 []
1998_1 []
1998_2 []
1998_4 []
1998_6 []
1998_7 []
1998_5 []
1998_8 []
1998_10 []
1998_9 []
1998_11 []
1998_12 []
1998_13 []
1998_15 []
1998_16 []
1998_19 []
1998_20 []
1998_22 []
1998_14 []
1998_23 []
1998_25 []
1998_24 []
1998_26 []
1998_28 []
1998_27 []
1998_29 []
1998_31 []
1998_34 []
1998_36 []


KeyboardInterrupt: 