In [12]:
import pandas as pd
from difflib import SequenceMatcher
import os
import re
import fitz
import io
from PIL import Image, ImageDraw

In [13]:
# Column names applied to the header-less per-document CSVs that are read from
# PARSED_PDFS_DIR (one row per recognised word, with its bounding box).
DATABASE_FIELDS = [
	'year', 'tif_number',
	'page_num', 'block_num', 'par_num', 'line_num', 'word_num',
	'left', 'top', 'width', 'height',
	'conf', 'text',
]

# CSV describing, per year/TIF pair, which pages were processed successfully.
COMPLETION_STATUS_CSV = 'butter_json.csv'

# Folder holding one `<year>_<tif_number>.csv` file per parsed PDF.
PARSED_PDFS_DIR = '../parsed_pdfs'

In [14]:
# Load the completion status of the files and their pages. The cells below read
# the `id`, `year`, `tif_number`, `successful` and `page_list` columns from it.
completion_csv = pd.read_csv(COMPLETION_STATUS_CSV)

In [15]:
# Drop the `id` column by rebinding instead of mutating in place, and tolerate
# the column already being gone, so re-running this cell does not raise KeyError.
completion_csv = completion_csv.drop(columns=['id'], errors='ignore')

In [16]:
# Build a lookup keyed by "<year>_<tif_number>" recording, for every document,
# the list of successfully processed pages and the list of failed pages.
page_status = {}
for _, record in completion_csv.iterrows():

	pair_key = f"{record['year']}_{record['tif_number']}"

	# First row seen for this pair: create its entry with empty page lists.
	entry = page_status.get(pair_key)
	if entry is None:
		entry = {
			'successful': [],
			'failed': [],
			'year': record['year'],
			'tif_number': record['tif_number'],
		}
		page_status[pair_key] = entry

	# Each row carries the page list for either the successful or the failed bucket.
	bucket = 'successful' if record['successful'] == 1 else 'failed'
	page_list = record['page_list']

	# Non-string values (e.g. NaN for an empty cell) leave the bucket untouched.
	if isinstance(page_list, str):
		entry[bucket] = page_list.split(',')
	

In [17]:
# Sanity check: every pair recorded in page_status should have a matching CSV
# in the parsed_pdfs folder. Any pair without one is printed.
for pair in page_status:
	expected_csv = os.path.join(PARSED_PDFS_DIR, f'{pair}.csv')
	if os.path.isfile(expected_csv):
		continue
	print('AHHGHELKJOOOOOO NOOOOOO', pair)

In [18]:
# We want to loop through each year_tif pair and scan through all of the text in
# each document.

# Realistically, though, we only need to look at the top of each page (the search
# below uses the top 550px and widens to 750px as a fallback).

# Run the search query on that top section; when a page matches, pull the text
# for the rest of that page.

In [19]:
# Minimum SequenceMatcher ratio for an OCR'd word to count as a keyword match.
SIMILARITY_THRESHOLD = 0.7

def is_finance(query_vector, log=False):
	"""Return True when the words contain fuzzy matches for every finance keyword.

	Each word is compared (case-insensitively) against the keywords
	'expenditures', 'balance' and 'revenue' using difflib.SequenceMatcher; a
	ratio strictly above SIMILARITY_THRESHOLD counts as a match. A single word
	can satisfy at most one keyword (the first one it matches).

	Args:
		query_vector: iterable of tokens. Non-string tokens (for example the
			NaN floats pandas produces for empty cells) are ignored instead of
			raising AttributeError.
		log: when True, print the joined tokens and the resulting flags.

	Returns:
		bool: True only if all keywords were matched by some word.
	"""
	flags_to_watch = {
		# 'combined': False,
		'expenditures': False,
		'balance': False,
		'revenue': False,
		# 'schedule': False
	}

	# Loop over all text in the query bag and compare it to our dict
	for word in query_vector:
		# NaN / numeric cells can never match a keyword and have no .lower()
		if not isinstance(word, str):
			continue

		lowered = word.lower()
		for buzz_word in flags_to_watch:
			matcher = SequenceMatcher(None, lowered, buzz_word.lower())

			if matcher.ratio() > SIMILARITY_THRESHOLD:
				flags_to_watch[buzz_word] = True
				# A word counts towards one keyword only
				break

	if log:
		# str() keeps the log usable even when non-string tokens are present
		print(' '.join(str(word) for word in query_vector))
		print(flags_to_watch)

	# If not all the flags are met return False
	return all(flags_to_watch.values())
	

In [20]:
# Phrases stating that a report has no tax increment expenditures to list.
IGNORE_STRING_1 = 'no tax increment project expenditures' # First seen 1997
IGNORE_STRING_2 = 'no tax increment expenditures within the project area' # First seen 1998_4
IGNORE_STRING_3 = 'no tax increment expenditures or cumulative deposits over' # First seen 2002_10

def is_ignored(query_vector):
	"""Return True when the joined, lower-cased tokens contain an ignore phrase.

	Args:
		query_vector: iterable of tokens in reading order. None/NaN tokens are
			dropped and other non-string tokens are converted with str(), so
			values pandas may hand back for empty or numeric cells no longer
			make the join raise TypeError.

	Returns:
		bool: True if any of IGNORE_STRING_1..3 occurs in the joined text.
	"""
	words = []
	for token in query_vector:
		if isinstance(token, str):
			words.append(token)
		elif token is not None and token == token:
			# `token == token` is False only for NaN; keep real numbers as text
			words.append(str(token))

	doc_string = ' '.join(words).lower()

	# The phrases are plain literals, so a substring test is all that is needed
	ignore_phrases = (IGNORE_STRING_1, IGNORE_STRING_2, IGNORE_STRING_3)
	return any(phrase in doc_string for phrase in ignore_phrases)

In [21]:
# Output folders for the rendered page images. exist_ok=True replaces the
# separate existence check, so the cell is safe to re-run and has no
# check-then-create race.
for output_dir in ('rect_table_images', 'table_images'):
	os.makedirs(output_dir, exist_ok=True)

In [22]:

# Counter that a later cell displays. NOTE(review): nothing in the visible
# cells increments it.
pages_we_skip = 0

# Year/TIF pairs that find_stuff() rejects up front: it returns False for any
# pair listed here. The per-entry notes record why each one was excluded.
SKIP_LIST = [
	'1998_29',	# This document seems to just be the same thing
				# repeated twice and just has estimated costs
	'1998_43',	# Same thing as 1998_29. They didn't include the report
	'2008_162',	# There is no report but the field is "increment.expenditures"
				# as opposed to "increment expenditures" so it doesn't get
				# caught in the ignore string
	'2010_132',	# Nothing seems to have happened in this TIF this year, but
				# it is not filled out in a conventional way
	'2010_143',	# ditto
	'2010_170',	# ditto ditto, information not present
	'2010_173', # ditto ditto ditto
	'2010_168', # ditto...
	'2010_171',	# ditto. No deposits >= 100_000
	'2011_159', # Nothing over 100_000,
	'2011_162',
	'2011_168',
	'2011_170',
	'2011_173',
	'2011_174',
	'2012_132',
	'2012_168',
	'2012_170',
	'2012_173',
	'2012_174',
	'2012_175',
	'2013_168',
	'2013_170',
	'2013_173',
	'2013_174',
	'2013_175',
	'2014_168',
	'2014_170',
	'2014_173',
	'2014_174',
	'2014_175',
	'2014_176',
	'2014_177',
	'2014_178',
	'2015_168',
	'2015_170',
	'2015_174',
	'2015_175',
	'2015_176',
	'2016_168',
	'2016_170',
	'2016_175',
	'2016_179',
	'2016_180',
	'2017_170',
	'2018_182',
	'2018_181',
	'2019_183',
	'2019_184',
	'2022_186',
	'2014_162',
]

# Manually determined page number for pairs where the automatic search picks
# the wrong page or none at all. For a pair listed here find_stuff() skips the
# search and returns this page together with that page's rows.
MANUAL_CORRECTIONS = {
	'1998_31': 24,	# Line through top of page disrupts recognition
	'1998_37': 90,	# The word 'revenues' was not scraped from the pdf
	'1999_1': 11,	# ditto
	'2000_1': 10,	# Top section skipped
	'2000_3': 10,	# ditto
	'2000_2': 9,	# ditto ditto
	'2000_4': 10,	# ditto ditto ditto
	'1998_3': 18,	# Two matches. Second match is 1997 report
	'2007_2': 14,	# Two matches. Second is only Governmental funds
	'2007_4': 14,	# ditto
	'2007_1': 14,
	'2007_3': 14,
	'2007_6': 14,
}

# Pairs whose tables were filled in by hand. find_stuff() only tests membership
# (it returns False for any key listed here); the values are not read by the
# visible code. Presumably each value is the page holding the table -- confirm.
HAND_FILLED = {
	'1998_44': 51,	# This report is a mess and frankly I'm not sure
					# if this is even right
	'1999_3': 23,	# The table was scanned, put in a field, then the
					# page was scanned
	'1999_4': 11,	# A lot of words got missed. Flagging this because
					# a lot of numbers were messed up during
					# pre-processing too
	'1999_6': 11,	# Top section was missed. Flagged because numbers bad
	'2020_182': 5,	# Someone at the council is just lazy. Not formatted
	'2010_162': 7,	# Similar to 2010_143 and 132. The information does
					# seem to be there though
	'2010_159': 7,	# ditto
	'2012_159': 7,	# No exchanges >= 100_000 but data still there
	'2012_162': 7,	# ditto
	'2013_159': 7,
	'2013_162': 7,
	'2014_159': 5,
	'2015_178': 6,
	'2019_182': 5,
	'2003_14': 120,	# So many matches. This appears to be in different
					# format from previous 2 pages though.
	# NOTE(review): '20016_14' has a five-digit year, so it can never equal a
	# "<year>_<tif_number>" pair and this entry is currently inert. It looks
	# like a typo (possibly '2016_14') -- confirm the intended pair and fix it.
	'20016_14': [13, 14], # It spans 2 pages

}

# NOTE(review): neither of these names is referenced again in the visible
# cells. Presumably they were meant for tracking completed pairs -- confirm
# before relying on or removing them.
finished_file = 'finished.txt'

finished_pairs = []


# Vertical cut-offs (in the CSV's `top` coordinate) for the header search.
_PRIMARY_HEADER_PX = 550
_FALLBACK_HEADER_PX = 750

# Report years that get special handling in find_stuff.
_DOUBLE_MATCH_YEARS = ('2007', '2008', '2009')
_EARLY_FORMAT_YEARS = ('1999', '2000', '2001')


def _header_words(tif_text, max_top):
	"""Return the rows whose `top` is at most `max_top`, sorted into reading order."""
	section = tif_text[tif_text['top'] <= max_top]
	# sort_values returns a new frame; the result has to be kept for the sort
	# to have any effect
	return section.sort_values(['page_num', 'block_num', 'line_num', 'word_num'])


def _finance_pages(section):
	"""Return the page numbers in `section` whose words satisfy is_finance."""
	return [
		page
		for page in section['page_num'].unique()
		if is_finance(section.loc[section['page_num'] == page, 'text'].to_list())
	]


def _page_result(tif_text, page):
	"""Build the `[page_number, rows_of_that_page]` value find_stuff returns."""
	return [page, tif_text[tif_text['page_num'] == int(page)]]


def find_stuff(pair):
	"""Locate the page of a parsed document that holds the finance statement.

	The search runs in stages:
	  1. Pairs in SKIP_LIST or HAND_FILLED are rejected.
	  2. Pairs in MANUAL_CORRECTIONS use the recorded page directly.
	  3. The top 550px of every page is tested with is_finance. Exactly one
	     match wins; for 2007-2009 documents with exactly two matches the
	     first is used; any other multiple match is rejected.
	  4. If nothing matched, the successfully parsed pages are searched for an
	     "ignore" statement (is_ignored). A hit rejects the document, except
	     for 1999-2001 documents, where the search simply moves on.
	  5. The header search is repeated on the top 750px and the first match,
	     if any, is used.

	Args:
		pair: document key of the form "<year>_<tif_number>"; the matching CSV
			is read from PARSED_PDFS_DIR.

	Returns:
		False when no page should be rendered, otherwise a two-element list
		`[page_number, DataFrame of every row on that page]`.
	"""
	if pair in SKIP_LIST or pair in HAND_FILLED:
		return False

	year = pair[0:4]

	# Load in the associated TIF csv
	csv_path = os.path.join(PARSED_PDFS_DIR, f'{pair}.csv')
	tif_text = pd.read_csv(csv_path, header=None, names=DATABASE_FIELDS)

	if pair in MANUAL_CORRECTIONS:
		return _page_result(tif_text, MANUAL_CORRECTIONS[pair])

	# Stage 3: look for the statement header in the top of each page
	matched_pages = _finance_pages(_header_words(tif_text, _PRIMARY_HEADER_PX))

	# These all have exactly 2 matches (when not 0)
	if year in _DOUBLE_MATCH_YEARS and len(matched_pages) == 2:
		matched_pages = [matched_pages[0]]
		print('Corrected', matched_pages)
		return _page_result(tif_text, matched_pages[0])

	if len(matched_pages) > 1:
		print('Too many pages')
		return False

	if matched_pages:
		return _page_result(tif_text, matched_pages[0])

	# Stage 4: check the contents of every successfully parsed page for one of
	# the "no tax increment ... expenditures" statements handled by is_ignored
	for page in page_status[pair]['successful']:
		page_words = tif_text.loc[tif_text['page_num'] == int(page), 'text']

		if is_ignored(page_words.to_list()):
			print(f'gotta killer. Page {page}')
			# 1999-2001 reports are a different breed: a hit does not settle
			# the document, so keep scanning and fall through to stage 5
			if year in _EARLY_FORMAT_YEARS:
				continue
			return False

	# Stage 5: the statement of revenues might be lower down on the page. Not
	# all reports actually follow the same format in a year so a document like
	# this needs to be flagged for manual review.
	wider_section = _header_words(tif_text, _FALLBACK_HEADER_PX)
	pages = wider_section['page_num'].unique()
	matched_pages = _finance_pages(wider_section)

	if len(matched_pages) > 1:
		print('hey too many!')

	if matched_pages:
		print(pair, matched_pages)
		return _page_result(tif_text, matched_pages[0])

	if year in _EARLY_FORMAT_YEARS:
		return False

	print(pages)

	print(pair)

	print("Couldn't resolve")

	# Return False explicitly: callers test `poi is False`, and an implicit
	# None would be indexed as if it were a result
	return False


# For every document that does not yet have an image in table_images, find the
# page of interest, render it at 300 dpi and save it twice: once as rendered
# and once with a red box drawn around each recognised word.
for pair in page_status:

	print(pair)
	# Already rendered on a previous run
	if os.path.exists(os.path.join('table_images', f'{pair}.png')):
		continue

	poi = find_stuff(pair)
	# print(pair, 'found', poi)

	# find_stuff reports "nothing to render" with False, but it can also fall
	# off its end and return None; treat both the same instead of indexing None
	if not poi:
		continue

	print(poi[0])

	# Validate the result shape before doing any PDF work
	if len(poi) != 2:
		print(f'{pair} didn\'t return correctly')
		continue

	page_num, page_words = poi

	# Load pdf
	with fitz.open(os.path.join('TIFpdfs', f'{pair}.pdf')) as pdf:

		page = pdf.load_page(int(page_num))

		pixmap = page.get_pixmap(dpi=300)
		image = Image.frombytes('RGB', [pixmap.width, pixmap.height], pixmap.samples)

		# NOTE(review): the un-annotated render is written to rect_table_images
		# and the one with rectangles to table_images -- the folder names look
		# swapped; confirm this is intended before changing it.
		image.save(os.path.join('rect_table_images', f'{pair}.png'), format="PNG")

		rect_image = ImageDraw.Draw(image)

		# Outline every word on the page with a 5px margin around its box
		for _, word in page_words.iterrows():
			rect = (
				word['left'] - 5,
				word['top'] - 5,
				word['left'] + word['width'] + 5,
				word['top'] + word['height'] + 5,
			)
			rect_image.rectangle(rect, outline='red', width=4)

		image.save(os.path.join('table_images', f'{pair}.png'), format="PNG")

1997_42
1997_1
1997_2
1997_3
1997_4
1997_5
gotta killer. Page 11
1997_6
gotta killer. Page 11
1997_7
gotta killer. Page 11
1997_8
1997_9
1997_10
gotta killer. Page 10
1997_11
gotta killer. Page 10
1997_12
gotta killer. Page 11
1997_15
1997_17
1997_19
1997_13
1997_20
1997_23
1997_25
1997_14
1997_22
1997_26
1997_27
1997_29
1997_28
1997_30
gotta killer. Page 10
1997_32
1997_36
1997_37
1997_40
1997_39
1997_35
gotta killer. Page 10
1997_41
1997_45
1997_46
1997_47
1997_43
1997_44
1998_1
1998_2
1998_4
1998_6
gotta killer. Page 12
1998_7
gotta killer. Page 12
1998_5
gotta killer. Page 9
1998_8
1998_10
gotta killer. Page 12
1998_9
1998_11
1998_12
gotta killer. Page 12
1998_13
gotta killer. Page 12
1998_15
gotta killer. Page 12
1998_16
1998_19
1998_20
1998_22
gotta killer. Page 12
1998_14
1998_23
gotta killer. Page 12
1998_25
1998_24
gotta killer. Page 19
1998_26
gotta killer. Page 12
1998_28
gotta killer. Page 12
1998_27
gotta killer. Page 34
1998_29
1998_31
1998_34
gotta killer. Page 11
1998_3

In [23]:
# Display the skip counter. NOTE(review): no visible cell increments it, so
# this shows its initial value.
pages_we_skip

0