In [None]:
import pandas as pd
from difflib import SequenceMatcher
import os
import re
import fitz
import io
from PIL import Image, ImageDraw

In [None]:
# Column names, in order, applied to the header-less parsed CSVs when they are
# read back in
DATABASE_FIELDS = [
	'year', 'tif_number',
	'page_num', 'block_num', 'par_num', 'line_num', 'word_num',
	'left', 'top', 'width', 'height',
	'conf', 'text',
]

# CSV listing, per document, which pages are recorded as successful or failed
COMPLETION_STATUS_CSV = 'butter_json.csv'

# Folder holding one parsed CSV per year_tif pair
PARSED_PDFS_DIR = '../parsed_pdfs'

In [None]:
# Load the completion-status table for the files and their pages. Later cells
# read its 'year', 'tif_number', 'successful' and 'page_list' columns to group
# each document's page list into 'successful' and 'failed' buckets.
completion_csv = pd.read_csv(COMPLETION_STATUS_CSV)

In [None]:
# Drop the 'id' column by rebinding instead of mutating in place, and ignore a
# missing column, so re-running this cell is a no-op rather than a KeyError
# once 'id' has already been removed.
completion_csv = completion_csv.drop(columns=['id'], errors='ignore')

In [None]:
# Build a lookup keyed by "<year>_<tif_number>" holding the successful and
# failed page lists recorded for each document
page_status = {}
for _, record in completion_csv.iterrows():

	year, tif_number = record['year'], record['tif_number']

	# First time a document is seen: start it with empty page lists
	entry = page_status.setdefault(
		f"{year}_{tif_number}",
		{'successful': [], 'failed': [], 'year': year, 'tif_number': tif_number},
	)

	# Only a string page_list is split; any other value leaves the current
	# list for that bucket untouched
	page_list = record['page_list']
	if isinstance(page_list, str):
		bucket = 'successful' if record['successful'] == 1 else 'failed'
		entry[bucket] = page_list.split(',')
	

In [None]:
# Sanity check: every key in the page_status dictionary should have a matching
# CSV in the parsed_pdfs folder. Report each key whose file is missing.
for pair in page_status:
	expected_csv = os.path.join(PARSED_PDFS_DIR, f'{pair}.csv')
	if os.path.isfile(expected_csv):
		continue
	print('AHHGHELKJOOOOOO NOOOOOO', pair)

In [None]:
# We want to loop through each year_tif pair and scan through all of the text in
# each document.

# Realistically, though, we only want to look at roughly the top 500px of each page.

# Perform the search query first; then, when we get a match, query the rest of that page.

In [None]:
# Minimum SequenceMatcher ratio for a word to count as a fuzzy keyword match
SIMILARITY_THRESHOLD = 0.7

def is_finance(query_vector, log=False):
	"""Report whether a bag of words fuzzily contains every finance keyword.

	Each word is lower-cased and compared against the keywords
	'expenditures', 'balance' and 'revenue' with difflib.SequenceMatcher; a
	keyword counts as found once any word scores above SIMILARITY_THRESHOLD.

	Args:
		query_vector: iterable of tokens. Non-string tokens (for example the
			NaN floats pandas produces for empty or "N/A" cells) are skipped
			instead of raising AttributeError.
		log: when True, print the joined tokens and the final keyword flags.

	Returns:
		True only if all keywords were matched, otherwise False.
	"""
	flags_to_watch = {
		# 'combined': False,
		'expenditures': False,
		'balance': False,
		'revenue': False,
		# 'schedule': False
	}

	# Materialise once so the tokens can be both scanned and logged
	tokens = list(query_vector)

	# Loop over all text in the query bag and compare it to our dict
	for word in tokens:
		# Only real strings can be compared; NaN and numeric cells cannot match
		if not isinstance(word, str):
			continue

		lowered = word.lower()
		for buzz_word in flags_to_watch:
			if SequenceMatcher(None, lowered, buzz_word).ratio() > SIMILARITY_THRESHOLD:
				flags_to_watch[buzz_word] = True
				# A word is credited to the first keyword it matches
				break

		# Nothing left to find, so the remaining words cannot change the answer
		if all(flags_to_watch.values()):
			break

	if log:
		print(' '.join(map(str, tokens)))
		print(flags_to_watch)

	# If not all the flags are met return False
	return all(flags_to_watch.values())
	

In [None]:
# Phrases that mark a page as one to ignore when found in its text
IGNORE_STRING_1 = 'no tax increment project expenditures' # First seen 1997
IGNORE_STRING_2 = 'no tax increment expenditures within the project area' # First seen 1998_4
IGNORE_STRING_3 = 'no tax increment expenditures or cumulative deposits over' # First seen 2002_10

def is_ignored(query_vector):
	"""Report whether the joined, lower-cased tokens contain an ignore phrase.

	Args:
		query_vector: iterable of tokens in reading order. Non-string tokens
			(for example the NaN floats pandas produces for empty or "N/A"
			cells) are skipped, because str.join raises TypeError on them.

	Returns:
		True if any of IGNORE_STRING_1..3 occurs in the text, otherwise False.
	"""
	# Join all string elements of the vector into one searchable document
	words = [word for word in query_vector if isinstance(word, str)]
	doc_string = ' '.join(words).lower()

	# The phrases are literal text, so a substring test finds exactly what a
	# regex search for them would
	ignore_phrases = (IGNORE_STRING_1, IGNORE_STRING_2, IGNORE_STRING_3)
	return any(phrase in doc_string for phrase in ignore_phrases)

In [None]:
# Make sure both image output folders exist. exist_ok=True makes this safe to
# re-run and avoids the gap between checking for a folder and creating it.
os.makedirs('rect_table_images', exist_ok=True)
os.makedirs('table_images', exist_ok=True)

In [None]:

from page_dictionary import SKIP_LIST, MANUAL_CORRECTIONS, HAND_FILLED


# Vertical cut-offs (px from the top of the page) for the two keyword scans
PRIMARY_SCAN_MAX_TOP = 550
FALLBACK_SCAN_MAX_TOP = 750

# These all have exactly 2 matches (when not 0); the first one is kept
FIRST_OF_TWO_YEARS = ('2007', '2008', '2009')

# Per the original note, these years were all manually collected
MANUALLY_COLLECTED_YEARS = ('1999', '2000', '2001')


def _pages_with_finance_header(tif_text, max_top):
	"""Scan the words whose 'top' is at most max_top, page by page.

	Returns a (pages, matched_pages) tuple: every page number that has words
	in that section, and the subset of them for which is_finance is True.
	"""
	top_section = tif_text[tif_text['top'] <= max_top]

	# sort_values returns a new frame, so the result has to be kept for the
	# sort to have any effect; pages are then visited in ascending order
	top_section = top_section.sort_values(['page_num', 'block_num', 'line_num', 'word_num'])

	pages = top_section['page_num'].unique()
	matched_pages = [
		page for page in pages
		if is_finance(top_section.loc[top_section['page_num'] == page, 'text'].to_list())
	]
	return pages, matched_pages


def find_stuff(pair):
	"""Locate the page of interest for one "<year>_<tif_number>" document.

	Steps, in order:
	  1. Pairs in SKIP_LIST are skipped.
	  2. Pairs in MANUAL_CORRECTIONS or HAND_FILLED use the page stored there.
	  3. The top PRIMARY_SCAN_MAX_TOP px of every page is checked with
	     is_finance. One match wins; two matches in 2007-2009 resolve to the
	     first; any other multiple match gives up.
	  4. The pages listed as successful in page_status are checked with
	     is_ignored; a hit ends the search (except for 1999-2001).
	  5. The keyword scan is repeated over the top FALLBACK_SCAN_MAX_TOP px.

	Args:
		pair: document key, also the stem of its CSV in PARSED_PDFS_DIR.

	Returns:
		[page_number, DataFrame of all rows on that page] when a page is
		found, otherwise False. False (never None) is returned when nothing is
		found, because callers test the result with `is False`.
	"""
	if pair in SKIP_LIST:
		return False

	# Load in the associated TIF csv
	csv_path = os.path.join(PARSED_PDFS_DIR, f'{pair}.csv')
	tif_text = pd.read_csv(csv_path, header=None, names=DATABASE_FIELDS)

	# Hand-maintained answers take priority over any scanning
	if pair in MANUAL_CORRECTIONS:
		page = MANUAL_CORRECTIONS[pair]
		return [page, tif_text[tif_text['page_num'] == page]]

	if pair in HAND_FILLED:
		page = HAND_FILLED[pair]
		return [page, tif_text[tif_text['page_num'] == page]]

	year = pair[0:4]

	_, matched_pages = _pages_with_finance_header(tif_text, PRIMARY_SCAN_MAX_TOP)

	# print(pair, matched_pages)

	if year in FIRST_OF_TWO_YEARS and len(matched_pages) == 2:
		matched_pages = [matched_pages[0]]
		print('Corrected', matched_pages)
		return [matched_pages[0], tif_text[tif_text['page_num'] == int(matched_pages[0])]]

	elif len(matched_pages) > 1:
		print('Too many pages')
		return False

	if len(matched_pages) > 0:
		return [matched_pages[0], tif_text[tif_text['page_num'] == int(matched_pages[0])]]

	# No match in the top section: look through the full text of every
	# successful page for one of the ignore phrases
	for page in page_status[pair]['successful']:

		page_words = tif_text.loc[tif_text['page_num'] == int(page), 'text'].to_list()

		if is_ignored(page_words):
			print(f'gotta killer. Page {page}')
			# Should be useless because all of these were manually collected,
			# but just in case
			if year in MANUALLY_COLLECTED_YEARS:
				continue
			return False

	# The statement of revenues might be lower down on the page. Not all reports
	# actually follow the same format in a year so a document like this needs to
	# be flagged for manual review.
	pages, matched_pages = _pages_with_finance_header(tif_text, FALLBACK_SCAN_MAX_TOP)

	if len(matched_pages) > 1:
		print('hey too many!')

	if len(matched_pages) > 0:
		print(pair, matched_pages)
		return [matched_pages[0], tif_text[tif_text['page_num'] == int(matched_pages[0])]]

	if year in MANUALLY_COLLECTED_YEARS:
		return False

	print(pages)

	print(pair)

	print("Couldn't resolve")

	# Explicit False so callers never receive None for an unresolved document
	return False


# For every document, find its page of interest, render that page, and save
# two PNGs: the plain render and a copy with a red box around each word.
for pair in page_status:

	print(pair)
	# The boxed image is written last, so its presence marks a finished pair
	if os.path.exists(os.path.join('table_images', f'{pair}.png')):
		continue

	poi = find_stuff(pair)
	# print(pair, 'found', poi)

	# Treat None like False so a call that produced no result can never reach
	# the indexing below
	if poi is False or poi is None:
		continue

	# Validate the result shape before rendering or writing anything
	if len(poi) != 2:
		print(f'{pair} didn\'t return correctly')
		continue

	page_num, page_words = poi
	print(page_num)

	# Load pdf
	with fitz.open(os.path.join('TIFpdfs', f'{pair}.pdf')) as pdf:

		# NOTE(review): page_num is passed to load_page unchanged - confirm the
		# CSV page numbers use the same base as fitz page indices
		page = pdf.load_page(int(page_num))

		pixmap = page.get_pixmap(dpi=300)
		image = Image.frombytes('RGB', [pixmap.width, pixmap.height], pixmap.samples)

		# Plain render, saved before any rectangles are drawn
		image.save(os.path.join('rect_table_images', f'{pair}.png'), format="PNG")

		rect_image = ImageDraw.Draw(image)

		# Box each word with 5px of padding.
		# NOTE(review): assumes left/top/width/height are in the pixel space of
		# this 300 dpi render - confirm against how the CSVs were produced
		for _, text in page_words.iterrows():
			rect = (text['left'] - 5, text['top'] - 5, text['left'] + text['width'] + 5, text['top'] + text['height'] + 5)
			rect_image.rectangle(rect, outline='red', width=4)

		image.save(os.path.join('table_images', f'{pair}.png'), format="PNG")

print('Done')