# Code for testing chart data reconstruction from image

In [None]:
# imports
import os
from typing import Optional
import cv2 as cv
import numpy as np
from matplotlib import pyplot as plt
import pytesseract
from pytesseract import Output
import re
from datetime import datetime, timedelta

In [None]:
filepath_output_root = "data/scraped/camel/charts"
filepath_sample_image = os.path.join(filepath_output_root, "camelcamelcamel-B07B428M7F.png")

In [None]:
# some utility functions

def show(img):
	'''
		Displays an image with matplotlib.

		The figure is always 12 inches wide; its height is derived from the
		image's width/height ratio so the picture is not distorted.
	'''
	height, width = img.shape[0], img.shape[1]
	aspect = width / height
	plt.figure(figsize=(12, 12 / aspect))
	plt.imshow(img)

def mask_image(img: np.ndarray,
               color: tuple,
               color_range: Optional[tuple] = None) -> np.ndarray:
	'''
		Builds a mask of the pixels of img that fall inside a colour range.

		color is the lower bound and color_range the upper bound handed to cv.inRange.
		When color_range is omitted both bounds equal color, so only exact matches pass.
		Returns whatever cv.inRange produces for a copy of img; img itself is not touched.
	'''
	upper_bound = color if color_range is None else color_range
	return cv.inRange(img.copy(), np.array(color), np.array(upper_bound))

In [None]:
# load image from file as sample_image_original (BGR2RGB)
sample_image_original = cv.imread(filepath_sample_image)
# cv.imread reports a missing or unreadable file by returning None instead of raising,
# which would otherwise surface as an obscure cv.cvtColor error on the next line
if sample_image_original is None:
	raise FileNotFoundError(f"could not read image: {filepath_sample_image}")
sample_image_original = cv.cvtColor(sample_image_original, cv.COLOR_BGR2RGB)
# print w, h, a (width, height and width/height aspect ratio)
h, w = sample_image_original.shape[:2]
a = w/h
print(f"w: {w}, h: {h}, a: {a}")

# print image shape (numpy)
print(f"sample_image_original.shape: {sample_image_original.shape}")

In [None]:
def save_image(img: np.ndarray, filename: str, invert: bool = True) -> None:
	'''
		Saves image to file in the filepath_output_root directory.

		img: image to write.
		filename: file name (including extension) relative to filepath_output_root.
		invert: when True, colour images are converted RGB -> BGR before being handed
		        to cv.imwrite. Single-channel images (e.g. the masks returned by
		        cv.inRange) have no channel order to swap and are written unchanged.
	'''
	filepath = os.path.join(filepath_output_root, filename)
	# cv.cvtColor(..., cv.COLOR_RGB2BGR) rejects single-channel input, so only
	# swap channels when the array really has colour channels
	has_color_channels = img.ndim == 3 and img.shape[2] >= 3
	if invert and has_color_channels:
		img = cv.cvtColor(img, cv.COLOR_RGB2BGR)
	cv.imwrite(filepath, img)

In [None]:
# preview the loaded sample image
show(sample_image_original)
# keep a copy on disk (written into filepath_output_root by save_image)
save_image(sample_image_original, "cv-demo-original.png")

In [None]:
# unique_colors = get_all_unique_colors(sample_image_original)
# print(unique_colors)

In [None]:
# segment into two images - one image which only keeps the (#1D35CB powertoys color picker / #0033cc photoshop) color and one which keeps everything else
# (0, 51, 204) is #0033cc written as an RGB tuple
# image_segmented_line should only contain one color
# image_segmented_line = cv.inRange(sample_image_original, np.array([0, 51, 204]), np.array([0, 51, 204]))
image_segmented_line = mask_image(sample_image_original, (0, 51, 204))

# earlier approach, kept for reference: save next to the original image with .png replaced by -segmented.png
# filepath_sample_image_segmented = filepath_sample_image.replace(".png", "-segmented.png")
# cv.imwrite(filepath_sample_image_segmented, image_segmented_line)
show(image_segmented_line)
save_image(image_segmented_line, "cv-demo-segmented-plot.png")

# experiment, kept for reference: decrease line thickness
# kernel = np.ones((3,3),np.uint8)
# image_segmented_line = cv.erode(image_segmented_line,kernel,iterations = 1)

# experiment, kept for reference: display image (RGB)
# plt.imshow(cv.cvtColor(image_segmented_line, cv.COLOR_BGR2RGB))
# show(image_segmented_line)
# image_segmented_line = cv.cvtColor(image_segmented_line, cv.COLOR_GRAY2RGB)
# # display image (RGB)
# # plt.imshow(cv.cvtColor(image_segmented, cv.COLOR_BGR2RGB))
# show(image_segmented_line)

# get unique values in resulting image
# unique_values = np.unique(image_segmented_line)
# print(f"unique_values: {unique_values}")

# get counts of unique values in resulting image (how many pixels matched the line colour and how many did not)
unique_values, counts = np.unique(image_segmented_line, return_counts=True)
print(f"unique_values: {unique_values} counts: {counts}")

In [None]:
# Key algo for getting chart line coordinates
def get_vertical_pixel_indices(img, color=(0, 51, 204)):
	'''
		Traces a line of a given colour, one value per image column.

		img: image whose pixels are compared against color (the callers in this file pass RGB images).
		color: exact colour of the line to trace; defaults to the chart line colour (0, 51, 204).

		Returns a list with one element per image column (its length equals the image width).
		Each element is the row index half-way between the first and the last pixel of that
		column that matches color, truncated to an int.
		If no matching pixel is found in a column the element at that index equals -1.

		Useful for getting the averaged coordinates of the chart line.
		Example: element in list at index 3 has a value of 5 -> (3, 5) ; (x, y) ; (width_i, height_i) of the image
		The coordinate system for images starts at the top left corner of the image.
	'''
	# identical lower and upper bounds keep exact colour matches only
	target = np.array(color)
	img_masked = cv.inRange(img, target, target)
	vertical_pixel_indices = []
	# transposing the mask turns every image column into one row to iterate over
	for column in img_masked.T:
		nz_indices = np.nonzero(column)[0]
		if len(nz_indices) == 0:
			# the line does not pass through this column
			vertical_pixel_indices.append(-1)
			continue
		# midpoint between the topmost and the bottommost matching pixel
		midpoint = (nz_indices[0] + nz_indices[-1]) / 2
		vertical_pixel_indices.append(int(midpoint))
	return vertical_pixel_indices

# demo on the real deal
# (the *16x9bw names are left over from an earlier test image, src/py/scraping/camel/16x9bw.png)
filepath_img16x9bw = filepath_sample_image
# mask of the chart line colour; used below as the canvas the traced line is drawn on
img16x9bw = cv.inRange(sample_image_original, np.array([0, 51, 204]), np.array([0, 51, 204]))
# show(img16x9bw)

# get the vertical pixel indices
vertical_pixel_indices = get_vertical_pixel_indices(sample_image_original)

# if the image has 2 dimensions, convert to 3 dimensions so that a coloured line can be drawn on it
if len(img16x9bw.shape) == 2:
	img16x9bw = cv.cvtColor(img16x9bw, cv.COLOR_GRAY2RGB)

# connect the vertical pixel indices of neighbouring columns with a 1 pixel wide red line,
# skipping every pair in which one of the columns has no line pixel (-1)
for i, (vertical_pixel_index, next_vertical_pixel_index) in enumerate(zip(vertical_pixel_indices, vertical_pixel_indices[1:])):
	if vertical_pixel_index != -1 and next_vertical_pixel_index != -1:
		cv.line(img16x9bw, (i, vertical_pixel_index), (i + 1, next_vertical_pixel_index), (255, 0, 0), 1)


# show the image
show(img16x9bw)

# save to disk
save_image(img16x9bw, "cv-demo-vertical-pixel-indices.png")

# TODO:

We can now trace the line. We now need to:
* find the (0, 0) pixel (axis origin)
* find the axis labels (their values and their positions)
* find the axis ticks (their values and their positions)
* work out how to consistently mask the legend and text so they don't affect the line mask
* calculate the pixel-to-value ratio for each axis (and how accurate it is)
* reconstruct the chart in a library of choice (matplotlib, plotly, etc.)
* compare the reconstructed chart to the original chart
* save the raw data to csv (timestamp, price) = (x, y)


In [None]:
# Getting the axis locations
def get_axis_locations(img):
	'''
		Find the locations of the x and y axes in the image.

		Pixels that are exactly (51, 51, 51) are masked and the mask is displayed with show().
		The image row(s) containing the most masked pixels give the x axis, the column(s)
		containing the most give the y axis. Ties are averaged and truncated to an int.

		Returns a tuple (row index of the x axis, column index of the y axis).
	'''
	axis_mask = cv.inRange(img.copy(), np.array([51, 51, 51]), np.array([51, 51, 51]))
	show(axis_mask)
	# number of masked pixels in every row / every column
	hits_per_row = np.count_nonzero(axis_mask, axis=1)
	hits_per_column = np.count_nonzero(axis_mask, axis=0)
	# all rows / columns that share the highest count
	fullest_rows = np.flatnonzero(hits_per_row == hits_per_row.max())
	fullest_columns = np.flatnonzero(hits_per_column == hits_per_column.max())
	return int(np.average(fullest_rows)), int(np.average(fullest_columns))

  
# load a fresh copy of the sample image (converted BGR -> RGB) to draw on
img = cv.cvtColor(cv.imread(filepath_sample_image), cv.COLOR_BGR2RGB)
# axis_locations[0] is the row of the x axis, axis_locations[1] the column of the y axis
axis_locations = get_axis_locations(img)

# draw a horizontal green line (3 px) across the full width at the x axis location
cv.line(img, (0, axis_locations[0]), (img.shape[1], axis_locations[0]), (0, 255, 0), 3)
# draw a vertical green line (3 px) across the full height at the y axis location
cv.line(img, (axis_locations[1], 0), (axis_locations[1], img.shape[0]), (0, 255, 0), 3)
# show the image
show(img)
# save to disk
save_image(img, "cv-demo-axis-locations.png")


In [None]:
def get_line_indices_generic(img: np.ndarray, is_horizontal: bool, color: tuple, match_threshold: float = 0.2) -> list:
	'''
		Find the locations of the lines of a given colour in the image.

		img: image to search.
		is_horizontal: True to look for horizontal lines (result holds row indices),
		               False to look for vertical lines (result holds column indices).
		color: exact colour of the lines, passed to mask_image.
		match_threshold: a row/column counts as a line when more than this share of
		                 its pixels has the colour.

		Returns the list of matching row or column indices in ascending order.
	'''
	color_mask = mask_image(img, color)
	# orient the mask so that every candidate line is one row of the array
	candidates = color_mask if is_horizontal else color_mask.T
	# share of matching pixels in every candidate line
	match_ratios = np.count_nonzero(candidates, axis=1) / candidates.shape[1]
	return np.flatnonzero(match_ratios > match_threshold).tolist()

img = cv.cvtColor(cv.imread(filepath_sample_image), cv.COLOR_BGR2RGB)
# Run every detection on the untouched image first: the overlay lines drawn further
# down are 3 px wide and overwrite pixels that the colour matching relies on.
# get the horizontal gridlines
horizontal_gridlines = get_line_indices_generic(img, True, (221, 221, 221))
# get the vertical gridlines
vertical_gridlines = get_line_indices_generic(img, False, (221, 221, 221))
# get price max and min indices (first matching row of each marker colour;
# raises IndexError when the chart has no row of that colour)
price_max_index = get_line_indices_generic(img, True, (194, 68, 68), 0.2)[0]
price_min_index = get_line_indices_generic(img, True, (119, 195, 107), 0.2)[0]
# draw the horizontal gridlines (magenta)
for i in horizontal_gridlines:
	cv.line(img, (0, i), (img.shape[1], i), (255, 0, 255), 3)
# draw the vertical gridlines (cyan)
for i in vertical_gridlines:
	cv.line(img, (i, 0), (i, img.shape[0]), (0, 255, 255), 3)
# draw the price max and min lines (orange)
cv.line(img, (0, price_max_index), (img.shape[1], price_max_index), (255, 127, 0), 3)
cv.line(img, (0, price_min_index), (img.shape[1], price_min_index), (255, 127, 0), 3)
# show the image
show(img)
# save to disk
save_image(img, "cv-demo-gridlines.png")

In [None]:
def get_consecutive_elements(arr):
	'''
		Splits a sequence of integers into runs of consecutive values.

		A new run starts whenever an element is not exactly one greater than its predecessor.
		Example: [33, 34, 100, 101, 102, 105, 108] -> [[33, 34], [100, 101, 102], [105], [108]]

		Returns a list of lists; an empty input gives an empty list.
	'''
	# len() instead of truthiness so that numpy arrays are accepted as well
	if len(arr) == 0:
		return []
	consecutive_lists = []
	current_list = [arr[0]]
	for previous, current in zip(arr, arr[1:]):
		if current == previous + 1:
			current_list.append(current)
		else:
			# gap found: close the current run and start a new one
			consecutive_lists.append(current_list)
			current_list = [current]
	consecutive_lists.append(current_list)
	return consecutive_lists


def merge_consecutive_elements(arr):
	'''
		Collapses every run in arr (a list of lists of numbers) into a single value:
		the average of the run, truncated to an int.
		Example: [[33, 34], [100, 101, 102], [105]] -> [33, 101, 105]
	'''
	return [int(np.average(run)) for run in arr]


# arr = np.array([33, 34, 100, 101, 102, 105, 108])
# consecutive_lists = get_consecutive_elements(arr)
# print(consecutive_lists)
# merged_list = merge_consecutive_elements(consecutive_lists)
# print(merged_list)
# a = 0

# Getting major grid locations
def get_major_grid_locations(img):
	'''
		Find the major gridlines in the image by their colour (221, 221, 221).

		A row counts as a gridline when more than 20 % of its pixels have that exact colour;
		columns are judged the same way.

		Returns a tuple (row indices of horizontal gridlines, column indices of vertical gridlines).
		Thick gridlines produce several neighbouring indices.
	'''
	ratio_to_match = 0.2 # share of a row / column that must match to count as a major grid line
	grid_mask = cv.inRange(img.copy(), np.array([221, 221, 221]), np.array([221, 221, 221]))
	mask_height, mask_width = grid_mask.shape[0], grid_mask.shape[1]
	# share of matching pixels per row and per column
	row_ratios = np.count_nonzero(grid_mask, axis=1) / mask_width
	column_ratios = np.count_nonzero(grid_mask, axis=0) / mask_height
	matches_x_indices = np.flatnonzero(row_ratios > ratio_to_match).tolist()
	matches_y_indices = np.flatnonzero(column_ratios > ratio_to_match).tolist()
	return matches_x_indices, matches_y_indices

def get_rightmost_minor_gridline(img):
	'''
		Find the location of the rightmost minor gridline in the image.
		Also serves as the right side of the bounding box for the graph (line chart).

		Minor gridlines are recognised by their colour (245, 245, 245): a column qualifies
		when more than 20 % of its pixels have that exact colour.
		Returns the largest qualifying column index; raises ValueError when no column qualifies.
	'''
	acceptable_match_ratio = 0.2 # share of the image height that must match
	minor_mask = cv.inRange(img.copy(), np.array([245, 245, 245]), np.array([245, 245, 245]))
	column_ratios = np.count_nonzero(minor_mask, axis=0) / minor_mask.shape[0]
	qualifying_columns = np.flatnonzero(column_ratios > acceptable_match_ratio).tolist()
	return max(qualifying_columns)

  
# load a fresh copy of the sample image (converted BGR -> RGB) to draw on
img = cv.cvtColor(cv.imread(filepath_sample_image), cv.COLOR_BGR2RGB)
# major_grid_locations_x holds ROW indices (horizontal gridlines), major_grid_locations_y COLUMN indices (vertical gridlines)
major_grid_locations_x, major_grid_locations_y = get_major_grid_locations(img)

rightmost_minor_gridline = get_rightmost_minor_gridline(img)

# collapse neighbouring indices (gridlines thicker than 1 px) into one index per gridline
major_grid_locations_x = merge_consecutive_elements(get_consecutive_elements(major_grid_locations_x))
major_grid_locations_y = merge_consecutive_elements(get_consecutive_elements(major_grid_locations_y))

# draw a horizontal green line at the topmost major gridline only
min_x = min(major_grid_locations_x) # top of the graph (line chart) bounding box
for x in major_grid_locations_x:
	if x == min_x:
		cv.line(img, (0, x), (img.shape[1], x), (0, 255, 0), 3)
	# else:
	# 	cv.line(img, (0, x), (img.shape[1], x), (255, 0, 0), 3)
# disabled: draw a vertical red line for each vertical major gridline
# for y in major_grid_locations_y:
# 	cv.line(img, (y, 0), (y, img.shape[0]), (255, 0, 0), 3)
# draw a vertical green line at the rightmost minor gridline location (right side of the bounding box)
cv.line(img, (rightmost_minor_gridline, 0), (rightmost_minor_gridline, img.shape[0]), (0, 255, 0), 3)
# draw axis lines in green (axis_locations was computed in the earlier axis-location cell)
cv.line(img, (0, axis_locations[0]), (img.shape[1], axis_locations[0]), (0, 255, 0), 3)
cv.line(img, (axis_locations[1], 0), (axis_locations[1], img.shape[0]), (0, 255, 0), 3)
# draw price max and min lines in magenta (indices were computed in the earlier gridline cell)
cv.line(img, (0, price_max_index), (img.shape[1], price_max_index), (255, 0, 255), 3)
cv.line(img, (0, price_min_index), (img.shape[1], price_min_index), (255, 0, 255), 3)
# show the image and save it to disk
show(img)
save_image(img, "cv-demo-bounding-box.png")

In [None]:
# lower-case three-letter abbreviations of all twelve months, used to recognise month labels in OCR output
months = {'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'}

def clean_str(s):
	'''
		Lower-cases a string and removes every character that is not an ASCII letter or digit.
		Whitespace, punctuation and currency symbols are all dropped: "$1,000" -> "1000".
	'''
	lowered = s.lower()
	# the pattern removes spaces and surrounding whitespace too, so no separate strip is needed
	return re.sub('[^0-9a-zA-Z]+', '', lowered)

def tesseract_data_object_to_list_of_objects(data):
	'''
		Converts a dict of parallel lists into a list of dicts.

		data maps every field name to a list with one entry per detected box.
		The result holds one dict per box with the same field names as keys.
		The number of boxes is taken from the length of data["text"].
	'''
	box_count = len(data["text"])
	return [
		{field: values[index] for field, values in data.items()}
		for index in range(box_count)
	]

# Read all of the text from the image

def get_y_axis_label_candidates(candidates):
	'''
		Find the y axis label candidates.

		A candidate qualifies when its 'text' contains a "$" and no "." (whole-dollar labels).
		Returns the qualifying candidates in their original order.
	'''
	return [
		candidate
		for candidate in candidates
		if "$" in candidate['text'] and "." not in candidate['text']
	]

def get_x_axis_month_label_candidates(candidates):
	'''
		Find the x axis month label candidates.

		A candidate qualifies when its 'text', after clean_str, is one of the entries of months.
		Returns the qualifying candidates in their original order.
	'''
	return [
		candidate
		for candidate in candidates
		if clean_str(candidate['text']) in months
	]

def get_month_labels_y_location(candidates):
	'''
		Find the y axis location of the month labels. Used for determining x axis year label candidates.

		Returns the 'top' value shared by the largest number of candidates, as an int.
		When several values are equally common, the one seen first wins.
	'''
	occurrences = {}
	for candidate in candidates:
		top_key = str(candidate['top'])
		occurrences[top_key] = occurrences.get(top_key, 0) + 1
	# the most frequent 'top' value is taken as the row on which the month labels sit
	most_common_top = max(occurrences, key=occurrences.get)
	return int(most_common_top)

def get_x_axis_year_label_candidates(candidates, y_location):
	'''
		Find the x axis year label candidates.

		A candidate qualifies when its 'text' consists of digits only and its 'top'
		lies within 3 pixels (inclusive) of y_location, the row of the month labels.
		Returns the qualifying candidates in their original order.
	'''
	return [
		candidate
		for candidate in candidates
		if candidate['text'].isdigit() and abs(candidate['top'] - y_location) <= 3
	]

def get_text_data(img):
	'''
		Get the text data from the image.

		Converts img to grayscale and runs pytesseract.image_to_data on it with
		'--psm 6 --oem 3'. Returns the result in Output.DICT form; the code in this
		file reads its 'text', 'conf', 'left', 'top', 'width' and 'height' entries.
	'''
	# NOTE(review): the callers in this file pass RGB images, yet the flag used here is
	# BGR2GRAY. It is left unchanged because the OCR settings were tuned with it - confirm.
	gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
	# Plain grayscale input with these settings is currently the best. Thresholded and
	# dilated input, a digit whitelist and the default page segmentation were tried before.
	boxes = pytesseract.image_to_data(gray, output_type=Output.DICT, config='--psm 6 --oem 3')
	return boxes

# load a fresh copy of the sample image (converted BGR -> RGB) and run OCR on it
img = cv.cvtColor(cv.imread(filepath_sample_image), cv.COLOR_BGR2RGB)
boxes = get_text_data(img)
conf = 80 # minimum OCR confidence for a box to be kept (previously 90)
# outline every sufficiently confident box in green
for i in range(len(boxes['text'])):
	if int(boxes['conf'][i]) > conf:
		(x, y, w, h) = (boxes['left'][i], boxes['top'][i], boxes['width'][i], boxes['height'][i])
		cv.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
# show(img)

# convert the tesseract data object to a list of objects (one dict per box)
boxes = tesseract_data_object_to_list_of_objects(boxes)
# filter out the non-text boxes (empty or whitespace-only text)
boxes = [b for b in boxes if b['text'].strip() != '']
# filter out the boxes with low confidence
boxes = [b for b in boxes if int(b['conf']) > conf]

y_label_candidates = get_y_axis_label_candidates(boxes)
# print candidates text and the top-left corner of each box
print("y axis (price) label candidates:")
for candidate in y_label_candidates:
	print(f"'{candidate['text']}' at ({candidate['left']}, {candidate['top']})")

# the month labels are only used to find the row on which the x axis labels sit
x_month_candidates = get_x_axis_month_label_candidates(boxes)
x_month_y_location = get_month_labels_y_location(x_month_candidates)

# year labels are digit-only boxes on (almost) the same row as the month labels
x_year_candidates = get_x_axis_year_label_candidates(boxes, x_month_y_location)
# print candidates text and the top-left corner of each box
print("x axis (year) label candidates:")
for candidate in x_year_candidates:
	print(f"'{candidate['text']}' at ({candidate['left']}, {candidate['top']})")

# disabled: print the text from the image and the confidence of each result
# for i in range(len(boxes['text'])):
# 	if int(boxes['conf'][i]) > conf:
# 		text = clean_str(boxes['text'][i])
# 		if ("$" in text and "." not in text) or (text in months):
# 			print(f"'{text}': {boxes['conf'][i]} % at ({boxes['left'][i]}, {boxes['top'][i]}) ; original string: '{boxes['text'][i]}'")

# paint the bounding boxes of the y axis (price) and x axis (year) labels on the image in magenta
for candidate in y_label_candidates:
	(x, y, w, h) = (candidate['left'], candidate['top'], candidate['width'], candidate['height'])
	cv.rectangle(img, (x, y), (x + w, y + h), (255, 0, 255), 3)
# disabled: month labels
# for candidate in x_month_candidates:
# 	(x, y, w, h) = (candidate['left'], candidate['top'], candidate['width'], candidate['height'])
# 	cv.rectangle(img, (x, y), (x + w, y + h), (255, 0, 255), 2)
for candidate in x_year_candidates:
	(x, y, w, h) = (candidate['left'], candidate['top'], candidate['width'], candidate['height'])
	cv.rectangle(img, (x, y), (x + w, y + h), (255, 0, 255), 3)
show(img)
save_image(img, "cv-demo-text-data.png")

In [None]:
# Now we have to bring it all together - first we have to match the x and y axis labels with the major gridlines
img = cv.cvtColor(cv.imread(filepath_sample_image), cv.COLOR_BGR2RGB)
# x_axis_location is the ROW of the x axis, y_axis_location the COLUMN of the y axis
x_axis_location, y_axis_location = get_axis_locations(img)
rightmost_minor_gridline = get_rightmost_minor_gridline(img)
print(f"x axis location: {x_axis_location}")
print(f"y axis location: {y_axis_location}")
print(f"rightmost minor gridline location (right side of the bounding box): {rightmost_minor_gridline}")
# get gridlines
# get_major_grid_locations returns (row indices, column indices): rows are y positions, columns are x positions
grid_y_locations, grid_x_locations = get_major_grid_locations(img)
# collapse neighbouring indices into one index per gridline
grid_x_locations = merge_consecutive_elements(get_consecutive_elements(grid_x_locations))
grid_y_locations = merge_consecutive_elements(get_consecutive_elements(grid_y_locations))
# append the x axis to the gridlines as well
grid_y_locations.append(x_axis_location)
print(f"grid x locations: {grid_x_locations}")
print(f"grid y locations: {grid_y_locations}")
# get text from image
text_data = get_text_data(img)
text_data = tesseract_data_object_to_list_of_objects(text_data)
# get y axis labels (from the unfiltered OCR result of this cell)
y_axis_label_candidates = get_y_axis_label_candidates(text_data)
y_axis_labels = [candidate['text'] for candidate in y_axis_label_candidates]
print(f"y axis labels: {y_axis_labels}")
# get x axis labels
# NOTE(review): these use `boxes`, the confidence-filtered OCR result of the previous cell, not text_data - confirm this is intended
x_month_candidates = get_x_axis_month_label_candidates(boxes)
x_month_y_location = get_month_labels_y_location(x_month_candidates)
x_axis_candidates = get_x_axis_year_label_candidates(boxes, x_month_y_location)
x_axis_labels = [candidate['text'] for candidate in x_axis_candidates]
print(f"x axis labels: {x_axis_labels}")

def match_x_axis_label_candidates_with_gridlines(x_axis_label_candidates, grid_x_locations, biggest_distance=5):
	'''
		Tries matching the x axis label candidates center with the closest gridline.

		x_axis_label_candidates: dicts with 'left' and 'width' (pixels).
		grid_x_locations: x positions (column indices) of the vertical gridlines.
		biggest_distance: a label is matched only when the horizontal distance between
		                  its center and the closest gridline is smaller than this.

		Returns a list of (candidate, gridline) tuples; candidates without a gridline
		close enough are left out. Without any gridlines nothing can be matched and
		the result is empty.
	'''
	if len(grid_x_locations) == 0:
		return []
	matches = []
	for candidate in x_axis_label_candidates:
		center = candidate['left'] + (candidate['width'] / 2)
		closest_gridline = min(grid_x_locations, key=lambda x: abs(x - center))
		if abs(center - closest_gridline) < biggest_distance:
			matches.append((candidate, closest_gridline))
	return matches

def match_y_axis_label_candidates_with_gridlines(y_axis_label_candidates, grid_y_locations, biggest_distance=5):
	'''
		Tries matching the y axis label candidates center with the closest gridline.

		y_axis_label_candidates: dicts with 'top' and 'height' (pixels).
		grid_y_locations: y positions (row indices) of the horizontal gridlines.
		biggest_distance: a label is matched only when the vertical distance between
		                  its center and the closest gridline is smaller than this.

		Returns a list of (candidate, gridline) tuples; candidates without a gridline
		close enough are left out. Without any gridlines nothing can be matched and
		the result is empty.
	'''
	if len(grid_y_locations) == 0:
		return []
	matches = []
	for candidate in y_axis_label_candidates:
		center = candidate['top'] + (candidate['height'] / 2)
		closest_gridline = min(grid_y_locations, key=lambda y: abs(y - center))
		if abs(center - closest_gridline) < biggest_distance:
			matches.append((candidate, closest_gridline))
	return matches
	

# pair every year label with the vertical gridline closest to its horizontal center
grid_x_matches = match_x_axis_label_candidates_with_gridlines(x_axis_candidates, grid_x_locations)
# print matches (label, candidate center, and gridline)
print("x axis label matches to gridline positions:")
for match in grid_x_matches:
	print(f"{match[0]['text']}: {match[0]['left'] + match[0]['width'] / 2} -> {match[1]}")

# pair every price label with the horizontal gridline closest to its vertical center
grid_y_matches = match_y_axis_label_candidates_with_gridlines(y_axis_label_candidates, grid_y_locations)
# print matches (label, candidate center, and gridline)
print("y axis label matches to gridline positions:")
for match in grid_y_matches:
	print(f"{match[0]['text']}: {match[0]['top'] + match[0]['height'] / 2} -> {match[1]}")

# getting the extreme values for the most accurate calculations
# "smallest"/"largest" refer to the gridline PIXEL position (element 1 of each pair), not to the label value
x_pair_smallest = min(grid_x_matches, key=lambda x: x[1])
x_pair_largest = max(grid_x_matches, key=lambda x: x[1])
y_pair_smallest = min(grid_y_matches, key=lambda y: y[1])
y_pair_largest = max(grid_y_matches, key=lambda y: y[1])
# print the extreme values (table text, gridline)
print(f"smallest x value: {x_pair_smallest[0]['text']} @ {x_pair_smallest[1]}")
print(f"largest x value: {x_pair_largest[0]['text']} @ {x_pair_largest[1]}")
# print(f"smallest y value: {y_pair_smallest[0]['text']} @ {y_pair_smallest[1]}")
# print(f"largest y value: {y_pair_largest[0]['text']} @ {y_pair_largest[1]}")
# image rows grow downwards, so the pair with the LARGEST pixel position is printed as the smallest value and vice versa
print(f"smallest y value: {y_pair_largest[0]['text']} @ {y_pair_largest[1]}")
print(f"largest y value: {y_pair_smallest[0]['text']} @ {y_pair_smallest[1]}")

# calculate the x and y axis pixel to value ratios (resolution)
# a year label is interpreted as January 1st of that year
x_smallest_timestamp = datetime(year=int(x_pair_smallest[0]['text']), month=1, day=1)
x_largest_timestamp = datetime(year=int(x_pair_largest[0]['text']), month=1, day=1)
# print the timestamps as human readable
print(f"smallest x timestamp: {x_smallest_timestamp}")
print(f"largest x timestamp: {x_largest_timestamp}")
# calculate the y axis resolution
# clean_str strips everything but letters and digits (e.g. the "$"), leaving a number float() can parse
# y_smallest_value / y_largest_value are named after the pixel position of their pair, hence the swapped prints below
y_smallest_value = float(clean_str(y_pair_smallest[0]['text']))
y_largest_value = float(clean_str(y_pair_largest[0]['text']))
# print(f"smallest y value: {y_smallest_value} $")
# print(f"largest y value: {y_largest_value} $")
print(f"smallest y value: {y_largest_value} $")
print(f"largest y value: {y_smallest_value} $")

# calculate the difference in x and y axis values between the extreme labels
x_axis_difference_value = (x_largest_timestamp - x_smallest_timestamp).total_seconds()
y_axis_difference_value = abs(y_largest_value - y_smallest_value)
# calculate the difference in pixels between the extreme values
x_axis_difference_pixels = abs(x_pair_largest[1] - x_pair_smallest[1])
y_axis_difference_pixels = abs(y_pair_largest[1] - y_pair_smallest[1])
print(f"x axis difference: {x_axis_difference_value} seconds per {x_axis_difference_pixels} pixels")
print(f"y axis difference: {y_axis_difference_value} $ per {y_axis_difference_pixels} pixels")
# print how much is 1 pixel in x and y axis
# NOTE(review): with a single matched label on an axis the pixel difference is 0 and these divisions fail
x_seconds_per_pixel = x_axis_difference_value / x_axis_difference_pixels
y_dollars_per_pixel = y_axis_difference_value / y_axis_difference_pixels
print(f"1 pixel on x axis is {x_seconds_per_pixel} seconds ({x_seconds_per_pixel / 86400} days)")
print(f"1 pixel on y axis is {y_dollars_per_pixel} $")

# draw a horizontal red line at the x axis location
cv.line(img, (0, int(x_axis_location)), (img.shape[1], int(x_axis_location)), (255, 0, 0), 3)
# draw a vertical red line at the y axis location
cv.line(img, (int(y_axis_location), 0), (int(y_axis_location), img.shape[0]), (255, 0, 0), 3)
# draw a vertical cyan line at the rightmost_minor_gridline
cv.line(img, (int(rightmost_minor_gridline), 0), (int(rightmost_minor_gridline), img.shape[0]), (0, 255, 255), 3)
# draw gridlines in green (vertical ones first, then horizontal ones)
for x in grid_x_locations:
	cv.line(img, (int(x), 0), (int(x), img.shape[0]), (0, 255, 0), 3)
for y in grid_y_locations:
	cv.line(img, (0, int(y)), (img.shape[1], int(y)), (0, 255, 0), 3)
# in magenta draw the x axis label candidates
for candidate in x_axis_candidates:
	cv.rectangle(img, (candidate['left'], candidate['top']), (candidate['left'] + candidate['width'], candidate['top'] + candidate['height']), (255, 0, 255), 3)
# in magenta draw the y axis label candidates
for candidate in y_axis_label_candidates:
	cv.rectangle(img, (candidate['left'], candidate['top']), (candidate['left'] + candidate['width'], candidate['top'] + candidate['height']), (255, 0, 255), 3)
show(img)
save_image(img, "cv-demo-final-overlay.png")

In [None]:
# converts an x pixel coordinate into a timestamp (coordinate system starts from top left)
def calculate_timestamp_from_x_axis_pixel(gridline_pair, pixel_x, x_seconds_per_pixel):
	'''
		Returns the timestamp that corresponds to the x pixel coordinate pixel_x.

		gridline_pair: (label, gridline x position); label['text'] holds a year, which is
		               taken to mean January 1st of that year at the gridline.
		pixel_x: x pixel coordinate to convert; may lie left or right of the gridline.
		x_seconds_per_pixel: number of seconds one pixel represents on the x axis.
	'''
	label, gridline_x = gridline_pair[0], gridline_pair[1]
	anchor_timestamp = datetime(year=int(label['text']), month=1, day=1)
	# signed pixel distance from the gridline, converted to seconds
	offset = timedelta(seconds=(pixel_x - gridline_x) * x_seconds_per_pixel)
	return anchor_timestamp + offset

# converts a y pixel coordinate into a value (coordinate system starts from top left)
def calculate_value_from_y_axis_pixel(gridline_pair, pixel_y, y_dollars_per_pixel):
	'''
		Returns the value that corresponds to the y pixel coordinate pixel_y.

		gridline_pair: (label, gridline y position); label['text'] is cleaned with clean_str
		               and parsed as the value at the gridline.
		pixel_y: y pixel coordinate to convert.
		y_dollars_per_pixel: value one pixel represents on the y axis.
	'''
	label, gridline_y = gridline_pair[0], gridline_pair[1]
	value_at_gridline = float(clean_str(label['text']))
	# image rows grow downwards, so a pixel above the gridline (smaller y) means a larger value
	pixels_above_gridline = gridline_y - pixel_y
	return value_at_gridline + pixels_above_gridline * y_dollars_per_pixel

# sanity check: convert the axis positions themselves
# y_axis_location is the x pixel (column) of the y axis -> the timestamp at which the chart starts
time_of_y_axis = calculate_timestamp_from_x_axis_pixel(x_pair_smallest, y_axis_location, x_seconds_per_pixel)
# x_axis_location is the y pixel (row) of the x axis -> the value at the bottom of the chart
value_of_x_axis = calculate_value_from_y_axis_pixel(y_pair_smallest, x_axis_location, y_dollars_per_pixel)
# the labels below name the axis ALONG which the quantity is measured
print(f"pixel of x axis: {y_axis_location}")
print(f"timestamp of x axis: {time_of_y_axis}")
print(f"value of y axis: {value_of_x_axis} $")

# show(img)

def mask_image_bounding_box(img, left, top, right, bottom):
	'''
		Blacks out everything outside a bounding box.

		left/right are column bounds and top/bottom row bounds, used as slice limits
		(left and top inclusive, right and bottom exclusive).
		Returns a new image; img itself is not modified.
	'''
	source = img.copy()
	# single-channel mask: 255 inside the box, 0 everywhere else
	box_mask = np.zeros(source.shape[:2], dtype=np.uint8)
	box_mask[top:bottom, left:right] = 255
	return cv.bitwise_and(source, source, mask=box_mask)

# x_axis_location, y_axis_location = get_axis_locations(img)
# rightmost_minor_gridline = get_rightmost_minor_gridline(img)
# `img` carries the debug overlays drawn in the previous cell (3 px axis and gridline strokes,
# label rectangles). They overwrite pixels of the price line, so extract from a freshly loaded image.
clean_img = cv.cvtColor(cv.imread(filepath_sample_image), cv.COLOR_BGR2RGB)
# keep only the plot area: from the y axis to the rightmost minor gridline, from the topmost matched gridline down to the x axis
masked_img = mask_image_bounding_box(clean_img, y_axis_location, y_pair_smallest[1], rightmost_minor_gridline, x_axis_location)

# get vertical pixel indices from the image: list index = x pixel (column), element = y pixel (row) or -1
vertical_pixel_indices = get_vertical_pixel_indices(masked_img)
# calculate x and y values for each pixel
extracted_data = [] # TODO: save this data to a file - this is our FINAL result
for pixel_x, pixel_y in enumerate(vertical_pixel_indices):
	if pixel_y == -1:
		# no line pixel in this column
		continue
	val_at_pixel_y = calculate_value_from_y_axis_pixel(y_pair_smallest, pixel_y, y_dollars_per_pixel)
	val_at_pixel_x = calculate_timestamp_from_x_axis_pixel(x_pair_smallest, pixel_x, x_seconds_per_pixel)
	extracted_data.append((val_at_pixel_x, val_at_pixel_y))


# simulate deleting 300 datapoints at index 500
# del extracted_data[500:800]

# split the (timestamp, price) pairs into the two series to plot
xs = [point[0] for point in extracted_data]
ys = [point[1] for point in extracted_data]
# for i, (x, y) in enumerate(zip(xs, ys)):
# 	print(f"{i + 1}. ({x} , {y})")
plt.plot(xs, ys)
# add gridlines
plt.grid()
plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Reverse engineered chart')
# plt.gcf().set_size_inches(12, 9)
# 32 x 24 inches at the dpi=100 used below corresponds to 3200x2400 pixels before tight cropping
plt.gcf().set_size_inches(32, 24)
# save to disk in high resolution
plt_path = os.path.join(filepath_output_root, "cv-demo-reverse-engineered-chart.png")
# plt.savefig(plt_path, dpi=300, bbox_inches='tight')
plt.savefig(plt_path, dpi=100, bbox_inches='tight')
plt.show()
