In [1]:
import bisect
import collections
import cv2
import itertools as it
import numpy as np
import operator as op
import os
import pickle
import random

def ilen(iterable) -> int:
    """Count the items in ``iterable`` by exhausting it.

    Works for one-shot iterators and generators, which are consumed by the call.
    """
    total = 0
    for total, _ in enumerate(iterable, 1):
        pass
    return total

def show_and_wait(image):
    """Display ``image`` in the 'tesst' window and block until a key is pressed.

    Returns the pressed key as a one-character string (for example ``'q'``) so that callers
    can react to it, or an empty string when no key code was reported.
    """
    cv2.imshow('tesst', image)
    key = cv2.waitKey()
    # Mask to the low byte, the usual OpenCV idiom, since the returned code may carry extra
    # high bits on some platforms.
    return chr(key & 0xFF) if key >= 0 else ''

In [2]:
file_titles = [
    'Ancient Apparition',
    'Anti-Mage',
    'Broodmother',
    'Centaur Warrunner',
    'Clinkz',
    'Io',
    'Juggernaut',
    'Keeper of the Light',
    "Nature's Prophet",
    'Nyx Assassin',
    'Outworld Devourer',
    'Queen of Pain',
]
# Collect every distinct character used by the titles, ignoring case.
s = {c.lower() for title in file_titles for c in title}
print(len(file_titles), ''.join(sorted(s)), len(s))

12  '-abcdefghijklmnopqrstuvwxyz 29


Here is a process for determining the letters in the images.

1. Create a mean image for each hero.  A mean image is the mean of all available images for a hero.
1. Find the vertical extents of the text in the images.  These are the extents of the text as a whole, not of the individual letters.  Adjust for the "Q" descender in "Queen of Pain".
1. Crop the images to those vertical extents for the rest of the process.
1. Find the horizontal extents of the text in the images.  These are not the horizontal extents of the individual letters; they are the left edge of the left-most letter and the right edge of the right-most letter.
1. Crop the images to those horizontal extents for the rest of the process.
1. Find the horizontal extents of the letters.  Note that there are three instances of letters having overlapping bright pixels due to kerning:  "TA" and "WA" in "CENTAUR WARRUNNER" and "AT" in "NATURE'S PROPHET".
1. Extract the letter images for each Hero using the uncropped mean images.
1. Create a mask for each letter.
1. Save a dictionary that maps each letter to a 2-tuple: the tuple of its letter images and the tuple of its masks.

In [3]:
# Create a mean image for each hero.  A mean image is the mean of all available images for a hero.
directory_path = r'F:\Dota 2\Heroes\Pictures'
def fn(file_title):
    """Return the mean greyscale image of every picture whose file name starts with ``file_title``.

    The result is a two-dimensional float32 array scaled from [0, 255] down to [0, 1].

    Raises:
        FileNotFoundError: no file in ``directory_path`` starts with ``file_title``.
        OSError: a matching file could not be decoded as an image.
    """
    def fn(file_name: str):
        # Read the image.
        file_path = os.path.join(directory_path, file_name)
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None instead of raising.
            raise OSError(f'Could not read image {file_path!r}')

        # Color data adds no value to this methodology.
        # Keep only the color channel with the lowest total value.  This drops the
        # channel axis, changing the shape from (height, width, 3) to (height, width).
        return image[:, :, np.argmin(np.sum(image.astype(np.float32), axis=(0, 1)))]

    # Read and grey all files in the Pictures directory for the given file title.
    # os.walk yields nothing for a missing directory, so fall back to an empty listing.
    _, _, file_names = next(os.walk(directory_path), (directory_path, [], []))
    g = (file_name for file_name in file_names if file_name.startswith(file_title))
    images = list(map(fn, g))
    if not images:
        raise FileNotFoundError(f'No pictures found for {file_title!r} in {directory_path!r}')

    # Combine the list of two-dimensional tensors into a single three-dimensional tensor.
    images = np.stack(images, axis=0)

    # Compute the mean image.
    mean = np.mean(images.astype(np.float32), axis=0) / 255

    # Clean up the bottom edge by replacing it with the average of the row above it and the
    # top row.
    mean[-1, :] = (mean[-2, :] + mean[0, :]) / 2
    return mean
hero_mean_images = {s: fn(s) for s in file_titles}

# Show the hero mean images.
show_and_wait(np.vstack(list(hero_mean_images.values())))
cv2.destroyAllWindows()
[(v.min(), v.mean(), v.max(), v.dtype, v.shape) for v in hero_mean_images.values()]

[(0.0607089, 0.30146945, 0.99009055, dtype('float32'), (16, 120)),
 (0.06445182, 0.30316025, 0.98991597, dtype('float32'), (16, 120)),
 (0.04990751, 0.30292377, 0.98373413, dtype('float32'), (16, 120)),
 (0.055525534, 0.29388064, 0.98892814, dtype('float32'), (16, 120)),
 (0.07709418, 0.29646266, 0.987991, dtype('float32'), (16, 120)),
 (0.06797386, 0.24183136, 0.9715106, dtype('float32'), (16, 120)),
 (0.047699343, 0.29340824, 0.9785882, dtype('float32'), (16, 120)),
 (0.06739689, 0.32595894, 0.9854226, dtype('float32'), (16, 120)),
 (0.061220814, 0.30799037, 0.9865447, dtype('float32'), (16, 120)),
 (0.05696732, 0.30735385, 0.97635293, dtype('float32'), (16, 120)),
 (0.039441675, 0.29897118, 0.98545694, dtype('float32'), (16, 120)),
 (0.058194987, 0.30815405, 0.98471, dtype('float32'), (16, 120))]

In [4]:
# Find the vertical extents of the text in the images.  These are the extents of the text as a
# whole, not of the individual letters.  Adjust for the "Q" descender in "Queen of Pain".

# Reduce each hero mean image to the brightest value found in each of its rows.
row_maxima = {s: hero_mean_images[s].max(axis=1) for s in file_titles}

# Rows whose maximum falls below the mean of the row maxima are not in a letter.  The rows at
# which that flag flips are the vertical extents.  A threshold is needed because the mean image
# is not black outside of the letters.
is_background_row = {k: v < v.mean() for k, v in row_maxima.items()}
flip_rows = {
    k: (np.flatnonzero(v[:-1] ^ v[1:]) + 1).tolist()
    for k, v in is_background_row.items()
}

# Widen the vertical extents to take in the dark borders around the letters, which count as part
# of the letters.  The "Q" of "Queen of Pain" has a descender and needs special handling; for
# now it is adjusted as if to ignore the "Q".
hero_vertical_extents = {
    k: (top - 1, bottom + (1 if k == 'Queen of Pain' else 2))
    for k, (top, bottom) in flip_rows.items()
}
d = hero_vertical_extents
print(hero_vertical_extents)

# Note that considering the standard deviations of the values in each row as a different means of
# obtaining the vertical extents results in the same extents as those obtained here.

# Show those values as white pixels to the right of the letters.  Offset the extents to ease
# inspecting the shown image.
def fn(image, t):
    """Return a copy of ``image`` whose right half is white above row ``t[0]`` and from row ``t[1]`` down."""
    first_row, end_row = t
    marked = image.copy()
    middle = marked.shape[1] // 2
    marked[:first_row, middle:] = marked[end_row:, middle:] = 1.0
    return marked
image = np.vstack([
    fn(mean_image, extents)
    for mean_image, extents in zip(hero_mean_images.values(), d.values())
])
show_and_wait(image)
cv2.destroyAllWindows()
# Report each hero's extents as rows of the stacked image, stepping 16 rows per hero.
{k: (16 * m + v[0], 16 * m + v[1], '' if len(v) == 2 else 'not 2') for m, (k, v) in enumerate(d.items())}

{'Ancient Apparition': (6, 14), 'Anti-Mage': (5, 14), 'Broodmother': (5, 14), 'Centaur Warrunner': (6, 14), 'Clinkz': (5, 14), 'Io': (5, 14), 'Juggernaut': (5, 14), 'Keeper of the Light': (6, 14), "Nature's Prophet": (5, 14), 'Nyx Assassin': (5, 14), 'Outworld Devourer': (6, 14), 'Queen of Pain': (5, 14)}


{'Ancient Apparition': (6, 14, ''),
 'Anti-Mage': (21, 30, ''),
 'Broodmother': (37, 46, ''),
 'Centaur Warrunner': (54, 62, ''),
 'Clinkz': (69, 78, ''),
 'Io': (85, 94, ''),
 'Juggernaut': (101, 110, ''),
 'Keeper of the Light': (118, 126, ''),
 "Nature's Prophet": (133, 142, ''),
 'Nyx Assassin': (149, 158, ''),
 'Outworld Devourer': (166, 174, ''),
 'Queen of Pain': (181, 190, '')}

In [5]:
# Keep only the rows inside each hero's vertical extents for the rest of the process.  "Queen of
# Pain" is not adversely affected since the descender of its "Q" is still visible.
vertically_cropped_hero_mean_images = {
    name: mean_image[slice(*hero_vertical_extents[name]), :]
    for name, mean_image in hero_mean_images.items()
}
image = np.vstack([*vertically_cropped_hero_mean_images.values()])
show_and_wait(image)
cv2.destroyAllWindows()

In [6]:
# Find the horizontal extents of the text in the images.  These are not the horizontal extents
# of the individual letters; they are the left edge of the left-most letter and the right edge
# of the right-most letter.

# Flag the columns that vary significantly from top to bottom.  With the current limit, the
# flags also capture the boundaries of the words.
d = {k: v.std(axis=0) > 0.011 for k, v in vertically_cropped_hero_mean_images.items()}
def fn(name, bools):
    """Return a copy of the named cropped image whose bottom row shows ``bools`` as white or black."""
    marked = vertically_cropped_hero_mean_images[name].copy()
    for column, flag in enumerate(bools):
        marked[-1, column] = 1.0 if flag else 0.0
    return marked
show_and_wait(np.vstack([fn(name, bools) for name, bools in d.items()]))
cv2.destroyAllWindows()

# Take the first and last flagged columns to constitute the left and right edge of each image.
def fn(bools):
    """Return ``(left, right)``: the index of the first true flag and one past the last true flag."""
    left = next(index for index, flag in enumerate(bools) if flag)
    trailing = next(index for index, flag in enumerate(reversed(bools)) if flag)
    return left, len(bools) - trailing
hero_horizontal_extents = {name: fn(flags) for name, flags in d.items()}
d = hero_horizontal_extents
d

{'Ancient Apparition': (17, 117),
 'Anti-Mage': (36, 98),
 'Broodmother': (25, 109),
 'Centaur Warrunner': (16, 118),
 'Clinkz': (47, 87),
 'Io': (61, 74),
 'Juggernaut': (30, 104),
 'Keeper of the Light': (17, 118),
 "Nature's Prophet": (15, 119),
 'Nyx Assassin': (28, 106),
 'Outworld Devourer': (17, 117),
 'Queen of Pain': (24, 110)}

In [7]:
# Keep only the columns inside each hero's horizontal extents for the rest of the process.
fully_cropped_hero_mean_images = {
    name: cropped[:, slice(*hero_horizontal_extents[name])]
    for name, cropped in vertically_cropped_hero_mean_images.items()
}
fully_cropped_max_width = max(cropped.shape[1] for cropped in fully_cropped_hero_mean_images.values())
# Pad every image with black on the right so that they can be stacked into one image.
padded = [
    np.hstack([cropped, np.zeros((cropped.shape[0], fully_cropped_max_width - cropped.shape[1]))])
    for cropped in fully_cropped_hero_mean_images.values()
]
image = np.vstack(padded)
show_and_wait(image)
cv2.destroyAllWindows()

In [8]:
# Find the horizontal extents of the letters.  Note that there are three instances of letters
# having overlapping bright pixels due to kerning:  "TA" and "WA" in "CENTAUR WARRUNNER" and "AT"
# in "NATURE'S PROPHET".

# Record the brightest value of each column of each hero's fully-cropped mean image.
hero_maxima = {
    name: [np.max(column) for column in cropped.T]
    for name, cropped in fully_cropped_hero_mean_images.items()
}
d = hero_maxima

# Treat a column whose maximum is below a threshold as part of a space between letters.
hero_spaces = {
    name: [maximum < .312 for maximum in maxima]
    for name, maxima in hero_maxima.items()
}

# Show those values as white pixels in the bottom row of the letters' image.
def fn(image, l):
    """Return a copy of ``image`` with the bottom-row pixel set to white in every column flagged in ``l``."""
    marked = image.copy()
    flagged_columns = [column for column, flag in enumerate(l) if flag]
    for column in flagged_columns:
        marked[-1:, column] = 1.0
    return marked
g = (
    fn(cropped, spaces)
    for cropped, spaces in zip(fully_cropped_hero_mean_images.values(), hero_spaces.values())
)
# Pad every marked image with black on the right so that they can be stacked into one image.
padded = [np.hstack([v, np.zeros((v.shape[0], fully_cropped_max_width - v.shape[1]))]) for v in g]
show_and_wait(np.vstack(padded))
cv2.destroyAllWindows()

# Determine the horizontal letter extents.
def fn(name, l):
    """Convert one hero's per-column space flags into ``[left, right]`` letter extents.

    ``l`` is the hero's list of per-column flags (true where the column maximum fell below the
    space threshold).  The list passed in is modified in place.  The returned column indices
    are relative to the fully-cropped mean image of ``name``.

    Raises:
        AssertionError: the flag transitions do not pair up into whole letters.
    """
    # Ignore space flags that occur at the beginning of the image.
    for i, _ in enumerate(it.takewhile(lambda v: v, l)):
        l[i] = False

    # Ignore space flags that occur at the end of the image.
    for i, _ in enumerate(it.takewhile(lambda v: v, reversed(l))):
        l[~i] = False

    # Every flip between adjacent flags is an edge between a letter and a space.  Record the
    # index of the column just after each flip.
    g = (a ^ b for a, b in it.pairwise(l))
    g = (i + 1 for i, b in enumerate(g) if b)
    # Bracket the edges with the left and right borders of the image.
    g = it.chain([0], g, [fully_cropped_hero_mean_images[name].shape[1]])
    l = list(g)
    # The edges alternate between letter start and letter end, so their count must be even.
    if len(l) % 2:
        raise AssertionError()
    # l[i] ends one letter and l[j] starts the next, so diff is the width of the gap between them.
    for i in range(1, len(l) - 1, 2):
        j = i + 1
        diff = l[j] - l[i]
        if diff:
            # Grow the neighbouring letters into the gap.  Gaps of one to three columns are
            # absorbed completely; wider gaps shrink by one column on each side.
            l[i] += 2 if 2 < diff < 4 else 1
            l[j] -= 1 if diff > 1 else 0
    # Pair consecutive edges into (left, right) extents.
    g = iter(l)
    g = zip(g, g)

    # Convert the elements from tuples to lists since I need to apply fix-ups.
    return [list(t) for t in g]
hero_horizontal_letter_extents = {k: fn(k, v) for k, v in hero_spaces.items()}

# Add extents for the kerned pairs.  Each kerned pair was detected as a single extent, so split
# that extent in two at a hand-picked column.
l = hero_horizontal_letter_extents['Centaur Warrunner']
# Split element 3 at column 23 and element 6 at column 52.
l = l[:3] + [[l[3][0], 23], [23, l[3][1]]] + l[4:6] + [[l[6][0], 52], [52, l[6][1]]] + l[7:]
hero_horizontal_letter_extents['Centaur Warrunner'] = l
l = hero_horizontal_letter_extents["Nature's Prophet"]
# Split element 1 at column 15.
l = l[:1] + [[l[1][0], 15], [15, l[1][1]]] + l[2:]
hero_horizontal_letter_extents["Nature's Prophet"] = l

# Add other fix-ups.  The indices refer to the lists after the kerned pairs were split above.
hero_horizontal_letter_extents['Centaur Warrunner'][7][1] += 1 # This will require an image fix-up.
hero_horizontal_letter_extents['Keeper of the Light'][7][1] += 1
hero_horizontal_letter_extents["Nature's Prophet"][0][1] += 1
hero_horizontal_letter_extents["Nature's Prophet"][1][0] += 1
hero_horizontal_letter_extents["Nature's Prophet"][7][1] += 1
hero_horizontal_letter_extents["Nature's Prophet"][10][1] += 1

# Convert the elements and their sub-elements to tuples.
hero_horizontal_letter_extents = {k: tuple(tuple(l) for l in v) for k, v in hero_horizontal_letter_extents.items()}

def fn(name, extents):
    """Return the named fully-cropped image with a two-row ruler of the letter extents appended.

    Consecutive extents are drawn on alternating ruler rows so that touching or overlapping
    extents stay distinguishable.
    """
    cropped = fully_cropped_hero_mean_images[name]
    ruler = np.zeros((2, cropped.shape[1]))
    for index, (left, right) in enumerate(extents):
        ruler[index % 2, left:right] = 1.0
    return np.vstack([cropped, ruler])
g = (fn(name, extents) for name, extents in hero_horizontal_letter_extents.items())
# Pad every image with black on the right so that they can be stacked into one image.
padded = [np.hstack([v, np.zeros((v.shape[0], fully_cropped_max_width - v.shape[1]))]) for v in g]
show_and_wait(np.vstack(padded))
cv2.destroyAllWindows()
hero_horizontal_letter_extents

{'Ancient Apparition': ((0, 7),
  (7, 13),
  (13, 19),
  (19, 23),
  (23, 28),
  (28, 35),
  (35, 41),
  (43, 50),
  (50, 56),
  (56, 61),
  (61, 67),
  (67, 73),
  (73, 77),
  (77, 83),
  (83, 86),
  (86, 93),
  (93, 100)),
 'Anti-Mage': ((0, 8),
  (8, 15),
  (15, 23),
  (23, 27),
  (27, 32),
  (32, 40),
  (40, 48),
  (48, 55),
  (55, 62)),
 'Broodmother': ((0, 6),
  (6, 14),
  (14, 22),
  (22, 30),
  (30, 38),
  (38, 47),
  (47, 56),
  (56, 63),
  (63, 70),
  (70, 77),
  (77, 84)),
 'Centaur Warrunner': ((0, 6),
  (6, 11),
  (11, 18),
  (18, 23),
  (23, 30),
  (30, 35),
  (35, 42),
  (44, 53),
  (52, 59),
  (59, 65),
  (65, 71),
  (71, 77),
  (77, 84),
  (84, 90),
  (90, 95),
  (95, 102)),
 'Clinkz': ((0, 7), (7, 13), (13, 17), (17, 25), (25, 33), (33, 40)),
 'Io': ((0, 4), (4, 13)),
 'Juggernaut': ((0, 6),
  (6, 14),
  (14, 21),
  (21, 29),
  (29, 35),
  (35, 43),
  (43, 51),
  (51, 59),
  (59, 66),
  (66, 74)),
 'Keeper of the Light': ((0, 6),
  (6, 11),
  (11, 17),
  (17, 23),
  (

In [9]:
# Extract the letter images for each Hero using the uncropped mean images.

def fn(name, text_left):
    """Return the list of letter images of hero ``name``, one per horizontal letter extent.

    ``text_left`` is the column at which the text starts in the uncropped mean image; it is
    added to the letter extents, which are relative to the fully-cropped image.  Every letter
    image spans the full height of the mean image.
    """
    image = hero_mean_images[name]
    g = ((text_left + left, text_left + right) for left, right in hero_horizontal_letter_extents[name])
    # NOTE(review): basic slicing yields views, so these letter images share memory with
    # hero_mean_images[name] and with each other wherever their extents overlap.
    letter_images = [image[:, left:right] for left, right in g]

    # Fix up the "W" in "CENTAUR WARRUNNER".
    if name == 'Centaur Warrunner':
        row = hero_vertical_extents[name][1] - 3
        # Overwrite one pixel in the last column of the "W" with the pixel below it.  Because
        # of the shared memory noted above, this also changes the mean image -- confirm that
        # this is intended.
        letter_images[7][row, -1] = letter_images[7][row + 1, -1]

    return letter_images
d = {name: fn(name, left) for name, (left, _) in hero_horizontal_extents.items()}

# Verify that every hero produced exactly one letter image per non-space character of its name.
expected_and_actual = [
    (sum(1 for c in name if c != ' '), len(images))
    for name, images in d.items()
]
if any(expected != actual for expected, actual in expected_and_actual):
    raise AssertionError()

# Organize the letter images by upper-cased letter, with the letters in sorted order.
labelled = [
    (c.upper(), letter_image)
    for name, images in d.items()
    for c, letter_image in zip((c for c in name if c != ' '), images)
]
l = sorted(labelled, key=op.itemgetter(0))
d = {
    letter: [letter_image for _, letter_image in group]
    for letter, group in it.groupby(l, key=op.itemgetter(0))
}

# Show one row of images per letter, padded with black on the right to the widest row.
row_widths = {letter: sum(i.shape[1] for i in images) for letter, images in d.items()}
max_width = max(row_widths.values())
l = [
    np.hstack(images + [np.zeros((images[0].shape[0], max_width - row_widths[letter]))])
    for letter, images in d.items()
]
show_and_wait(np.vstack(l))
cv2.destroyAllWindows()
hero_letter_images = d
{k: (len(v), ' '.join([str(i.shape[1]) for i in v])) for k, v in hero_letter_images.items()}

{"'": (1, '3'),
 '-': (1, '5'),
 'A': (12, '7 7 6 8 8 7 7 8 7 9 7 7'),
 'B': (1, '6'),
 'C': (3, '6 6 7'),
 'D': (3, '8 6 7'),
 'E': (16, '5 7 7 5 5 6 5 6 5 5 6 6 4 5 6 7'),
 'F': (2, '6 6'),
 'G': (4, '7 7 8 6'),
 'H': (4, '7 7 7 8'),
 'I': (9, '4 4 3 4 4 4 4 4 4'),
 'J': (1, '6'),
 'K': (2, '8 6'),
 'L': (3, '6 5 5'),
 'M': (2, '8 9'),
 'N': (14, '6 7 7 7 7 7 6 8 8 8 8 9 8 9'),
 'O': (11, '7 8 8 9 9 8 9 7 7 6 9'),
 'P': (6, '6 5 6 6 7 7'),
 'Q': (1, '10'),
 'R': (14, '6 8 7 7 6 6 7 8 6 7 7 6 5 6'),
 'S': (5, '7 6 6 6 6'),
 'T': (11, '6 6 8 7 5 8 7 6 7 7 5'),
 'U': (8, '5 6 8 7 7 7 7 7'),
 'V': (1, '7'),
 'W': (2, '9 8'),
 'X': (1, '8'),
 'Y': (1, '7'),
 'Z': (1, '7')}

In [10]:
# Create a mask for each letter.

# Candidate transfer functions for turning deviations into mask values.  The shifted cube root
# looks good.
def _shift_and_clip(v, offset):
    """Return a copy of ``v`` lowered by ``offset``, with negative results replaced by zero."""
    shifted = v.copy()
    shifted -= offset
    shifted[shifted < 0] = 0
    return shifted
def linear(v):
    """Return ``v`` unchanged."""
    return v
def sqrt(v):
    """Return the square root of ``v``."""
    return np.sqrt(v)
def curt(v):
    """Return the cube root of ``v``."""
    return v ** (1/3)
def sigmoid(v):
    """Return the logistic function of ``v``."""
    return 1 / (1 + np.exp(-v))
def adjusted_sigmoid(v):
    """Return the logistic function of ``v - 0.125``, rescaled so its values span [0, 1]."""
    squashed = sigmoid(v - 0.125)
    lowest, highest = np.min(squashed), np.max(squashed)
    return (squashed - lowest) / (highest - lowest)
def shifted_sqrt(v):
    """Return the square root of ``v`` after lowering it by 0.2, clipping at zero and dividing by 0.8."""
    return np.sqrt(_shift_and_clip(v, 0.2) / 0.8)
def shifted_curt(v):
    """Return the cube root of ``v`` after lowering it by 0.1, clipping at zero and dividing by 0.9."""
    return (_shift_and_clip(v, 0.1) / 0.9) ** (1/3)

# Create a mask from how far each pixel deviates from the grey background, measured in standard
# deviations of the background pixels in rows 2 and 3 (zero-based).  Do this for each image
# since their grey backgrounds are different from each other.
def fn(image):
    """Return the float32 mask of one letter image."""
    background = image[2:4, :]
    deviation = np.abs(image - np.mean(background)) / np.std(background)
    # Normalize by the largest deviation before applying the transfer function.
    return shifted_curt(deviation / np.max(deviation)).astype(np.float32)
d = {letter: [fn(letter_image) for letter_image in images] for letter, images in hero_letter_images.items()}
# Show one row of masks per letter, padded with black on the right to the widest row.
l = [
    np.hstack(masks + [np.zeros((masks[0].shape[0], max_width - np.hstack(masks).shape[1]))])
    for masks in d.values()
]
show_and_wait(np.vstack(l))
cv2.destroyAllWindows()
hero_letter_masks = d
{k: (len(v), ' '.join([str(i.shape[1]) for i in v])) for k, v in hero_letter_masks.items()}

{"'": (1, '3'),
 '-': (1, '5'),
 'A': (12, '7 7 6 8 8 7 7 8 7 9 7 7'),
 'B': (1, '6'),
 'C': (3, '6 6 7'),
 'D': (3, '8 6 7'),
 'E': (16, '5 7 7 5 5 6 5 6 5 5 6 6 4 5 6 7'),
 'F': (2, '6 6'),
 'G': (4, '7 7 8 6'),
 'H': (4, '7 7 7 8'),
 'I': (9, '4 4 3 4 4 4 4 4 4'),
 'J': (1, '6'),
 'K': (2, '8 6'),
 'L': (3, '6 5 5'),
 'M': (2, '8 9'),
 'N': (14, '6 7 7 7 7 7 6 8 8 8 8 9 8 9'),
 'O': (11, '7 8 8 9 9 8 9 7 7 6 9'),
 'P': (6, '6 5 6 6 7 7'),
 'Q': (1, '10'),
 'R': (14, '6 8 7 7 6 6 7 8 6 7 7 6 5 6'),
 'S': (5, '7 6 6 6 6'),
 'T': (11, '6 6 8 7 5 8 7 6 7 7 5'),
 'U': (8, '5 6 8 7 7 7 7 7'),
 'V': (1, '7'),
 'W': (2, '9 8'),
 'X': (1, '8'),
 'Y': (1, '7'),
 'Z': (1, '7')}

In [11]:
# Save a dictionary that maps each letter to a 2-tuple: the tuple of its letter images and the
# tuple of its masks.
def fn(l):
    """Return ``l`` as a tuple in which every element has itself been converted to a tuple."""
    return tuple(tuple(element) for element in l)
d = {
    letter: (fn(hero_letter_images[letter]), fn(masks))
    for letter, masks in hero_letter_masks.items()
}
with open(r"F:\Dota 2\Heroes\letters.pickle", 'bw') as fout:
    pickle.dump(d, fout)

In [12]:
# Stop here.  Raising deliberately halts a "Run All" so that the archived cells below, which
# are no longer used, do not execute.
raise ValueError()

ValueError: 

The rest of the code is archived and not used.

In [None]:
# Write whatever the global `image` currently holds to a PNG file.
cv2.imwrite(r"C:\Users\Chris\Pictures\tesst.png", image)

In [None]:
# Create a masking factor for each hero.  A masking factor is the stability of the pixels in an
# image.  The more stable the pixel, the higher is the value of the masking factor for that pixel.
# I tried using the variance instead of the standard deviation but it has too much noise.
directory_path = r'F:\Dota 2\Heroes\Pictures'
def fn(file_title):
    """Return the masking factor of the pictures whose file names start with ``file_title``.

    The result is a two-dimensional float32 array in [0, 1] in which background pixels have
    been thresholded to zero.

    Raises:
        FileNotFoundError: no file in ``directory_path`` starts with ``file_title``.
        OSError: a matching file could not be decoded as an image.
    """
    def fn(file_name: str):
        # Read the image.
        file_path = os.path.join(directory_path, file_name)
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None instead of raising.
            raise OSError(f'Could not read image {file_path!r}')

        # Color data adds no value to this methodology.
        # Keep only the color channel with the lowest total value.  This drops the
        # channel axis, changing the shape from (height, width, 3) to (height, width).
        return image[:, :, np.argmin(np.sum(image, axis=(0, 1)))]

    # Read and process all files in the Pictures directory for the given file title.
    # os.walk yields nothing for a missing directory, so fall back to an empty listing.
    _, _, file_names = next(os.walk(directory_path), (directory_path, [], []))
    g = (file_name for file_name in file_names if file_name.startswith(file_title))
    images = list(map(fn, g))
    if not images:
        raise FileNotFoundError(f'No pictures found for {file_title!r} in {directory_path!r}')

    # Combine the list of two-dimensional tensors into a single three-dimensional tensor.
    images = np.stack(images, axis=0)

    # Determine the desired factor by scaling the per-pixel standard deviations across the
    # images from [max, min] to [0, 1], so that the most stable pixels score highest.
    std = np.std(images, axis=0)
    desired_factor = (std - np.max(std)) / (np.min(std) - np.max(std))

    # Clean up the edges.
    desired_factor[-1, :] = 0

    # Apply a threshold to remove the background.
    image = (desired_factor * 255).astype(np.uint8)
    threshold, image = cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)
    # Raise the threshold until the left-most 16 columns are entirely black.
    while np.any(image[:, :16]):
        # Scaling a zero threshold would leave it at zero and loop forever, so step up to 1.
        threshold = threshold * 1.1 if threshold > 0 else 1.0
        print('updating', file_title, 'to', threshold)
        _, image = cv2.threshold(image, threshold, 0, cv2.THRESH_TOZERO)
    desired_factor = image.astype(np.float32) / 255
    return desired_factor

data = {s: fn(s) for s in file_titles}

show_and_wait(np.vstack(list(data.values())))
cv2.destroyAllWindows()

In [None]:
# Find the vertical extent of the text in each image.
def fn(name, image):
    """Return ``(name, top, bottom)`` for one image.

    ``top`` is the row after the largest rise in row maximum between adjacent rows and
    ``bottom`` is the row after the largest fall.
    """
    row_maxima = image.astype(np.float32).max(axis=1)
    steps = row_maxima[1:] - row_maxima[:-1]
    return name, np.argmax(steps) + 1, np.argmin(steps) + 1
hero_vertical_extents = {
    name: (top, bottom)
    for name, top, bottom in map(fn, data.keys(), data.values())
}
hero_vertical_extents
# I don't need these since my masks are all black above and below the letters.

In [None]:
# Consider the standard deviations of the values in each row.  That might provide a better idea of
# the vertical extents.
row_deviations = {k: v.std(axis=1) for k, v in hero_mean_images.items()}
is_flat_row = {k: v < 0.0125 for k, v in row_deviations.items()}
# Record the rows at which the flag flips.
d = {k: (np.flatnonzero(v[:-1] ^ v[1:]) + 1).tolist() for k, v in is_flat_row.items()}

# Show those values as white pixels to the right of the letters.
def fn(image, t):
    """Return a copy of ``image`` whose right half is white above row ``t[0]`` and from row ``t[1]`` down."""
    first_row, end_row = t
    marked = image.copy()
    middle = marked.shape[1] // 2
    marked[:first_row, middle:] = marked[end_row:, middle:] = 1.0
    return marked
image = np.vstack([
    fn(mean_image, extents)
    for mean_image, extents in zip(hero_mean_images.values(), d.values())
])
show_and_wait(image)
cv2.destroyAllWindows()
# Report each hero's extents as rows of the stacked image, stepping 16 rows per hero, and flag
# any hero that did not yield exactly two extents.
{k: (16 * m + v[0], 16 * m + v[1], '' if len(v) == 2 else 'not 2') for m, (k, v) in enumerate(d.items())}

In [None]:
# Create a file that multiple processes can partially load.  An existing file is left untouched.
file_path = r"D:\Dota 2\Heroes\Pickles\letter_images.pickle"
if not os.access(file_path, os.F_OK):
    with open(file_path, 'wb') as fout:
        # First record: a dictionary of letter image counts.
        counts = {letter: len(images) for letter, images in letter_images.items()}
        pickle.dump(counts, fout)
        # Following records: each letter's images, in lexical order of the letters.
        for k in sorted(letter_images):
            v = letter_images[k]
            print(k, len(v))
            pickle.dump(tuple(v), fout)

In [None]:
# Test partial loading.
def fn():
    """Return a dictionary of letters to 9999 randomly sampled images read from ``file_path``."""
    sampled = {}
    with open(file_path, 'rb') as fin:
        # The first record is the letter image count dictionary.
        counts = pickle.load(fin)
        # The records that follow hold each letter's images in lexical order.  Keep a sample.
        for letter in sorted(counts):
            sampled[letter] = random.sample(pickle.load(fin), k=9999)
    return sampled
sampled_letter_images = fn()
[(k, len(v)) for k, v in sampled_letter_images.items()]

In [None]:
# Create a dictionary of letters to images for each extent extracted from each image in each video file.
# Distinguish between large and small letter renderings by using an upper-case letter for the large ones and
# a lower-case letter for the small ones.
# File titles whose letters are stored under lower-case keys.
small_letters = {
    'ancient_apparition',
    'centaur_warrunner',
    'keeper_of_the_light',
    'outworld_devourer',
}
def fn():
    """Load the sized letter images from their pickle file, creating the file first if it is missing.

    Returns a dictionary of letters to their images.
    NOTE(review): relies on `selected_extents`, which is not defined in the visible notebook,
    and assumes `data` maps underscore-separated lower-case titles to sequences of images --
    confirm against the cell that builds them.
    """
    file_path = r"D:\Dota 2\Heroes\Pickles\sized_letter_images.pickle"
    if os.access(file_path, os.F_OK):
        with open(file_path, 'rb') as fin:
            # Read the dictionary of letter image counts.
            d = pickle.load(fin)
            # Read each letter's images in lexical order.  Each count is replaced by the images.
            for k in sorted(d):
                d[k] = pickle.load(fin)
    else:
        # Create a file that multiple processes can partially load.
        d = collections.defaultdict(list)
        for file_title, images in data.items():
            file_extents = selected_extents[file_title]
            # One letter per character of the title, skipping the underscores.
            letters = [c if file_title in small_letters else c.upper() for c in file_title if c != '_']
            for (left, right), letter in zip(file_extents, letters):
                for image in images:
                    # Zero the pixels at or below the Otsu threshold, then cut out the letter.
                    image = cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1]
                    image = image[:, left:right]
                    d[letter].append(image)
        with open(file_path, 'wb') as fout:
            # Write a dictionary of letter image counts.
            pickle.dump({k: len(v) for k, v, in d.items()}, fout)
            # Write each letter's images in lexical order.
            for k, v in sorted(d.items()):
                print(k, len(v))
                pickle.dump(tuple(v), fout)
    return d
sized_letter_images = fn()

In [None]:
print(*sorted(sized_letter_images))
def fn(fn):
    """Print the letters A-Z, in the case produced by ``fn``, that have no entry in ``sized_letter_images``."""
    alphabet = set(map(fn, map(chr, range(ord('A'), ord('Z') + 1))))
    present = {c for c in sized_letter_images if fn(c) == c}
    absent = alphabet - present
    print('missing', len(absent), sorted(absent))
fn(str.upper)
fn(str.lower)
[(k, len(v)) for k, v in sorted(sized_letter_images.items())]

In [None]:
# Display examples of each letter.
# NOTE(review): display_examples is not defined in the visible notebook.
display_examples(sized_letter_images)

In [None]:
# Select the minimum number of zeros to include images.
class Finder:
    """Read-only sequence that lets ``bisect`` search for a cut-off on the image sums.

    Item ``i`` is 1 when fewer than a tenth of the images have a sum of at least ``i`` and -1
    otherwise.  The length is the pixel count of the first image.
    """

    def __init__(self, images):
        self.__totals = [image.sum() for image in images]
        height, width = images[0].shape[:2]
        self.__size = height * width
        self.__count = len(images)

    def __getitem__(self, index):
        qualifying = sum(1 for total in self.__totals if total >= index)
        return 1 if qualifying * 10 < self.__count else -1

    def __len__(self):
        return self.__size

def fn(file_title, file_images):
    """Return a dictionary of extent index to the minimum number of zeros for one file title.

    NOTE(review): relies on `selected_extents`, which is not defined in the visible notebook.
    """
    print(file_title)
    # Turn every image into a boolean mask that is true where the thresholded pixel is zero.
    file_images = [cv2.threshold(i, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] == 0 for i in file_images]
    # NOTE(review): `letters` is not used below.
    letters = [c if file_title in small_letters else c.upper() for c in file_title if c != '_']
    def fn(t):
        index, horizontal_extent = t
        left, right = horizontal_extent
        #letter_images = [image[4:-3, left:right] for image in file_images]
        #minimum_nzero = (file_images[0].shape[0] - 7) * -op.sub(*selected_extents[file_title][1]) // 2
        letter_images = [image[:, left:right] for image in file_images]
        # Find the first count that fewer than a tenth of the letter images reach.
        minimum_nzero = bisect.bisect_left(Finder(letter_images), 0)
        return index, minimum_nzero - 1
    file_extents = selected_extents[file_title]
    d = dict(map(fn, enumerate(file_extents)))
    return d
#minimum_nzeros = {k: fn(k, v) for k, v in data.items()}

In [None]:
# These are the minimum numbers of zeros created above.  Each file title maps to a dictionary of
# extent index to the minimum number of zeros.
minimum_nzeros = {
    'ancient_apparition': {0: 86, 1: 100, 2: 87, 3: 43, 4: 56, 5: 82, 6: 87, 7: 101, 8: 69, 9: 70, 10: 101, 11: 82, 12: 43, 13: 87, 14: 43, 15: 97, 16: 82},
    'anti-mage': {0: 114, 1: 94, 2: 101, 3: 42, 4: 62, 5: 106, 6: 111, 7: 96, 8: 68},
    'broodmother': {0: 79, 1: 79, 2: 108, 3: 110, 4: 90, 5: 106, 6: 108, 7: 101, 8: 96, 9: 68, 10: 91},
    'centaur_warrunner': {0: 87, 1: 69, 2: 99, 3: 87, 4: 70, 5: 85, 6: 83, 7: 127, 8: 70, 9: 67, 10: 67, 11: 83, 12: 82, 13: 84, 14: 69, 15: 83},
    'clinkz': {0: 99, 1: 82, 2: 52, 3: 94, 4: 95, 5: 82},
    'io': {0: 42, 1: 108},
    'juggernaut': {0: 72, 1: 96, 2: 96, 3: 98, 4: 79, 5: 79, 6: 94, 7: 114, 8: 94, 9: 101},
    'keeper_of_the_light': {0: 85, 1: 69, 2: 72, 3: 69, 4: 71, 5: 82, 6: 95, 7: 70, 8: 88, 9: 84, 10: 72, 11: 73, 12: 43, 13: 87, 14: 84, 15: 88},
    "nature's_prophet": {0: 94, 1: 98, 2: 85, 3: 80, 4: 92, 5: 79, 6: 46, 7: 83, 8: 77, 9: 80, 10: 110, 11: 83, 12: 94, 13: 79, 14: 101},
    'nyx_assassin': {0: 93, 1: 100, 2: 95, 3: 97, 4: 84, 5: 84, 6: 111, 7: 81, 8: 81, 9: 52, 10: 94},
    'outworld_devourer': {0: 99, 1: 83, 2: 87, 3: 115, 4: 97, 5: 69, 6: 70, 7: 81, 8: 81, 9: 68, 10: 88, 11: 99, 12: 83, 13: 83, 14: 70, 15: 69},
    'queen_of_pain': {0: 134, 1: 79, 2: 79, 3: 67, 4: 93, 5: 123, 6: 81, 7: 82, 8: 95, 9: 52, 10: 94},
}

In [None]:
# Inspect the images resulting from applying the minimum numbers of zeros.
def fn(file_title, index, minimum_nzeros):
    """Show the letter images of one extent whose zero count is within 10% above ``minimum_nzeros``.

    Returns whatever show_and_wait returns.
    NOTE(review): relies on `selected_extents` and `grouper`, which are not defined in the
    visible notebook.
    """
    g = (image for image in data[file_title])
    g = (cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] for image in g)
    left, right = selected_extents[file_title][index]
    g = (image[:, left:right] for image in g)
    # Keep the images whose zero count lies in [minimum_nzeros, 1.1 * minimum_nzeros).
    g = (image for image in g if minimum_nzeros * 1.1 > (image == 0).sum() >= minimum_nzeros)
    # Take the first group of 1200 and lay it out 60 images to a row.
    g = next(grouper(g, 1200))
    image = np.vstack([np.hstack(list(g)) for g in grouper(g, 60)])
    #print(file_title, index, minimum_nzeros, [c for c in file_title if c != '_'][index].upper())
    return show_and_wait(image)
g = ((s, i, m) for s in file_titles for i, m in sorted(minimum_nzeros[s].items()))
# Step through the extents until fn returns 'q'.
list(it.takewhile(lambda t: fn(*t) != 'q', g))
cv2.destroyAllWindows()

In [None]:
# Create a dictionary of clean letters to images for each extent extracted from each image in each video file.
# Distinguish between large and small letter renderings by using an upper-case letter for the large ones and
# a lower-case letter for the small ones.
def fn():
    """Load the clean letter images from their pickle file, creating the file first if it is missing.

    An image counts as clean when its zero count lies in [m, 1.1 * m), where m is the minimum
    number of zeros recorded for its extent.
    NOTE(review): relies on `selected_extents`, which is not defined in the visible notebook.
    """
    file_path = r"D:\Dota 2\Heroes\Pickles\clean_letter_images.pickle"
    if os.access(file_path, os.F_OK):
        with open(file_path, 'rb') as fin:
            # Read the dictionary of letter image counts.
            d = pickle.load(fin)
            # Read each letter's images in lexical order.  Each count is replaced by the images.
            for k in sorted(d):
                d[k] = pickle.load(fin)
        print({k: len(v) for k, v, in d.items()})
    else:
        # Create a file that multiple processes can partially load.
        d = collections.defaultdict(list)
        for file_title, images in data.items():
            file_minimum_nzeros = minimum_nzeros[file_title]
            # Threshold every image once per file title.
            l = [cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] for image in images]
            file_extents = selected_extents[file_title]
            letters = [c if file_title in small_letters else c.upper() for c in file_title if c != '_']
            for (left, right), (index, letter) in zip(file_extents, enumerate(letters)):
                m = file_minimum_nzeros[index]
                for image in l:
                    image = image[:, left:right]
                    if m * 1.1 > (image == 0).sum() >= m:
                        d[letter].append(image)
        with open(file_path, 'wb') as fout:
            # Write a dictionary of letter image counts.
            pickle.dump({k: len(v) for k, v, in d.items()}, fout)
            # Write each letter's images in lexical order.
            for k, v in sorted(d.items()):
                print(k, len(v))
                pickle.dump(tuple(v), fout)
    return d
clean_letter_images = fn()

In [None]:
# Inspect clean letter images.
def fn():
    """Show up to 1200 images of each clean letter, 60 to a row, until show_and_wait returns 'q'."""
    for letter, images in clean_letter_images.items():
        rows = [np.hstack(list(row)) for row in grouper(images[:1200], 60)]
        image = np.vstack(rows)
        print(letter)
        if show_and_wait(image) == 'q':
            return
#fn()
cv2.destroyAllWindows()

Sandbox

In [None]:
# Compare training and validation.

In [None]:
# Minimum number of background columns kept clear on each side of the rendered text.
horizontal_margin = 5
def put_text(background: np.ndarray, text: str, letter_images):
    """Render ``text`` at a random horizontal position on a copy of ``background``.

    Each character is drawn with an image picked at random from ``letter_images[character]``.
    Returns the composed image with a trailing axis of length one added, or None when the
    rendered text is too wide to fit between the horizontal margins.
    """
    image = background.copy()

    # Construct the image of the text.
    dtype, height, width = image.dtype, *image.shape
    def make_letter(letter):
        image = random.choice(letter_images[letter])
        return image
    def make_space():
        # Follow a letter with a wide gap (7 or 8 columns) with probability .167 and with a
        # narrow gap (0 or 1 column) otherwise.
        width = random.randint(7, 8) if random.random() < .167 else random.randint(0, 1)
        image = np.zeros((height, width), dtype)
        return image
    def fn():
        a = map(make_letter, text)
        b = (make_space() for _ in text)
        # Interleave the letters and gaps, then drop the gap that follows the last letter.
        g = it.chain.from_iterable(zip(a, b))
        image = np.hstack(list(g)[:-1])
        if image.shape[1] < width - 2 * horizontal_margin:
            # Select a random position for the rendered text in the background.
            left = random.randrange(horizontal_margin, width - horizontal_margin - image.shape[1])
            right = width - image.shape[1] - left
            return np.hstack([np.zeros((height, left), dtype), image, np.zeros((height, right), dtype)])
    text_image = fn()
    if text_image is None:
        # The text renders too wide.  Try again.
        return

    # Apply the rendered greyscale text to the background.  The text replaces the background
    # wherever the text is non-zero, both before and after thresholding.
    image = image * (text_image == 0) + text_image
    image = cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1]
    image = image * (text_image == 0) + text_image

    # Add the removed dimension.
    image = image[:, :, np.newaxis]

    return image
image = put_text(np.zeros((16, 120), np.uint8), 'ACDEF', clean_letter_images)
show_and_wait(image)
cv2.destroyAllWindows()

In [None]:
# Select 1200 of each letter.
# Manually select ranges of images to exclude.
nimages = 1200
ncolumns = 60
window_name = 'tesst'
cv2.namedWindow(window_name, cv2.WINDOW_NORMAL | cv2.WINDOW_GUI_EXPANDED)
def fn(file_title, horizontal_extent):
    """Interactively choose ``nimages`` images of one letter and return their indices.

    A left click marks the start of a range and a right click removes the range up to and
    including the clicked image (or only the clicked image when no start is marked).  Removed
    images are replaced by the images that follow the last selected index.
    NOTE(review): relies on `grouper`, which is not defined in the visible notebook.
    """
    images = data[file_title]
    indices = list(range(nimages))
    left, right = horizontal_extent
    def fn():
        # Build the grid of the currently selected letter images, ncolumns to a row.
        g = (cv2.threshold(images[i], 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1][:, left:right] for i in indices)
        g = grouper(g, ncolumns)
        return np.vstack([np.hstack(list(g)) for g in g])
    # Holds at most one element: the grid position of the last left click.
    initial_index = []
    height, width = images[0].shape[0], right - left
    def handle_mouse_event(event, x, y, *_):
        if event == cv2.EVENT_LBUTTONDOWN:
            initial_index.clear()
            # Convert the pixel position into a position in the grid.
            initial_index.append((y // height) * ncolumns + x // width)
        elif event == cv2.EVENT_RBUTTONDOWN:
            index = (y // height) * ncolumns + x // width
            begin, end = initial_index[0] if initial_index else index, index + 1
            del indices[begin:end]
            # Top the selection back up to nimages with the next unseen indices.
            begin = indices[-1] + 1
            end = begin + nimages - len(indices)
            indices.extend(range(begin, end))
            cv2.imshow(window_name, fn())
            initial_index.clear()
    cv2.setMouseCallback(window_name, handle_mouse_event)
    show_and_wait(fn())
    return indices
#fn('io', selected_extents['io'][1])
cv2.destroyAllWindows()

In [None]:
# Select 1200 of each letter.
# Manually select the minimum number of zeros to include images.
nimages = 1200
ncolumns = 60
window_name = 'tesst'
cv2.namedWindow(window_name, cv2.WINDOW_NORMAL | cv2.WINDOW_GUI_EXPANDED)
def fn(file_title, horizontal_extent):
    """Interactively tune the minimum number of zeros for one letter.

    The keys '-' and '+' change the minimum by 1, '/' and '*' change it by 10, 'p' prints the
    current value and 'q' returns ``(minimum_nzero, number of matching images)``.
    NOTE(review): relies on `selected_extents` and `grouper`, which are not defined in the
    visible notebook.
    """
    images = data[file_title]
    # Start at half the pixel count of an extent.
    # NOTE(review): this uses the width of selected_extents[file_title][1] rather than of
    # horizontal_extent -- confirm that this is intended.
    minimum_nzero = images[0].shape[0] * -op.sub(*selected_extents[file_title][1]) // 2
    left, right = horizontal_extent
    def fn():
        # Shuffle the images, threshold them, cut out the letter and keep those with enough zeros.
        l = random.sample(images, len(images))
        g = (cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] for image in l)
        g = (image[:, left:right] for image in g)
        return [image for image in g if (image == 0).sum() >= minimum_nzero]
    while True:
        l = fn()
        if len(l) < nimages:
            # Too few images match, so show a single black pixel instead of a grid.
            image = np.zeros([1, 1], dtype=images[0].dtype)
        else:
            image = np.vstack([np.hstack(list(g)) for g in grouper(l[:nimages], ncolumns)])
        ch = show_and_wait(image)
        if ch == '-':
            minimum_nzero -= 1
        elif ch == '+':
            minimum_nzero += 1
        elif ch == '/':
            minimum_nzero -= 10
        elif ch == '*':
            minimum_nzero += 10
        elif ch == 'p':
            print(minimum_nzero, len(l))
        elif ch == 'q':
            return minimum_nzero, len(fn())
#print(fn('io', selected_extents['io'][1]))
cv2.destroyAllWindows()