In [None]:
import bisect
import collections
import cv2
import itertools as it
import numpy as np
import operator as op
import os
import pickle
import random

def show_and_wait(image):
    """Show ``image`` in the 'tesst' window and block until a key is pressed.

    Returns:
        The pressed key as a one-character string, so callers can compare the
        result with values such as 'q'.  An empty string is returned when
        ``cv2.waitKey`` reports no key (a negative code).
    """
    cv2.imshow('tesst', image)
    key = cv2.waitKey()
    if key < 0:
        return ''
    # Keep only the low byte; some platforms set extra modifier bits above it.
    return chr(key & 0xFF)

In [None]:
# Hero names used as file-name prefixes when reading the pictures.
file_titles = [
    'Ancient Apparition',
    'Anti-Mage',
    'Broodmother',
    'Centaur Warrunner',
    'Clinkz',
    'Io',
    'Juggernaut',
    'Keeper of the Light',
    "Nature's Prophet",
    'Nyx Assassin',
    'Outworld Devourer',
    'Queen of Pain',
]
# The distinct characters, case-folded, that occur anywhere in the hero names.
s = {character.lower() for title in file_titles for character in title}
# Report the number of names, the character inventory and its size.
print(len(file_titles), ''.join(sorted(s)), len(s))

In [None]:
directory_path = r'F:\Dota 2\Heroes\Pictures'
def fn(file_title):
    """Build a mask of the pixels that stay constant across one hero's pictures.

    Every file directly inside ``directory_path`` whose name starts with
    ``file_title`` is read and reduced to one colour channel.  Pixels whose
    value varies least across those pictures get a factor near 1, the most
    variable pixels a factor near 0, and the result is thresholded so that the
    background becomes exactly 0.

    Args:
        file_title: File-name prefix selecting the pictures of one hero.

    Returns:
        A two-dimensional float32 array of factors in [0, 1].

    Raises:
        ValueError: If no picture matches ``file_title`` or one cannot be read.
    """
    def read_channel(file_name: str):
        # Read the image.  cv2.imread signals failure by returning None.
        file_path = os.path.join(directory_path, file_name)
        image = cv2.imread(file_path)
        if image is None:
            raise ValueError('could not read image: ' + file_path)

        # Color data adds no value to this methodology.
        # Take the color channel with the lowest total value.  This changes
        # the shape of the image from (60, 160, 3) to (60, 160).
        return image[:, :, np.argmin(np.sum(image, axis=(0, 1)))]

    # Read and process all files in the Pictures directory for the given file title.
    # os.walk yields nothing for a missing directory, hence the default.
    file_names = next(os.walk(directory_path), (None, None, []))[2]
    images = [read_channel(name) for name in file_names if name.startswith(file_title)]
    if not images:
        raise ValueError('no pictures found for ' + repr(file_title) + ' in ' + directory_path)

    # Combine the list of two-dimensional tensors into a single three-dimensional tensor.
    images = np.stack(images, axis=0)

    # Determine the desired factor by scaling the per-pixel standard deviation
    # across the pictures from [max, min] to [0, 1].
    std = np.std(images, axis=0)
    desired_factor = (std - np.max(std)) / (np.min(std) - np.max(std))

    # Clean up the edges: force the bottom row to zero.
    desired_factor[-1, :] = 0

    # Apply a threshold to remove the background.
    image = (desired_factor * 255).astype(np.uint8)
    threshold, image = cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)
    # Raise the threshold until the 16 left-most columns are completely zero
    # (presumably those columns never contain text -- confirm).
    while np.any(image[:, :16]):
        # A threshold of 0 would stay 0 when multiplied, looping forever.
        threshold = max(threshold * 1.1, 1)
        print('updating', file_title, 'to', threshold)
        _, image = cv2.threshold(image, threshold, 0, cv2.THRESH_TOZERO)
    desired_factor = image.astype(np.float32) / 255
    return desired_factor

data = {s: fn(s) for s in file_titles}

In [None]:
# Preview all masks stacked top to bottom, then close the window after a key press.
show_and_wait(np.vstack(list(data.values())))
cv2.destroyAllWindows()

In [None]:
# Find the vertical extent of the text in each image.
def fn(name, image):
    """Locate the rows where the brightest pixel rises and falls most sharply.

    Returns ``(name, top, bottom)``: ``top`` is the row following the largest
    increase of the per-row maximum, ``bottom`` the row following the largest
    decrease.
    """
    row_peaks = image.astype(np.float32).max(axis=1)
    steps = np.diff(row_peaks)
    top = np.argmax(steps) + 1
    bottom = np.argmin(steps) + 1
    return name, top, bottom
# Map each hero name to the (top, bottom) rows reported by fn for its mask.
vertical_extents = {name: (top, bottom) for name, top, bottom in it.starmap(fn, data.items())}
vertical_extents
# I don't need these since my masks are all black above and below the letters.

In [None]:
# Find the horizontal extent of each letter in each image.
def fn(name, image):
    """Find the horizontal extent of every letter of ``name`` in ``image``.

    The image is collapsed to a per-column brightness profile (maximum over
    the rows).  Of 1000 evenly spaced brightness limits between 0 and the
    profile maximum, the lowest one that the profile crosses at least twice
    per non-space character of ``name`` is selected.  The columns where the
    profile rises to and falls below that limit become the letter edges, each
    widened by a one-pixel margin.

    Args:
        name: Text shown in the image; spaces are not counted as letters.
        image: Two-dimensional array of brightness values.

    Returns:
        ``(name, extents)`` where ``extents`` is a list of ``(left, right)``
        column pairs usable as ``image[:, left:right]``.

    Raises:
        ValueError: If no limit separates enough letters.  (Previously a
            StopIteration escaped here, which an enclosing iterator-driven
            loop would mistake for normal exhaustion.)
        AssertionError: If a letter is still open at the right border.
    """
    # Count the number of non-space characters.
    n = sum(1 for c in name if c != ' ')

    # Compute the max along the vertical axis to get a brightness profile.
    profile = np.max(image, axis=0)
    max_limit = np.max(profile)

    def count_crossing_pairs(limit):
        # Half the number of positions where the profile switches sides of the limit.
        below = profile < limit
        return np.sum(below[1:] ^ below[:-1]) // 2

    # Step through the limits in small increments and stop at the first one
    # with enough crossings.
    step = next((i for i in range(1000) if count_crossing_pairs(max_limit * i / 999) >= n), None)
    if step is None:
        raise ValueError('could not separate %d letters for %r' % (n, name))
    limit = max_limit * step / 999

    # Collect the indices of where the profile crosses above then below the limit.
    # These constitute the left and right edges of the letters.
    edges = []
    inside_letter = False
    for i, v in enumerate(profile):
        if not inside_letter and v >= limit:
            # Take a one-pixel margin to the left, but never a negative index:
            # -1 would wrap around when used as a slice start.
            edges.append(max(i - 1, 0))
            inside_letter = True
        elif inside_letter and v < limit:
            # Take a one-pixel margin to the right.
            edges.append(i + 1)
            inside_letter = False
    if len(edges) % 2:
        raise AssertionError('letter touching the right border in %r' % (name,))
    pairs = iter(edges)
    return name, list(zip(pairs, pairs))
# Map each hero name to its list of (left, right) letter extents.
horizontal_extents = {name: l for name, l in it.starmap(fn, data.items())}
# Display the extents compactly, one string per hero.
{k: ' '.join(map(str, v)) for k, v in horizontal_extents.items()}

In [None]:
# Extract the letter images for each Hero.
def fn(name):
    """Cut the mask of hero ``name`` into one image per letter extent.

    Args:
        name: Key into ``data`` and ``horizontal_extents``.

    Returns:
        A list with one two-dimensional array per ``(left, right)`` extent,
        in extent order.  The arrays are independent copies, so ``data`` is
        left untouched.
    """
    def soften_margins(letter):
        # Due to taking additional one-pixel margins to the left and right, ensure the left- and
        # right-most columns have values no greater than the columns next to them.
        letter[:, 0] = np.min(letter[:, :2], axis=1)
        letter[:, -1] = np.min(letter[:, -2:], axis=1)
        return letter
    image = data[name]
    # Copy each slice before editing it: a plain slice is a view, so the margin
    # adjustment would otherwise write into the shared mask (and into any
    # neighbouring letter whose extent overlaps this one).
    return [soften_margins(image[:, left:right].copy()) for left, right in horizontal_extents[name]]
# Map each hero name to its list of letter images.
hero_letter_images = {name: fn(name) for name in horizontal_extents}
# Pad each hero's row of letters with zero columns on the right up to the
# widest row so that all rows can be stacked into one preview image.
max_width = max(np.hstack(letters).shape[1] for letters in hero_letter_images.values())
l = [np.hstack(letters + [np.zeros((letters[0].shape[0], max_width - np.hstack(letters).shape[1]))]) for letters in hero_letter_images.values()]
show_and_wait(np.vstack(l))
cv2.destroyAllWindows()
# Display the shape of every letter image, one string per hero.
{k: ' '.join([str(i.shape) for i in v]) for k, v in hero_letter_images.items()}

In [None]:
# Create a dictionary of letters to tuples of letter images.
def fn(k, v):
    """Pair each non-space character of ``k``, upper-cased, with the item of ``v`` at the same position."""
    visible_characters = filter(lambda c: c != ' ', k)
    upper_cased = map(lambda c: c.upper(), visible_characters)
    return zip(upper_cased, v)
# Flatten all (letter, image) pairs and sort them by letter only; the key is
# needed because groupby merges only consecutive items with equal keys.
l = sorted(it.chain.from_iterable(fn(k, v) for k, v in hero_letter_images.items()), key=lambda t: t[0])
g = it.groupby(l, lambda t: t[0])
# Map each letter to the list of its images.
letter_images = {c: [v for _, v in g] for c, g in g}
# Display the type of every stored image as a sanity check.
{c: [type(v) for v in l] for c, l in letter_images.items()}

In [None]:
# Construct and display some random text.
# NOTE(review): this cell treats data[file_title] as a collection of images and
# reads selected_extents, which is not defined in the visible notebook --
# confirm both before running on a fresh kernel.
image = next(iter(data.values()))[0]
dtype, height, width = image.dtype, *image.shape
def make_letter():
    """Return a random letter crop: random hero, random extent, random image, Otsu to-zero thresholded."""
    file_title, extents = random.choice(tuple(selected_extents.items()))
    left, right = random.choice(extents)
    image = random.choice(data[file_title])
    image = cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1]
    image = image[:, left:right]
    return image
def make_space():
    """Return an all-zero gap: 0-1 columns wide 80% of the time, otherwise 7-8 columns wide."""
    width = random.randint(0, 1) if random.random() < .8 else random.randint(7, 8)
    image = np.zeros((height, width), dtype=dtype)
    return image
def fn():
    """Return one row of random text that is exactly ``width`` columns wide.

    Letters and gaps alternate, starting and ending with a letter (an odd
    number of pieces between 3 and 37).  The row is zero-padded on the right
    or cropped to ``width``.
    """
    a = (make_letter() for _ in it.repeat(None))
    b = (make_space() for _ in it.repeat(None))
    g = it.chain.from_iterable(zip(a, b))
    n = random.randint(2, 19) * 2 - 1
    g = (image for _, image in zip(range(n), g))
    image = np.hstack(list(g))
    if image.shape[1] < width:
        # Pad with the image dtype.  The default float64 would promote the
        # whole stacked preview to float, which cv2.imshow scales differently
        # from 8-bit data.
        image = np.hstack([image, np.zeros((height, width - image.shape[1]), dtype=dtype)])
    else:
        image = image[:, :width]
    return image
show_and_wait(np.vstack([fn() for _ in range(9)]))
cv2.destroyAllWindows()

In [None]:
def fn():
    """Return the letter -> list-of-images dictionary, cached in a pickle file.

    When the cache file exists it is loaded and returned.  Otherwise every
    image of every entry in ``data`` is Otsu to-zero thresholded, cropped to
    each extent in ``selected_extents`` and stored under the upper-cased
    character at the same position of the file title (underscores skipped);
    the result is then written to the cache file.

    NOTE(review): ``selected_extents`` is not defined in the visible notebook.
    """
    file_path = r"D:\Dota 2\Heroes\Pickles\letter_image_dict.pickle"
    if os.access(file_path, os.F_OK):
        # Load the previously saved dictionary.
        # NOTE(review): pickle.load can execute arbitrary code; keep this only
        # while the file is always produced by this notebook.
        with open(file_path, 'rb') as fin:
            d = pickle.load(fin)
    else:
        # Create a dictionary of letters to images for each extent extracted from each image in the data dictionary.
        d = collections.defaultdict(list)
        for file_title, images in data.items():
            file_extents = selected_extents[file_title]
            letters = [c for c in file_title if c != '_']
            # Threshold each image once rather than once per letter extent.
            thresholded = [cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] for image in images]
            for (left, right), letter in zip(file_extents, letters):
                for image in thresholded:
                    d[letter.upper()].append(image[:, left:right])
        # Save that dictionary (the one just built, not a global of the same purpose).
        with open(file_path, 'wb') as fout:
            pickle.dump(d, fout)
    return d
letter_images = fn()

In [None]:
# Summarise the dictionary: number of letters, the letters themselves, the
# total image count and the image count per letter.
print(len(letter_images), *letter_images.keys())
l = [(letter, len(images)) for letter, images in letter_images.items()]
print(sum(count for _, count in l))
print(*l, sep='\n')

In [None]:
# Display examples of each letter.
def display_examples(d):
    """Show a grid of 2000 randomly chosen images for each key of ``d``.

    The keys are visited in sorted order.  For each key the sampled images are
    arranged 50 per row, every row is padded with zero columns to the widest
    row plus 11, and the grid is passed to ``show_and_wait``.  Iteration stops
    as soon as ``show_and_wait`` returns 'q'; all windows are then destroyed.

    Args:
        d: Mapping from a letter to a sequence of at least 2000 images.
    """
    def show_letter(c):
        # Sample once, so the padding width is computed from the very rows that
        # are displayed (two independent samples could yield a negative pad).
        rows = [np.hstack(list(g)) for g in grouper(random.sample(d[c], k=2000), 50)]
        width = 11 + max(row.shape[1] for row in rows)
        padded = [np.hstack([row, np.zeros((row.shape[0], width - row.shape[1]))]) for row in rows]
        return show_and_wait(np.vstack(padded))
    g = map(show_letter, sorted(d))
    _ = list(it.takewhile(lambda c: c != 'q', g))
    cv2.destroyAllWindows()

In [None]:
# Browse example grids for every letter of the full dictionary.
display_examples(letter_images)

In [None]:
# Create a file that multiple processes can partially load.
# Layout: one pickle record holding {letter: image count}, followed by one
# record per letter (a tuple of its images) in sorted letter order.
# The file is only written when it does not exist yet.
file_path = r"D:\Dota 2\Heroes\Pickles\letter_images.pickle"
if not os.access(file_path, os.F_OK):
    with open(file_path, 'wb') as fout:
        # Write a dictionary of letter image counts.
        pickle.dump({k: len(v) for k, v, in letter_images.items()}, fout)
        # Write each letter's images in lexical order.
        for k, v in sorted(letter_images.items()):
            print(k, len(v))
            pickle.dump(tuple(v), fout)

In [None]:
# Test partial loading.
def fn():
    """Read the multi-record pickle at ``file_path``, keeping a random sample per letter.

    The first record is a dictionary keyed by letter; one record per letter
    follows in sorted key order.  At most 9999 images are kept for each
    letter; letters with fewer images keep all of them, in random order.

    Returns:
        A dictionary mapping each letter to a list of sampled images.
    """
    sample_size = 9999
    # NOTE(review): pickle.load can execute arbitrary code; keep this only
    # while the file is always produced by this notebook.
    with open(file_path, 'rb') as fin:
        # Read the letter image count dictionary.
        counts = pickle.load(fin)
        # Read a fraction of the images for each letter.
        sampled = {}
        for k in sorted(counts):
            images = pickle.load(fin)
            # random.sample raises ValueError when asked for more than exists.
            sampled[k] = random.sample(images, k=min(sample_size, len(images)))
    return sampled
# Load the sampled dictionary and display how many images each letter received.
sampled_letter_images = fn()
[(k, len(v)) for k, v in sampled_letter_images.items()]

In [None]:
# Display examples of each letter.
# Browse example grids for every letter of the partially loaded dictionary.
display_examples(sampled_letter_images)

In [None]:
# Create a dictionary of letters to images for each extent extracted from each image in each video file.
# Distinguish between large and small letter renderings by using an upper-case letter for the large ones and
# a lower-case letter for the small ones.
# NOTE(review): these titles are lower-case with underscores, whereas file_titles
# near the top of the notebook uses 'Ancient Apparition'-style names; confirm
# which form the keys of data and selected_extents use.
small_letters = {
    'ancient_apparition',
    'centaur_warrunner',
    'keeper_of_the_light',
    'outworld_devourer',
}
def fn():
    """Return the sized letter -> images dictionary, cached in a multi-record pickle file.

    If the cache file exists, its first record (a dictionary keyed by letter)
    is read and each value is replaced by the next record, in sorted key
    order; the values are then tuples of images.  Otherwise the dictionary is
    built from ``data`` and ``selected_extents`` (values are lists), written
    in that same layout and returned.

    NOTE(review): ``selected_extents`` is not defined in the visible notebook.
    """
    file_path = r"D:\Dota 2\Heroes\Pickles\sized_letter_images.pickle"
    if os.access(file_path, os.F_OK):
        with open(file_path, 'rb') as fin:
            # Read the dictionary of letter image counts.
            d = pickle.load(fin)
            # Read each letter's images in lexical order.
            for k in sorted(d):
                d[k] = pickle.load(fin)
    else:
        # Create a file that multiple processes can partially load.
        d = collections.defaultdict(list)
        for file_title, images in data.items():
            file_extents = selected_extents[file_title]
            # Keep the character's case for titles listed in small_letters and
            # upper-case it otherwise; underscores carry no letter image.
            letters = [c if file_title in small_letters else c.upper() for c in file_title if c != '_']
            for (left, right), letter in zip(file_extents, letters):
                for image in images:
                    # Otsu to-zero threshold, then crop to the letter's columns.
                    image = cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1]
                    image = image[:, left:right]
                    d[letter].append(image)
        with open(file_path, 'wb') as fout:
            # Write a dictionary of letter image counts.
            pickle.dump({k: len(v) for k, v, in d.items()}, fout)
            # Write each letter's images in lexical order.
            for k, v in sorted(d.items()):
                print(k, len(v))
                pickle.dump(tuple(v), fout)
    return d
sized_letter_images = fn()

In [None]:
# List the available letters, then report which letters of each case are absent.
print(*sorted(sized_letter_images))
def fn(fn):
    """Print the letters A-Z, converted by ``fn``, that are not keys of ``sized_letter_images``.

    A key counts as present for this case when ``fn`` leaves it unchanged.
    """
    expected = set(map(fn, map(chr, range(ord('A'), ord('Z') + 1))))
    missing = sorted(expected.difference(c for c in sized_letter_images if fn(c) == c))
    print('missing', len(missing), missing)
fn(str.upper)
fn(str.lower)
[(k, len(v)) for k, v in sorted(sized_letter_images.items())]

In [None]:
# Display examples of each letter.
# Browse example grids for every key of the sized-letter dictionary.
display_examples(sized_letter_images)

In [None]:
# Select, for each letter extent, the minimum number of zero pixels an image must have to be included.
class Finder:
    """Lazy sequence that lets ``bisect`` search for a cutoff over image sums.

    Item ``index`` is -1 while at least a tenth of the images have a sum of
    ``index`` or more, and 1 once fewer than a tenth do.  The reported length
    is the product of the first two dimensions of the first image.
    """

    def __init__(self, images):
        self.__nzeros = [image.sum() for image in images]
        self.__n = len(images)
        self.__len = op.mul(*images[0].shape[:2])

    def __len__(self):
        return self.__len

    def __getitem__(self, index):
        reaching = sum(1 for nzeros in self.__nzeros if nzeros >= index)
        if reaching * 10 < self.__n:
            return 1
        return -1

def fn(file_title, file_images):
    """Compute a zero-pixel cutoff for every letter extent of one file title.

    Each image is Otsu to-zero thresholded and turned into a boolean mask of
    its zero pixels.  For every extent in ``selected_extents[file_title]`` the
    masks are cropped to the extent and ``Finder`` is bisected for the first
    zero count that fewer than a tenth of the images reach; one less than
    that position is recorded.

    Args:
        file_title: Key into ``selected_extents``; also printed as progress.
        file_images: Images belonging to that file title.

    Returns:
        A dictionary mapping the extent index to its cutoff.

    NOTE(review): ``selected_extents`` is not defined in the visible notebook.
    """
    print(file_title)
    zero_masks = [cv2.threshold(i, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] == 0 for i in file_images]

    def find_cutoff(horizontal_extent):
        left, right = horizontal_extent
        letter_masks = [mask[:, left:right] for mask in zero_masks]
        return bisect.bisect_left(Finder(letter_masks), 0) - 1

    return {index: find_cutoff(extent) for index, extent in enumerate(selected_extents[file_title])}
#minimum_nzeros = {k: fn(k, v) for k, v in data.items()}

In [None]:
# These are the minimum numbers of zeros created above.
# Outer key: file title; inner key: index of the letter extent within that
# title; value: minimum number of zero pixels for that extent.
minimum_nzeros = {
    'ancient_apparition': {0: 86, 1: 100, 2: 87, 3: 43, 4: 56, 5: 82, 6: 87, 7: 101, 8: 69, 9: 70, 10: 101, 11: 82, 12: 43, 13: 87, 14: 43, 15: 97, 16: 82},
    'anti-mage': {0: 114, 1: 94, 2: 101, 3: 42, 4: 62, 5: 106, 6: 111, 7: 96, 8: 68},
    'broodmother': {0: 79, 1: 79, 2: 108, 3: 110, 4: 90, 5: 106, 6: 108, 7: 101, 8: 96, 9: 68, 10: 91},
    'centaur_warrunner': {0: 87, 1: 69, 2: 99, 3: 87, 4: 70, 5: 85, 6: 83, 7: 127, 8: 70, 9: 67, 10: 67, 11: 83, 12: 82, 13: 84, 14: 69, 15: 83},
    'clinkz': {0: 99, 1: 82, 2: 52, 3: 94, 4: 95, 5: 82},
    'io': {0: 42, 1: 108},
    'juggernaut': {0: 72, 1: 96, 2: 96, 3: 98, 4: 79, 5: 79, 6: 94, 7: 114, 8: 94, 9: 101},
    'keeper_of_the_light': {0: 85, 1: 69, 2: 72, 3: 69, 4: 71, 5: 82, 6: 95, 7: 70, 8: 88, 9: 84, 10: 72, 11: 73, 12: 43, 13: 87, 14: 84, 15: 88},
    "nature's_prophet": {0: 94, 1: 98, 2: 85, 3: 80, 4: 92, 5: 79, 6: 46, 7: 83, 8: 77, 9: 80, 10: 110, 11: 83, 12: 94, 13: 79, 14: 101},
    'nyx_assassin': {0: 93, 1: 100, 2: 95, 3: 97, 4: 84, 5: 84, 6: 111, 7: 81, 8: 81, 9: 52, 10: 94},
    'outworld_devourer': {0: 99, 1: 83, 2: 87, 3: 115, 4: 97, 5: 69, 6: 70, 7: 81, 8: 81, 9: 68, 10: 88, 11: 99, 12: 83, 13: 83, 14: 70, 15: 69},
    'queen_of_pain': {0: 134, 1: 79, 2: 79, 3: 67, 4: 93, 5: 123, 6: 81, 7: 82, 8: 95, 9: 52, 10: 94},
}

In [None]:
# Inspect the images resulting from applying the minimum numbers of zeros.
def fn(file_title, index, minimum_nzeros):
    """Show the crops of one letter extent whose zero count lies in [m, 1.1 * m).

    Here ``m`` is the ``minimum_nzeros`` argument, a single number that
    shadows the module-level dictionary of the same name inside this function.
    Returns whatever ``show_and_wait`` returns.

    NOTE(review): ``grouper`` and ``selected_extents`` are not defined in the
    visible notebook; the comments below assume ``grouper(iterable, n)``
    yields groups of ``n`` items -- confirm.
    """
    g = (image for image in data[file_title])
    # Otsu to-zero threshold every image lazily.
    g = (cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] for image in g)
    left, right = selected_extents[file_title][index]
    # Crop to the letter's columns.
    g = (image[:, left:right] for image in g)
    # Keep crops with at least the minimum number of zero pixels, but fewer than 110% of it.
    g = (image for image in g if minimum_nzeros * 1.1 > (image == 0).sum() >= minimum_nzeros)
    # Take the first group of 1200 crops and lay them out 60 per row.
    g = next(grouper(g, 1200))
    image = np.vstack([np.hstack(list(g)) for g in grouper(g, 60)])
    #print(file_title, index, minimum_nzeros, [c for c in file_title if c != '_'][index].upper())
    return show_and_wait(image)
# Visit every (title, extent index, minimum) combination until fn returns 'q'.
# NOTE(review): file_titles as defined near the top uses 'Ancient Apparition'-style
# names while the keys of minimum_nzeros are 'ancient_apparition'-style --
# confirm the lookup below succeeds.
g = ((s, i, m) for s in file_titles for i, m in sorted(minimum_nzeros[s].items()))
list(it.takewhile(lambda t: fn(*t) != 'q', g))
cv2.destroyAllWindows()

In [None]:
# Create a dictionary of clean letters to images for each extent extracted from each image in each video file.
# Distinguish between large and small letter renderings by using an upper-case letter for the large ones and
# a lower-case letter for the small ones.
def fn():
    """Return the clean letter -> images dictionary, cached in a multi-record pickle file.

    Without a cache file, every image in ``data`` is Otsu to-zero thresholded
    and cropped to each letter extent; a crop is kept when its number of zero
    pixels is at least that extent's entry in ``minimum_nzeros`` and below
    110% of it.  The result is written as one record of per-letter counts
    followed by one tuple of images per letter in sorted order.  With a cache
    file, those records are read back and the counts are printed.
    """
    file_path = r"D:\Dota 2\Heroes\Pickles\clean_letter_images.pickle"
    if not os.access(file_path, os.F_OK):
        # Create a file that multiple processes can partially load.
        d = collections.defaultdict(list)
        for file_title, images in data.items():
            file_minimum_nzeros = minimum_nzeros[file_title]
            thresholded = [cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] for image in images]
            file_extents = selected_extents[file_title]
            keep_case = file_title in small_letters
            letters = [c if keep_case else c.upper() for c in file_title if c != '_']
            for index, ((left, right), letter) in enumerate(zip(file_extents, letters)):
                lower_bound = file_minimum_nzeros[index]
                for image in thresholded:
                    crop = image[:, left:right]
                    nzeros = (crop == 0).sum()
                    if lower_bound <= nzeros < lower_bound * 1.1:
                        d[letter].append(crop)
        with open(file_path, 'wb') as fout:
            # First record: how many images each letter has.
            pickle.dump({k: len(v) for k, v in d.items()}, fout)
            # Following records: each letter's images, in lexical order.
            for k, v in sorted(d.items()):
                print(k, len(v))
                pickle.dump(tuple(v), fout)
        return d
    with open(file_path, 'rb') as fin:
        # First record: the dictionary of letter image counts.
        d = pickle.load(fin)
        # Following records: each letter's images, in lexical order.
        for k in sorted(d):
            d[k] = pickle.load(fin)
    print({k: len(v) for k, v in d.items()})
    return d
clean_letter_images = fn()

In [None]:
# Inspect clean letter images.
def fn():
    """Show up to 1200 images per letter, 60 per row, until show_and_wait returns 'q'.

    NOTE(review): ``grouper`` is not defined in the visible notebook.
    """
    for letter, images in clean_letter_images.items():
        image = np.vstack([np.hstack(list(g)) for g in grouper(images[:1200], 60)])
        print(letter)
        if show_and_wait(image) == 'q':
            break
# The inspection is disabled; uncomment the call to run it.
#fn()
cv2.destroyAllWindows()

Sandbox

In [None]:
# Compare training and validation.

In [None]:
# Minimum number of background columns kept free on each side of the text.
horizontal_margin = 5
def put_text(background: np.ndarray, text: str, letter_images):
    """Render ``text`` from random letter images onto a copy of ``background``.

    Args:
        background: Two-dimensional image; its dtype, height and width define
            the output.
        text: Characters to render; each must be a key of ``letter_images``.
        letter_images: Mapping from a character to a sequence of images of it.

    Returns:
        The composed image with a trailing axis of length one, or ``None``
        when the rendered text does not fit between the horizontal margins.
    """
    image = background.copy()

    # Construct the image of the text.
    dtype, height, width = image.dtype, *image.shape
    def make_letter(letter):
        # Pick one of the stored images of this character at random.
        image = random.choice(letter_images[letter])
        return image
    def make_space():
        # Randomly add to the text about one space for every five characters.
        # A wide gap (7-8 columns) is drawn with probability .167, otherwise 0-1 columns.
        width = random.randint(7, 8) if random.random() < .167 else random.randint(0, 1)
        image = np.zeros((height, width), dtype)
        return image
    def fn():
        # Interleave letters and gaps, then drop the gap after the last letter.
        a = map(make_letter, text)
        b = (make_space() for _ in text)
        g = it.chain.from_iterable(zip(a, b))
        image = np.hstack(list(g)[:-1])
        if image.shape[1] < width - 2 * horizontal_margin:
            # Select a random position for the rendered text in the background.
            left = random.randrange(horizontal_margin, width - horizontal_margin - image.shape[1])
            right = width - image.shape[1] - left
            return np.hstack([np.zeros((height, left), dtype), image, np.zeros((height, right), dtype)])
        # Falls through (returning None) when the text is too wide.
    text_image = fn()
    if text_image is None:
        # The text renders too wide.  Try again.
        return

    # Apply the rendered greyscale text to the background.
    # The background survives only where the text image is zero; the result is
    # Otsu to-zero thresholded and the text is laid over it once more.
    image = image * (text_image == 0) + text_image
    image = cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1]
    image = image * (text_image == 0) + text_image

    # Add the removed dimension.
    image = image[:, :, np.newaxis]

    return image
# Example: render 'ACDEF' on an all-black 16 x 120 background.
# NOTE(review): put_text may return None, which show_and_wait would then receive.
image = put_text(np.zeros((16, 120), np.uint8), 'ACDEF', clean_letter_images)
show_and_wait(image)
cv2.destroyAllWindows()

The rest of the code is archived and not used.

In [None]:
# Select 1200 of each letter.
# Manually select ranges of images to exclude.
nimages = 1200
ncolumns = 60
window_name = 'tesst'
cv2.namedWindow(window_name, cv2.WINDOW_NORMAL | cv2.WINDOW_GUI_EXPANDED)
def fn(file_title, horizontal_extent):
    """Interactively pick ``nimages`` image indices for one letter extent.

    A grid of thresholded crops (``ncolumns`` per row) is shown.  A left click
    marks the start of a range; a right click removes the indices from that
    start (or from the clicked tile when no start is marked) through the
    clicked tile and refills the selection with the indices following the last
    remaining one.  The selection is returned after a key press.

    NOTE(review): ``grouper`` is not defined in the visible notebook.
    """
    images = data[file_title]
    indices = list(range(nimages))
    left, right = horizontal_extent
    def fn():
        # Render the currently selected crops as a grid.
        g = (cv2.threshold(images[i], 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1][:, left:right] for i in indices)
        g = grouper(g, ncolumns)
        return np.vstack([np.hstack(list(g)) for g in g])
    # Holds at most one element: the tile position of the last left click.
    initial_index = []
    height, width = images[0].shape[0], right - left
    def handle_mouse_event(event, x, y, *_):
        if event == cv2.EVENT_LBUTTONDOWN:
            initial_index.clear()
            # Convert the pixel position to a position within the grid.
            initial_index.append((y // height) * ncolumns + x // width)
        elif event == cv2.EVENT_RBUTTONDOWN:
            index = (y // height) * ncolumns + x // width
            begin, end = initial_index[0] if initial_index else index, index + 1
            del indices[begin:end]
            # Refill up to nimages with the indices after the last one kept.
            begin = indices[-1] + 1
            end = begin + nimages - len(indices)
            indices.extend(range(begin, end))
            cv2.imshow(window_name, fn())
            initial_index.clear()
    cv2.setMouseCallback(window_name, handle_mouse_event)
    show_and_wait(fn())
    return indices
#fn('io', selected_extents['io'][1])
cv2.destroyAllWindows()

In [None]:
# Select 1200 of each letter.
# Manually select the minimum number of zeros to include images.
nimages = 1200
ncolumns = 60
window_name = 'tesst'
cv2.namedWindow(window_name, cv2.WINDOW_NORMAL | cv2.WINDOW_GUI_EXPANDED)
def fn(file_title, horizontal_extent):
    """Interactively tune the minimum zero-pixel count for one letter extent.

    Keys compared against the value returned by ``show_and_wait``:
    '-' / '+' change the minimum by 1, '/' / '*' change it by 10, 'p' prints
    the minimum and the number of passing images, 'q' returns
    ``(minimum_nzero, number of passing images)``.

    NOTE(review): ``grouper`` and ``selected_extents`` are not defined in the
    visible notebook.
    """
    images = data[file_title]
    # Start at half the pixel count of the extent at index 1 of this file title.
    # NOTE(review): this uses selected_extents[file_title][1] rather than the
    # horizontal_extent argument -- confirm that is intended.
    minimum_nzero = images[0].shape[0] * -op.sub(*selected_extents[file_title][1]) // 2
    left, right = horizontal_extent
    def fn():
        # Shuffle, threshold and crop the images, keeping those with enough zero pixels.
        l = random.sample(images, len(images))
        g = (cv2.threshold(image, 0, 0, cv2.THRESH_TOZERO | cv2.THRESH_OTSU)[1] for image in l)
        g = (image[:, left:right] for image in g)
        return [image for image in g if (image == 0).sum() >= minimum_nzero]
    while True:
        l = fn()
        if len(l) < nimages:
            # Too few images pass: show a single black pixel instead of a grid.
            image = np.zeros([1, 1], dtype=images[0].dtype)
        else:
            image = np.vstack([np.hstack(list(g)) for g in grouper(l[:nimages], ncolumns)])
        ch = show_and_wait(image)
        if ch == '-':
            minimum_nzero -= 1
        elif ch == '+':
            minimum_nzero += 1
        elif ch == '/':
            minimum_nzero -= 10
        elif ch == '*':
            minimum_nzero += 10
        elif ch == 'p':
            print(minimum_nzero, len(l))
        elif ch == 'q':
            # The count comes from a fresh pass over a new shuffle.
            return minimum_nzero, len(fn())
#print(fn('io', selected_extents['io'][1]))
cv2.destroyAllWindows()