In [1]:
import numpy as np
import re
from collections import Counter

In [2]:
# Card body texts and card titles. cardGenerator indexes both with the same
# position, so entry k of `titles` must belong to entry k of `fulltext`.
fulltext = list(np.load('data//numpy//fullstrings.npy'))
titles = list(np.load('data//numpy//titles.npy'))

# The lookup tables are single Python objects stored in 0-d object arrays
# (hence .item()); c2i is indexed by character in cardGenerator, i2c is
# presumably the inverse mapping -- TODO confirm.
# NumPy >= 1.16.3 refuses to unpickle object arrays unless allow_pickle=True
# is passed explicitly, so without it these two loads raise ValueError.
# NOTE(security): unpickling can execute arbitrary code -- only load .npy
# files that were produced by this project.
c2i = np.load('data//numpy//c2i.npy', allow_pickle=True).item()
i2c = np.load('data//numpy//i2c.npy', allow_pickle=True).item()

## generate batches



In [3]:
def cardGenerator(cardtext, title, windowsize, c2i=c2i, debug=False):
    """Endlessly yield one batch of sliding-window training pairs per card.

    Cards are visited in a random order. Once every card has been used, a new
    random order is drawn and iteration continues, so the generator never
    runs dry.

    For each card a "warmup" prefix is built by repeating the card's title
    until it is at least ``windowsize`` characters long and keeping the last
    ``windowsize`` characters. The card text is appended to that prefix and a
    window of ``windowsize`` symbols is slid over the result one step at a
    time; every window is paired with the symbol that follows it.

    Parameters
    ----------
    cardtext : sequence of str
        Text of each card.
    title : sequence of str
        Title of each card, indexed in parallel with ``cardtext``.
    windowsize : int
        Number of symbols in every input window.
    c2i : mapping
        Character -> integer index lookup, applied to every symbol unless
        ``debug`` is true. Defaults to the module-level ``c2i``.
    debug : bool
        When true, the raw characters are yielded instead of their indices.

    Yields
    ------
    (x, y) : tuple of numpy.ndarray
        ``x`` holds the windows, shape ``(n, windowsize)``; ``y`` holds the
        symbol following each window, shape ``(n, 1)``. ``n`` equals the
        length of the card text. A card with empty text yields empty arrays.

    Raises
    ------
    ValueError
        If ``cardtext`` is empty, or if a card has an empty title while
        ``windowsize`` is positive (no warmup can be built from it).
    """
    n_cards = len(cardtext)
    if n_cards == 0:
        raise ValueError('cardtext is empty; there are no cards to generate batches from')

    while True:
        # One pass over a fresh shuffle is one epoch. Looping over the
        # permutation itself removes the manual index bookkeeping, which used
        # to step one past the end of the shuffled order (IndexError) instead
        # of reshuffling.
        for idx in np.random.permutation(n_cards):
            name = title[idx]
            if windowsize > 0 and len(name) == 0:
                # Repeating an empty title never reaches windowsize; fail
                # loudly instead of spinning forever.
                raise ValueError('title at index %d is empty; cannot build a warmup sequence' % idx)

            # pregenerate warmup sequence: repeat the title, keep its tail
            warmup = ''
            while len(warmup) < windowsize:
                warmup += name

            sequence = list(warmup[-windowsize:]) + list(cardtext[idx])
            if not debug:
                sequence = [c2i[c] for c in sequence]

            # Slice by position instead of popping from the front of the
            # list, which costs O(n) per step.
            n_windows = len(sequence) - windowsize
            x = [np.array(sequence[k:k + windowsize]) for k in range(n_windows)]
            y = np.array([sequence[k + windowsize] for k in range(n_windows)])

            # reshape targets to (n, 1) for sparse_categorical_crossentropy
            yield (np.asarray(x), y[:, np.newaxis])

In [4]:
# stress test: pull one batch per card (each batch is one card), plus one more
# to step past the end of the shuffled card order
# itr = cardGenerator(fulltext, titles, 10, debug=True)
# for i in range(len(fulltext) + 1):
#     x, y = next(itr)

In [5]:
# Debug iterator: 10-symbol windows of raw characters (no c2i encoding),
# so the batches can be inspected by eye.
itr = cardGenerator(fulltext, titles, windowsize=10, debug=True)

In [6]:
# Pull one card's batch and print every (window, following symbol) pair.
x, y = next(itr)
print(x.shape, y.shape)
for z, window in enumerate(x):
    print(window, y[z])

(1206, 10) (1206, 1)
['v' 'e' 'r' 's' 'a' 't' 'i' 'o' 'n' 's'] ['Ⓣ']
['e' 'r' 's' 'a' 't' 'i' 'o' 'n' 's' 'Ⓣ'] ['a']
['r' 's' 'a' 't' 'i' 'o' 'n' 's' 'Ⓣ' 'a'] [' ']
['s' 'a' 't' 'i' 'o' 'n' 's' 'Ⓣ' 'a' ' '] ['t']
['a' 't' 'i' 'o' 'n' 's' 'Ⓣ' 'a' ' ' 't'] ['a']
['t' 'i' 'o' 'n' 's' 'Ⓣ' 'a' ' ' 't' 'a'] ['x']
['i' 'o' 'n' 's' 'Ⓣ' 'a' ' ' 't' 'a' 'x'] ['o']
['o' 'n' 's' 'Ⓣ' 'a' ' ' 't' 'a' 'x' 'o'] ['n']
['n' 's' 'Ⓣ' 'a' ' ' 't' 'a' 'x' 'o' 'n'] ['o']
['s' 'Ⓣ' 'a' ' ' 't' 'a' 'x' 'o' 'n' 'o'] ['m']
['Ⓣ' 'a' ' ' 't' 'a' 'x' 'o' 'n' 'o' 'm'] ['y']
['a' ' ' 't' 'a' 'x' 'o' 'n' 'o' 'm' 'y'] [' ']
[' ' 't' 'a' 'x' 'o' 'n' 'o' 'm' 'y' ' '] ['o']
['t' 'a' 'x' 'o' 'n' 'o' 'm' 'y' ' ' 'o'] ['f']
['a' 'x' 'o' 'n' 'o' 'm' 'y' ' ' 'o' 'f'] [' ']
['x' 'o' 'n' 'o' 'm' 'y' ' ' 'o' 'f' ' '] ['i']
['o' 'n' 'o' 'm' 'y' ' ' 'o' 'f' ' ' 'i'] ['n']
['n' 'o' 'm' 'y' ' ' 'o' 'f' ' ' 'i' 'n'] ['s']
['o' 'm' 'y' ' ' 'o' 'f' ' ' 'i' 'n' 's'] ['t']
['m' 'y' ' ' 'o' 'f' ' ' 'i' 'n' 's' 't'] ['r']
['y' ' ' 'o' 'f' ' 

['e' 'g' 'o' 'r' 'i' 'e' 's' ',' ' ' 'a'] ['n']
['g' 'o' 'r' 'i' 'e' 's' ',' ' ' 'a' 'n'] ['d']
['o' 'r' 'i' 'e' 's' ',' ' ' 'a' 'n' 'd'] [' ']
['r' 'i' 'e' 's' ',' ' ' 'a' 'n' 'd' ' '] ['r']
['i' 'e' 's' ',' ' ' 'a' 'n' 'd' ' ' 'r'] ['e']
['e' 's' ',' ' ' 'a' 'n' 'd' ' ' 'r' 'e'] ['p']
['s' ',' ' ' 'a' 'n' 'd' ' ' 'r' 'e' 'p'] ['o']
[',' ' ' 'a' 'n' 'd' ' ' 'r' 'e' 'p' 'o'] ['r']
[' ' 'a' 'n' 'd' ' ' 'r' 'e' 'p' 'o' 'r'] ['t']
['a' 'n' 'd' ' ' 'r' 'e' 'p' 'o' 'r' 't'] [' ']
['n' 'd' ' ' 'r' 'e' 'p' 'o' 'r' 't' ' '] ['t']
['d' ' ' 'r' 'e' 'p' 'o' 'r' 't' ' ' 't'] ['h']
[' ' 'r' 'e' 'p' 'o' 'r' 't' ' ' 't' 'h'] ['e']
['r' 'e' 'p' 'o' 'r' 't' ' ' 't' 'h' 'e'] [' ']
['e' 'p' 'o' 'r' 't' ' ' 't' 'h' 'e' ' '] ['o']
['p' 'o' 'r' 't' ' ' 't' 'h' 'e' ' ' 'o'] ['v']
['o' 'r' 't' ' ' 't' 'h' 'e' ' ' 'o' 'v'] ['e']
['r' 't' ' ' 't' 'h' 'e' ' ' 'o' 'v' 'e'] ['r']
['t' ' ' 't' 'h' 'e' ' ' 'o' 'v' 'e' 'r'] ['a']
[' ' 't' 'h' 'e' ' ' 'o' 'v' 'e' 'r' 'a'] ['l']
['t' 'h' 'e' ' ' 'o' 'v' 'e' 'r' 'a' 'l'

to-do, network notes:

- create a window-size input, padded with START tokens
- add an EOF token (and remove power/toughness if null?)
- in the network, encode the name with LSTM(s), then copy the state(s) to the output network (effectively a seq-to-seq model)
- the generator needs to PAD TO A FIXED BATCH SIZE (see the length chosen above)
- use batch training and `reset_states()`