# data formatting

here we convert the card info into sequences of characters to train our model

In [1]:
import numpy as np
import re
from collections import Counter

In [2]:
setnm = list(np.load('data//numpy//setnm.npy'))
idnum = list(np.load('data//numpy//idnum.npy'))
names = list(np.load('data//numpy//names.npy'))
mcost = list(np.load('data//numpy//mcost.npy'))
rarty = list(np.load('data//numpy//rarty.npy'))
types = list(np.load('data//numpy//types.npy'))
subtp = list(np.load('data//numpy//subtp.npy'))
ctext = list(np.load('data//numpy//ctext_orig.npy'))
power = list(np.load('data//numpy//power.npy'))
tough = list(np.load('data//numpy//tough.npy'))

### encode the rarities as capital letters

In [3]:
rletr = [w[0] for w in rarty]
list(set(rletr))

['S', 'M', 'U', 'C', 'R']

### encode the types, subtypes as lower-cased words separated by a symbol '·'

In [4]:
typss = []
for x in types:
    item = ''
    lst = str(x).split(' ')
    for idx, tp in enumerate(lst):
        item += tp.lower()
        if idx < len(lst)-1:
            item += '·'
    
    typss.append(item)    

In [5]:
set(typss)

{'artifact',
 'artifact·creature',
 'artifact·host·creature',
 'creature',
 'enchantment',
 'enchantment·artifact',
 'enchantment·creature',
 'host·creature',
 'instant',
 'planeswalker',
 'sorcery',
 'tribal·artifact',
 'tribal·enchantment',
 'tribal·instant',
 'tribal·sorcery'}

In [6]:
subts = []
for lst in subtp:
    item = ''
    lst = list(lst)
    for idx, subtype in enumerate(lst):
        if subtype != 'PAD':
            item += subtype.lower()
        if idx < len(lst)-1:
            if lst[idx+1] != 'PAD':
                item += '·'
    
    if item == '':
        item = '⌧'
    
    subts.append(item)    

### remove reminder text
strip all text between parentheses

In [7]:
ctext[:10]

['first strike (this creature deals combat damage before creatures without first strike.)|when Ⓝ enters the battlefield, you gain 1 life for each card in your graveyard.',
 'flying|when Ⓝ enters the battlefield, you gain 3 life.',
 "target creature gets +3/+3 and gains flying until end of turn. (it can't be blocked except by creatures with flying or reach.)",
 'whenever a creature enters the battlefield under your control, you gain life equal to its toughness.',
 "defender (this creature can't attack.)|flying",
 'artifact and enchantment spells your opponents cast cost ② more to cast.|sacrifice Ⓝ: destroy target artifact or enchantment.',
 "flying (this creature can't be blocked except by creatures with flying or reach.)|when Ⓝ enters the battlefield, destroy target enchantment.",
 'ⓍⓌ, ↷: Ⓝ deals x damage to target attacking or blocking creature.',
 'prevent the next 1 damage that would be dealt to any target this turn.|draw a card.',
 "double target player's life total. shuffle Ⓝ int

In [8]:
ctext_new = []
for text in ctext:
    new_text = re.sub(r'\s\([\S\s]+\)', '', text)
    new_text = new_text.replace('|', '·')
    ctext_new.append(new_text)

In [9]:
ctext_new[:10]

['first strike·when Ⓝ enters the battlefield, you gain 1 life for each card in your graveyard.',
 'flying·when Ⓝ enters the battlefield, you gain 3 life.',
 'target creature gets +3/+3 and gains flying until end of turn.',
 'whenever a creature enters the battlefield under your control, you gain life equal to its toughness.',
 'defender·flying',
 'artifact and enchantment spells your opponents cast cost ② more to cast.·sacrifice Ⓝ: destroy target artifact or enchantment.',
 'flying·when Ⓝ enters the battlefield, destroy target enchantment.',
 'ⓍⓌ, ↷: Ⓝ deals x damage to target attacking or blocking creature.',
 'prevent the next 1 damage that would be dealt to any target this turn.·draw a card.',
 "double target player's life total. shuffle Ⓝ into its owner's library."]

### combine items into string with '|' separator

`[mcost|rletr|typss|subts|power|tough|ctext_new]`  
don't add power/tough if not needed  
use START symbol: Ⓢ and EOF symbol: Ⓔ  

In [10]:
cardtext = []
for i in range(len(names)):
    lst = [names[i], mcost[i], rletr[i], typss[i]]
    # only add subtype if not empty
    if subts[i] != '⌧':
        lst += [subts[i]]
    # add power-toughness before cardtext
    if power[i] != '⌧' and tough[i] != '⌧':
        lst += [power[i], tough[i]]
    lst += [ctext_new[i]]
    
    txt = '|'.join(lst)
    txt += 'Ⓔ'
    cardtext.append('Ⓢ'+txt)

In [11]:
for i in range(10):
    print(cardtext[i])

Ⓢancestor's chosen|⑤ⓌⓌ|U|creature|human·cleric|4|4|first strike·when Ⓝ enters the battlefield, you gain 1 life for each card in your graveyard.Ⓔ
Ⓢangel of mercy|④Ⓦ|U|creature|angel|3|3|flying·when Ⓝ enters the battlefield, you gain 3 life.Ⓔ
Ⓢangelic blessing|②Ⓦ|C|sorcery|target creature gets +3/+3 and gains flying until end of turn.Ⓔ
Ⓢangelic chorus|③ⓌⓌ|R|enchantment|whenever a creature enters the battlefield under your control, you gain life equal to its toughness.Ⓔ
Ⓢangelic wall|①Ⓦ|C|creature|wall|0|4|defender·flyingⒺ
Ⓢaura of silence|①ⓌⓌ|U|enchantment|artifact and enchantment spells your opponents cast cost ② more to cast.·sacrifice Ⓝ: destroy target artifact or enchantment.Ⓔ
Ⓢaven cloudchaser|③Ⓦ|C|creature|bird·soldier|2|2|flying·when Ⓝ enters the battlefield, destroy target enchantment.Ⓔ
Ⓢballista squad|③Ⓦ|U|creature|human·rebel|2|2|ⓍⓌ, ↷: Ⓝ deals x damage to target attacking or blocking creature.Ⓔ
Ⓢbandage|Ⓦ|C|instant|prevent the next 1 damage that would be dealt to any target th

### save cardtexts

In [12]:
np.save('data/card_texts.npy', cardtext)

### analysis & testing

In [13]:
txtlens = [len(s) for s in cardtext]

In [14]:
lencnts = Counter(txtlens)
zips = lencnts.most_common()
x = sorted(zips, key = lambda x: x[0], reverse=True)

### determine a good max sequence length

due to the `keras` fixed-graph structure, we need to set all inputs to the same length.

In [16]:
idx = 256
over = sum([t[1] for t in x if t[0] > idx])
undr = sum([t[1] for t in x if t[0] <= idx])
print("% of cards over chosen idx:", over/(over+undr))

% of cards over chosen idx: 0.05520905520905521


In [17]:
print(cardtext[txtlens.index(max(txtlens))])

Ⓢdance of the dead|①Ⓑ|U|enchantment|aura|enchant creature card in a graveyard·when Ⓝ enters the battlefield, if it's on the battlefield, it loses "enchant creature card in a graveyard" and gains "enchant creature put onto the battlefield with Ⓝ." put enchanted creature card onto the battlefield tapped under your control and attach Ⓝ to it. when Ⓝ leaves the battlefield, that creature's controller sacrifices it.·enchanted creature gets +1/+1 and doesn't untap during its controller's untap step.·at the beginning of the upkeep of enchanted creature's controller, that player may pay ①Ⓑ. if the player does, untap that creature.Ⓔ


## vocabulary

START: Ⓢ  
EOS/PAD: Ⓔ  

we will use the EOS symbol as the padding symbol as well because recycling

In [18]:
# first get vocabulary
vocab = []
vocabset = ['Ⓔ'] # zero-pad PAD character
for card in cardtext:
    lst = list(card)
    vocab += lst

In [19]:
# add vocab according to frequency
vocabset += [x[0] for x in sorted([t for t in Counter(vocab).most_common() if t[0] != 'Ⓔ'], key=lambda x: x[1], reverse=True)]
vocabset.append('Ⓢ') # START-PAD character

In [20]:
c2i = dict([(c, i) for i, c in enumerate(vocabset)])
i2c = dict([(i, c) for i, c in enumerate(vocabset)])

In [21]:
c2i['Ⓔ'], c2i[' '], i2c[0], i2c[max(i2c.keys())]

(0, 1, 'Ⓔ', 'Ⓢ')

In [22]:
# save
np.save('data/c2i.npy', c2i)
np.save('data/i2c.npy', i2c)

## encode

here we encode the inputs and outputs for training. the output will be the input, offset by one, such that it is the 'next' symbol:

```
# replacing start with $ and end with & for spacing

in : $  H  E  L  L  O  _  W  O  R  L  D  &  &
out: H  E  L  L  O  _  W  O  R  L  D  &  &  &
```

this says, "when you see the start symbol, predict an 'H', and then when you see an 'H', predict an 'E' etc. etc.

In [23]:
def encode(lol, maxlen=256):
    x, y = [], []
    for l in lol:
        l = [c2i[c] for c in l]
        l = l[:maxlen]
        while len(l) < maxlen:
            l.append(0)
        x.append(l)
        l = l[1:]
        l.append(0)
        y.append(l)
    return np.array(x), np.array(y)

In [24]:
xcards, ycards = encode(cardtext)

In [25]:
xcards.shape, ycards.shape

((30303, 256), (30303, 256))

In [26]:
np.save('data/xcards.npy', xcards)
np.save('data/ycards.npy', ycards)