update: change {c} to colorless diamond symbol: Ⓒ

In [1]:
import json
import numpy as np
import pandas as pd
import re

from collections import Counter

## read card data from json

json file available at https://mtgjson.com

In [2]:
data = pd.read_json("data//AllSets.json", 'r')

In [3]:
# example card
card = data['M15']['cards'][15]
card

{'artist': 'Thomas Denmark',
 'cmc': 2,
 'colorIdentity': ['W'],
 'colors': ['White'],
 'flavor': '"If a boggart even dares breathe near one of my kin, I\'ll know. And I\'ll not be happy."',
 'id': 'b373a0db34c644c48fe0245610a3c32ff144179b',
 'imageName': 'kinsbaile skirmisher',
 'layout': 'normal',
 'manaCost': '{1}{W}',
 'mciNumber': '16',
 'multiverseid': 383292,
 'name': 'Kinsbaile Skirmisher',
 'number': '16',
 'power': '2',
 'rarity': 'Common',
 'subtypes': ['Kithkin', 'Soldier'],
 'text': 'When Kinsbaile Skirmisher enters the battlefield, target creature gets +1/+1 until end of turn.',
 'toughness': '2',
 'type': 'Creature — Kithkin Soldier',
 'types': ['Creature']}

## parse data into lists

In [4]:
setnm = [] # {set}
idnum = [] # 'id'
names = [] # 'name'
mcost = [] # 'manaCost'
power = [] # 'power'
tough = [] # 'toughness'
rarty = [] # 'rarity'
types = [] # 'types'
subtp = [] # 'subtypes'
ctext = [] # 'text'
layot = [] # 'layout' >>> use to confirm not flippy or split

split_excludes = 0

def addcard(this_set, card):
    
    # add values
    setnm.append(this_set)
    idnum.append(card['id'])
    names.append(card['name'])
    rarty.append(card['rarity'])
    layot.append(card['layout'])

    if 'types' in card.keys():
        types.append(' ' .join(card['types']))
    else:
        types.append('token')

    if 'manaCost' in card.keys():
        mcost.append(card['manaCost'])
    else:
        mcost.append(0)

    if 'text' in card.keys():
        ctext.append(card['text'])
    else:
        ctext.append('')    

    if 'subtypes' in card.keys():
        subtp.append(card['subtypes'])
    else:
        subtp.append(['PAD'])

    if 'power' in card.keys():
        power.append(card['power'])
    else:
        power.append('⌧')

    if 'toughness' in card.keys():
        tough.append(card['toughness'])
    else:
        tough.append('⌧')
    
    return

for this_set in list(data.columns):
    set_cards = data[this_set]['cards']
    for card in set_cards:
        
        # only get normal(not flippy ,split, etc) cards, 
        # and exclude pwalkers, conspiracies, unsets, and tokens (no type)
        # and exclude lands and hybrid mana costs for now...
        # and exclude cards with funky mana too
        if card['layout'] == 'normal' and 'types' in card.keys():
            
            if card['types'] not in ('Conspiracy', 'Planeswalker') and this_set not in ('UGL', 'UNH'):

                if 'manaCost' in card.keys():
                    
                    if re.search(r'\{[0-9A-Z]\/[0-9A-Z]\}', card['manaCost']) == None:
                        
                        # check creature for strange manacost
                        if 'toughness' in card.keys() and 'power' in card.keys():
                        
                            if re.search(r'[^0-9]', card['power']) == None and re.search(r'[^0-9]', card['toughness']) == None:
                            
                                addcard(this_set, card)
                                
                            else:
                                
                                split_excludes += 1
                        
                        else:
                            
                            addcard(this_set, card)
                            
                    else:
                        
                        split_excludes += 1
                
print("total included cards:", len(idnum))
print("excluded b/c funny mcost or P/T:", split_excludes)

total included cards: 29347
excluded b/c funny mcost or P/T: 762


## analysis

In [5]:
Counter(rarty).most_common()

[('Common', 10505),
 ('Uncommon', 8684),
 ('Rare', 7782),
 ('Special', 1441),
 ('Mythic Rare', 935)]

In [6]:
Counter(types).most_common()

[('Creature', 14135),
 ('Instant', 4068),
 ('Sorcery', 3722),
 ('Enchantment', 3711),
 ('Artifact', 2408),
 ('Artifact Creature', 933),
 ('Planeswalker', 179),
 ('Enchantment Creature', 98),
 ('Tribal Instant', 33),
 ('Tribal Sorcery', 26),
 ('Tribal Enchantment', 21),
 ('Enchantment Artifact', 8),
 ('Tribal Artifact', 5)]

In [7]:
allsubtypes = [st for lst in subtp for st in lst]
print("total number of subtypes:", len(set(allsubtypes)))
Counter(allsubtypes).most_common()[:10]

total number of subtypes: 262


[('PAD', 11717),
 ('Human', 3069),
 ('Aura', 1659),
 ('Wizard', 951),
 ('Warrior', 949),
 ('Soldier', 925),
 ('Elemental', 672),
 ('Elf', 663),
 ('Zombie', 649),
 ('Spirit', 620)]

In [8]:
max([len(l) for l in subtp]), subtp[np.argmax([len(l) for l in subtp])]

(4, ['Centaur', 'Druid', 'Scout', 'Archer'])

In [9]:
# char-level text length analysis - uncomment to see
lens = Counter([len(n) for n in ctext]).most_common()
x = sorted(lens, key=lambda x: x[0], reverse=True)
y = [t[0] for t in x]
z = [t[1] for t in x]
print('max length threshhold:', max(y), "chars")
s = 0
for i in range(len(y)):
    s += z[i] 
    if s > int(sum(z)*0.05):
        print('95% length threshhold:', y[i], "chars")
        break

max length threshhold: 656 chars
95% length threshhold: 281 chars


In [10]:
# word-level text length analysis - uncomment to see
lens = Counter([len(n.split()) for n in ctext]).most_common()
x = sorted(lens, key=lambda x: x[0], reverse=True)
y = [t[0] for t in x]
z = [t[1] for t in x]
print('max length threshhold:', max(y), "words")
s = 0
for i in range(len(y)):
    s += z[i] 
    if s > int(sum(z)*0.05):
        print('95% length threshhold:', y[i], "words")
        break

max length threshhold: 113 words
95% length threshhold: 49 words


In [11]:
# char-level name length analysis - uncomment to see
lens = Counter([len(n) for n in names]).most_common()
x = sorted(lens, key=lambda x: x[0], reverse=True)
y = [t[0] for t in x]
z = [t[1] for t in x]
print('max length threshhold:', max(y), "chars")
s = 0
for i in range(len(y)):
    s += z[i] 
    if s > int(sum(z)*0.05):
        print('95% length threshhold:', y[i], "chars")
        break

max length threshhold: 31 chars
95% length threshhold: 21 chars


In [12]:
# longest name, longest ability text
maxname = max([len(n) for n in names])
maxidx = np.argmax([len(n) for n in names])
print("longest name:", names[maxidx], maxname)
print('')
maxtext = max([len(n.split()) for n in ctext])
maxtidx = np.argmax([len(n.split()) for n in ctext])
print("longest text:", '\n', names[maxtidx], '\n', ctext[maxtidx], maxtext)

longest name: Circle of Protection: Artifacts 31

longest text: 
 Master of the Hunt 
 {2}{G}{G}: Create a 1/1 green Wolf creature token named Wolves of the Hunt. It has "bands with other creatures named Wolves of the Hunt." (Any creatures named Wolves of the Hunt can attack in a band as long as at least one has "bands with other creatures named Wolves of the Hunt." Bands are blocked as a group. If at least two creatures named Wolves of the Hunt you control, one of which has "bands with other creatures named Wolves of the Hunt," are blocking or being blocked by the same creature, you divide that creature's combat damage, not its controller, among any of the creatures it's being blocked by or is blocking.) 113


## preprocessing

replace certain things with unique tokens

In [13]:
pad_sub = max([len(l) for l in subtp])

for i in range(len(idnum)):
    # lowercase'⌧')
    ctext[i] = ctext[i].lower()
    names[i] = names[i].lower()
    
    # replace newlines with pipes
    ctext[i] = ctext[i].replace('\n', '|')
    
    # replace cardname in text with 'Ⓝ'
    ctext[i] = ctext[i].replace(names[i], 'Ⓝ')

    # replace tap {T} with '↷' in text :
    ctext[i] = ctext[i].replace('{t}', '↷')
    
    # replace '+1/+1', '-1/-1' (?=counter) with '⊕', '⊖' in text:
    # ctext[i] = re.sub(r'\+1\/\+1(?=\scounter)', '⊕', ctext[i])
    # ctext[i] = re.sub(r'\-1\/\-1(?=\scounter)', '⊕', ctext[i])
    
    # MANA COSTS: 
    # replace {C} with 'Ⓦ', 'Ⓤ', 'Ⓑ', 'Ⓡ', 'Ⓖ' in ctext, mcost
    colors = ['{W}', '{U}', '{B}', '{R}', '{G}', '{C}', '{w}', '{u}', '{b}', '{r}', '{g}', '{c}']
    symbols = ['Ⓦ', 'Ⓤ', 'Ⓑ', 'Ⓡ', 'Ⓖ', 'Ⓒ', 'Ⓦ', 'Ⓤ', 'Ⓑ', 'Ⓡ', 'Ⓖ', 'Ⓒ']
    for j, color in enumerate(colors):
        ctext[i] = ctext[i].replace(color, symbols[j])
        mcost[i] = mcost[i].replace(color, symbols[j])
        
    # replace {#} with number icon and {X} with 'Ⓧ':
    numbers = ['{x}', '{X}', '{1}', '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}', '{10}', '{11}', '{12', '{13}', '{14}', '{15}', '{16}', '{17}', '{18}', '{19}', '{20}']
    nsymbols= ['Ⓧ', 'Ⓧ', '①', '②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩', '⑪', '⑫', '⑬', '⑭', '⑮', '⑯', '⑰', '⑱', '⑲', '⑳']
    for j, number in enumerate(numbers):
        ctext[i] = ctext[i].replace(number, nsymbols[j])
        mcost[i] = mcost[i].replace(number, nsymbols[j])
        
    # pad subtype with '⌧'
    while len(subtp[i]) < pad_sub:
        if type(subtp[i]) == str:
            print(subtp[i])
        else:
            subtp[i].append('PAD')

## create a `pandas` dataframe and save data

In [14]:
carddf = pd.DataFrame({
    'set' : setnm,
    'card_id' : idnum,
    'card_name' : names,
    'mana_cost' : mcost,
    'rarity' : rarty,
    'type' : types,
    'subtypes' : subtp,
    'card_text' : ctext,
    'power' : power,
    'toughness' : tough    
})

carddf = carddf[['set', 'card_id', 'rarity', 'card_name', 'mana_cost', 'type', 'subtypes', 'power', 'toughness', 'card_text']]

In [15]:
carddf.to_csv('data//card_database.csv', index=False)
np.save('data//card_database.npy', carddf)

In [16]:
np.save('data//numpy//setnm.npy', setnm)
np.save('data//numpy//idnum.npy', idnum)
np.save('data//numpy//names.npy', names)
np.save('data//numpy//mcost.npy', mcost)
np.save('data//numpy//rarty.npy', rarty)
np.save('data//numpy//types.npy', types)
np.save('data//numpy//subtp.npy', subtp)
np.save('data//numpy//ctext_orig.npy', ctext)
np.save('data//numpy//power.npy', power)
np.save('data//numpy//tough.npy', tough) 

## view some cards

In [17]:
iters = iter([i for i in range(len(idnum))])

In [18]:
i = next(iters)
print(names[i], '\t', mcost[i])
print('')
if subtp[i][0] != '⌧':
    sub = '- ' + ' '.join(subtp[i])
else:
    sub = '\t'
print(types[i], sub, '(', setnm[i], rarty[i], ')')
print('')
print(ctext[i].replace('|', '\n\n'))
print('')
if power[i] != '⌧' and tough[i] != '⌧':
    print(power[i], '/', tough[i])

ancestor's chosen 	 ⑤ⓌⓌ

Creature - Human Cleric PAD PAD ( 10E Uncommon )

first strike (this creature deals combat damage before creatures without first strike.)

when Ⓝ enters the battlefield, you gain 1 life for each card in your graveyard.

4 / 4


## testing ability vocab

In [19]:
# preprocessing: properly space all symbols
# also, remove all hint text (text in parentheses)
dels = []
dtext = []
for idx, txt in enumerate(ctext):
    
    if re.search(r'\([0-9A-Za-z\s\.\,\'\"]+\)', txt):
        dels.append(re.search(r'\([0-9A-Za-z\s\.\,\'\"]+\)', txt))
    
    txt = re.sub(r'\([0-9A-Za-z\s\.\,\'\"]+\)', '', txt)
    
    rep = re.findall(r'[^0-9A-Za-z\s\'\-\+\/]', txt)
    for r in rep:
        txt = txt.replace(r, ' '+r+' ')
    txt = re.sub(r'[\s]+', ' ', txt)
    dtext.append(txt)

In [20]:
i = next(iters)
print(names[i], '\t', mcost[i])
print('')
if subtp[i][0] != '⌧':
    sub = '- ' + ' '.join(subtp[i])
else:
    sub = '\t'
print(types[i], sub, '(', setnm[i], rarty[i], ')')
print('')
print(dtext[i].replace('|', '\n\n'))
print('')
if power[i] != '⌧' and tough[i] != '⌧':
    print(power[i], '/', tough[i])

angel of mercy 	 ④Ⓦ

Creature - Angel PAD PAD PAD ( 10E Uncommon )

flying 

 when Ⓝ enters the battlefield , you gain 3 life . 

3 / 3


In [21]:
# split on space and get vocab
dlists = [x.split(' ') for x in dtext]
dvocab = [st for lst in dlists for st in lst]
dcount = Counter(dvocab)
len(set(dvocab))

1779

In [22]:
dcount.most_common()

[('.', 46141),
 ('', 34506),
 (',', 25353),
 ('creature', 19094),
 ('Ⓝ', 18083),
 ('|', 17389),
 ('the', 16157),
 ('a', 15708),
 ('you', 15145),
 ('of', 13441),
 ('target', 12140),
 ('to', 11754),
 ('your', 11423),
 (':', 9671),
 ('card', 8336),
 ('or', 7938),
 ('turn', 7566),
 ('it', 7385),
 ('battlefield', 6720),
 ('this', 6570),
 ('that', 6378),
 ('player', 6180),
 ('damage', 6100),
 ('and', 5890),
 ('control', 5876),
 ('end', 5482),
 ('until', 4881),
 ('each', 4591),
 ('put', 4525),
 ('if', 4479),
 ('may', 4383),
 ('flying', 4374),
 ('creatures', 4321),
 ('whenever', 4277),
 ('on', 4221),
 ('when', 4220),
 ('enters', 4122),
 ('deals', 4093),
 ('from', 4045),
 ('library', 3926),
 ('↷', 3874),
 ('with', 3800),
 ('as', 3778),
 ('gets', 3738),
 ('cards', 3662),
 ('hand', 3598),
 ('for', 3594),
 ('+1/+1', 3366),
 ('mana', 3347),
 ('at', 3190),
 ('an', 3164),
 ('counter', 3148),
 ('its', 3064),
 ('life', 3025),
 ('graveyard', 2930),
 ('cast', 2829),
 ('sacrifice', 2772),
 ('①', 2758),
 (

In [23]:
np.save('data//numpy//ctext.npy', dtext)