In [2]:
import codecs
import collections
import gzip
import itertools
import json
import re
from operator import itemgetter

import nltk.stem, nltk.corpus
import wget
from gensim import corpora, models, similarities

# Generate documents

In [3]:
try:
    fname = 'AllCards.json.gz'
    cards = json.load(gzip.open(fname, 'rt'))
except FileNotFoundError:
    url = 'https://mtgjson.com/json/AllCards.json.gz'
    fname = wget.download(url)
    cards = json.load(gzip.open(fname, 'rt'))

In [4]:
cards['Mossdog']

{'cmc': 1,
 'colorIdentity': ['G'],
 'colors': ['Green'],
 'imageName': 'mossdog',
 'layout': 'normal',
 'manaCost': '{G}',
 'name': 'Mossdog',
 'power': '1',
 'subtypes': ['Plant', 'Hound'],
 'text': 'Whenever Mossdog becomes the target of a spell or ability an opponent controls, put a +1/+1 counter on Mossdog.',
 'toughness': '1',
 'type': 'Creature — Plant Hound',
 'types': ['Creature']}

In [5]:
card_names = list(cards.keys())
with open('card_names.txt', 'w') as f:
    f.writelines(t + '\n' for t in card_names)

In [6]:
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/christopher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
stemmer = nltk.stem.snowball.SnowballStemmer('english')

In [8]:
def tokenize(card):
    text = ' '.join([card.get('text', '')]
                   # + card.get('types', [])
                   + card.get('subtypes', [])
                    )
    text = text.lower()
    ## Replace card name with ~
    text = text.replace(card['name'].lower(), '~')
    ## remove reminder text (in parentheses)
    text = re.sub(r'\([^)]+\)', '', text)
    ## remove costs
    text = re.sub(r'\{[^}]+\}', '', text)
    ## genericize all p/t (de)buffs
    text = re.sub(r'([+-])[\dX*]/([+-])[\dX*]', r'\1X/\2X', text)
    ## genericize numbers
    text = re.sub(r'\d+', 'N', text)
    ## split on punctuation and spaces
    tokens = re.split(r'[\s.,;:—()]+', text)
    # use only unique tokens?
    # tokens = set(tokens)
    # stem tokens
    tokens = (stemmer.stem(t) for t in tokens if t and t not in stopwords)

    ## The following allows us to singularize certain terms.
    ## For example, the word 'equip' is way over-represented on equipment
    counter = collections.Counter(tokens)
    if counter['equip']:
        counter['equip'] = 1
    
    tokens = itertools.chain.from_iterable([token] * count for token, count in counter.items())
        
    return list(tokens)

documents = [tokenize(c) for c in cards.values()]
# next(documents)

In [9]:
dictionary = corpora.Dictionary(documents)
dictionary.save('dictionary.dict')

In [10]:
print(len(dictionary))

1674


In [11]:
corpus = [dictionary.doc2bow(doc) for doc in documents]
corpora.MmCorpus.serialize('card_text_corpus.mm', corpus)

# Model - LSI

In [12]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [13]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)
corpus_lsi = lsi[corpus_tfidf]

In [14]:
lsi.print_topics()[:5]

[(0,
  '0.554*"creatur" + 0.437*"~" + 0.251*"target" + 0.231*"card" + 0.199*"control" + 0.175*"turn" + 0.154*"battlefield" + 0.151*"player" + 0.136*"damag" + 0.131*"n"'),
 (1,
  '-0.559*"card" + 0.517*"creatur" + -0.242*"librari" + -0.229*"~" + -0.175*"put" + 0.155*"enchant" + -0.150*"player" + -0.142*"battlefield" + -0.140*"hand" + 0.129*"turn"'),
 (2,
  '0.634*"~" + -0.454*"card" + -0.348*"creatur" + -0.200*"librari" + 0.195*"counter" + 0.148*"enter" + -0.135*"enchant" + 0.121*"battlefield" + -0.097*"reveal" + -0.097*"target"'),
 (3,
  '-0.424*"player" + -0.420*"damag" + -0.350*"n" + -0.331*"deal" + -0.275*"target" + 0.239*"+x/+x" + 0.219*"battlefield" + 0.191*"get" + 0.167*"end" + 0.146*"enter"'),
 (4,
  '0.573*"turn" + 0.439*"end" + -0.280*"creatur" + -0.272*"enchant" + 0.247*"target" + 0.235*"get" + -0.198*"battlefield" + 0.175*"gain" + 0.146*"+x/+x" + -0.134*"enter"')]

In [15]:
card = cards['Windfall']
card['text']

'Each player discards his or her hand, then draws cards equal to the greatest number of cards a player discarded this way.'

In [16]:
vec_bow = dictionary.doc2bow(tokenize(card))
vec_lsi = lsi[vec_bow]
vec_lsi[:5]

[(0, 1.0192800887160796),
 (1, -1.8133921488776878),
 (2, -1.2717751696568453),
 (3, -0.78778770432061562),
 (4, 0.33429558454834479)]

# Create index

In [17]:
index = similarities.MatrixSimilarity(corpus_lsi)
index.save('all_cards_lsi.index')

In [18]:
def similarity(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lsi = lsi[tfidf[vec_bow]]
    scores = index[vec_lsi]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [19]:
similarity(card)[:10]

[(4209, 1.0),
 (39, 0.99998015),
 (1905, 0.94413352),
 (4282, 0.85083342),
 (9076, 0.8506766),
 (7761, 0.83938605),
 (12776, 0.83860302),
 (8821, 0.81864858),
 (11430, 0.80684882),
 (3508, 0.79541689)]

In [20]:
def get_similar_cards(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score,
                    ' '.join([this_card['name'], this_card.get('manaCost', '')]),
                    this_card['text'],
                    ))
        if len(similar_cards) >= N:
            break
    return similar_cards
        

# Query similar cards

In [21]:
get_similar_cards('Cadaverous Bloom')

[(0.87870228,
  'Inner Fire {3}{R}',
  'Add {R} to your mana pool for each card in your hand.'),
 (0.84190923,
  'Thought Vessel {2}',
  'You have no maximum hand size.\n{T}: Add {C} to your mana pool.'),
 (0.84190923,
  'Reliquary Tower ',
  'You have no maximum hand size.\n{T}: Add {C} to your mana pool.'),
 (0.83300889,
  'Simian Spirit Guide {2}{R}',
  'Exile Simian Spirit Guide from your hand: Add {R} to your mana pool.'),
 (0.80337292,
  'Chrome Mox {0}',
  "Imprint — When Chrome Mox enters the battlefield, you may exile a nonartifact, nonland card from your hand.\n{T}: Add one mana of any of the exiled card's colors to your mana pool."),
 (0.8020587,
  'Scavenger Grounds ',
  '{T}: Add {C} to your mana pool.\n{2}, {T}, Sacrifice a Desert: Exile all cards from all graveyards.'),
 (0.76960075,
  'Serum Powder {3}',
  '{T}: Add {C} to your mana pool.\nAny time you could mulligan and Serum Powder is in your hand, you may exile all the cards from your hand, then draw that many cards.

In [22]:
cards['Bonehoard']['text']

'Living weapon (When this Equipment enters the battlefield, create a 0/0 black Germ creature token, then attach this to it.)\nEquipped creature gets +X/+X, where X is the number of creature cards in all graveyards.\nEquip {2}'

In [23]:
tokenize(cards['Bonehoard'])

['x',
 'live',
 'creatur',
 'creatur',
 '+x/+x',
 'get',
 'number',
 'card',
 'equip',
 'graveyard',
 'weapon']

# Model - LDA

In [24]:
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=100)
corpus_lda = lda[corpus]

In [25]:
index = similarities.MatrixSimilarity(corpus_lda)
index.save('all_cards_lda.index')

In [26]:
def similarity_lda(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_lda = lda[vec_bow]
    scores = index[vec_lda]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [27]:
def get_similar_cards_lda(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_lda(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [28]:
get_similar_cards_lda('Fatal Push')

[(0.9734683,
  'Sun Titan',
  'Vigilance\nWhenever Sun Titan enters the battlefield or attacks, you may return target permanent card with converted mana cost 3 or less from your graveyard to the battlefield.'),
 (0.72890788,
  'Deceiver Exarch',
  'Flash (You may cast this spell any time you could cast an instant.)\nWhen Deceiver Exarch enters the battlefield, choose one —\n• Untap target permanent you control.\n• Tap target permanent an opponent controls.'),
 (0.72890788,
  'Foul Renewal',
  'Return target creature card from your graveyard to your hand. Target creature gets -X/-X until end of turn, where X is the toughness of the card returned this way.'),
 (0.72890788,
  'Ingot Chewer',
  "When Ingot Chewer enters the battlefield, destroy target artifact.\nEvoke {R} (You may cast this spell for its evoke cost. If you do, it's sacrificed when it enters the battlefield.)"),
 (0.72890788,
  'Cataclysmic Gearhulk',
  'Vigilance\nWhen Cataclysmic Gearhulk enters the battlefield, each play

In [29]:
tokenize(cards['Fatal Push'])

['n',
 'n',
 'instead',
 'mana',
 'mana',
 'target',
 'less',
 'less',
 'destroy',
 'destroy',
 'convert',
 'convert',
 'left',
 'perman',
 'creatur',
 'creatur',
 'control',
 'battlefield',
 'revolt',
 'cost',
 'cost',
 'turn']

In [30]:
cards['Fatal Push']

{'cmc': 1,
 'colorIdentity': ['B'],
 'colors': ['Black'],
 'imageName': 'fatal push',
 'layout': 'normal',
 'manaCost': '{B}',
 'name': 'Fatal Push',
 'text': 'Destroy target creature if it has converted mana cost 2 or less.\nRevolt — Destroy that creature if it has converted mana cost 4 or less instead if a permanent you controlled left the battlefield this turn.',
 'type': 'Instant',
 'types': ['Instant']}

# Model - HDP

In [31]:
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]

In [32]:
index = similarities.MatrixSimilarity(corpus_hdp)
index.save('all_cards_hdp.index')

In [33]:
def similarity_hdp(card):
    vec_bow = dictionary.doc2bow(tokenize(card))
    vec_hdp = hdp[vec_bow]
    scores = index[vec_hdp]
    return sorted(enumerate(scores),
            key=itemgetter(1), reverse=True)

In [34]:
def get_similar_cards_hdp(card_name, N=10):
    card = cards[card_name]
    similarity_scores = similarity_hdp(card)
    similar_cards = []
    for name_idx, score in similarity_scores:
        this_card_name = card_names[name_idx]
        if this_card_name != card_name:
            this_card = cards[this_card_name]
            similar_cards.append((score, this_card['name'], this_card.get('text', '')))
        if len(similar_cards) >= N:
            break
    return similar_cards

In [35]:
get_similar_cards_hdp('Windfall')

[(1.0,
  'Zealot of the God-Pharaoh',
  '{4}{R}: Zealot of the God-Pharaoh deals 2 damage to target opponent.'),
 (1.0,
  'Firebrand Archer',
  'Whenever you cast a noncreature spell, Firebrand Archer deals 1 damage to each opponent.'),
 (1.0,
  'Sword of Feast and Famine',
  'Equipped creature gets +2/+2 and has protection from black and from green.\nWhenever equipped creature deals combat damage to a player, that player discards a card and you untap all lands you control.\nEquip {2}'),
 (1.0,
  'Geist of the Lonely Vigil',
  "Defender, flying\nDelirium — Geist of the Lonely Vigil can attack as though it didn't have defender as long as there are four or more card types among cards in your graveyard."),
 (1.0,
  'Poisonbelly Ogre',
  'Whenever another creature enters the battlefield, its controller loses 1 life.'),
 (1.0,
  'Ashen Monstrosity',
  'Haste\nAshen Monstrosity attacks each turn if able.'),
 (1.0,
  'Luminesce',
  'Prevent all damage that black sources and red sources would 

In [36]:
hdp.print_topics()

[(0,
  '0.048*creatur + 0.043*~ + 0.027*card + 0.026*target + 0.019*control + 0.018*turn + 0.016*player + 0.015*battlefield + 0.014*n + 0.013*end'),
 (1,
  '0.012*~ + 0.012*creatur + 0.007*target + 0.007*control + 0.007*turn + 0.005*card + 0.005*counter + 0.004*end + 0.004*whenev + 0.004*+x/+x'),
 (2,
  '0.008*creatur + 0.007*~ + 0.006*either + 0.005*deal + 0.004*target + 0.004*turn + 0.004*whenev + 0.003*brisela + 0.003*unleash + 0.003*card'),
 (3,
  '0.007*creatur + 0.007*card + 0.006*~ + 0.005*target + 0.004*freyalis + 0.004*gain + 0.004*counter + 0.003*chao + 0.003*add + 0.003*player'),
 (4,
  '0.008*~ + 0.007*card + 0.007*creatur + 0.005*mana + 0.005*battlefield + 0.004*librari + 0.004*n + 0.004*pool + 0.004*target + 0.004*put'),
 (5,
  '0.006*creatur + 0.005*~ + 0.005*minor + 0.004*target + 0.004*unspeak + 0.004*away + 0.004*shorter + 0.004*refer + 0.003*card + 0.003*end'),
 (6,
  '0.009*creatur + 0.007*~ + 0.006*turn + 0.005*target + 0.004*card + 0.004*gain + 0.004*brother + 0.0