# Who is Wayne? Joker or Batman?

## Corpus

In [1]:
# Raw English text that the rest of the notebook analyses.
text = 'Batman is an American superhero.  The Joker is a supervillain that embodies the ideas of anarchy and chaos. The Joker and Batman fight the battle for Gotham’s soul. Like them, Bruce Wayne, an American billionaire is also from Gotham City. Is he hiding behind the face of these two?'

# Words whose contexts are collected and compared; written in lower case,
# matching the lower-cased `corpus` below.
targets = ['batman', 'wayne', 'joker']

# Lower-cased copy of the text.
corpus = text.lower()

## Lemmatization

### Init NLTK

In [2]:
import nltk

# Best-effort fetch of the NLTK resources. Each download is attempted on its
# own so one failure does not skip the others, and a failure is reported
# instead of being silently discarded (the previous bare `except:` also
# swallowed KeyboardInterrupt and SystemExit).
for resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    try:
        nltk.download(resource)
    except Exception as exc:
        print('Could not download NLTK resource {!r}: {}'.format(resource, exc))

[nltk_data] Downloading package wordnet to /Users/shawon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/shawon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shawon/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

### Lemmatize

In [4]:
# Single WordNetLemmatizer instance, used by lemmatize_sentence.
lemmatizer = WordNetLemmatizer()

In [5]:
def lemmatize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize a sentence.

    Args:
        sentence: Text to lemmatize.

    Returns:
        A string of the sentence's tokens joined by single spaces, where each
        token whose tag maps to a WordNet part of speech is replaced by its
        lemma and every other token is kept unchanged.
    """
    # First letter of the tagger's tag -> POS code passed to
    # WordNetLemmatizer.lemmatize. Adjective tags start with 'J' (e.g. 'JJ'),
    # so they must be translated to WordNet's 'a'; lower-casing the tag alone
    # never yields 'a', which left adjectives un-lemmatized.
    tag_to_wordnet = {'j': 'a', 'a': 'a', 'n': 'n', 'v': 'v', 'r': 'r'}

    lemmas = []
    for word, tag in pos_tag(word_tokenize(sentence)):
        # tag[:1] (rather than tag[0]) tolerates an empty tag string.
        wntag = tag_to_wordnet.get(tag[:1].lower())
        if wntag is None:
            lemmas.append(word)
        else:
            lemmas.append(lemmatizer.lemmatize(word, wntag))

    return ' '.join(lemmas)

In [6]:
# Naive sentence split: break the corpus at every '.' character.
sentences = corpus.split('.')
# Starts empty; receives one lemmatized string per entry of `sentences`.
lemmatized_sentences = []

In [7]:
# Lemmatize every sentence, appending the results in their original order.
lemmatized_sentences.extend(lemmatize_sentence(raw) for raw in sentences)

In [8]:
# Display the lemmatized sentences for inspection.
lemmatized_sentences

['batman be an american superhero',
 'the joker be a supervillain that embody the idea of anarchy and chaos',
 'the joker and batman fight the battle for gotham ’ s soul',
 'like them , bruce wayne , an american billionaire be also from gotham city',
 'be he hide behind the face of these two ?']

In [9]:
# Display the sentences re-joined with '.'; the result is only shown, not stored.
'.'.join(lemmatized_sentences)

'batman be an american superhero.the joker be a supervillain that embody the idea of anarchy and chaos.the joker and batman fight the battle for gotham ’ s soul.like them , bruce wayne , an american billionaire be also from gotham city.be he hide behind the face of these two ?'

### Target by Context matrix

In [10]:
# One (initially empty) list of context words per target.
target_v_context = {name: [] for name in targets}

target_v_context

{'batman': [], 'wayne': [], 'joker': []}

In [11]:
# Map every token of the lemmatized sentences to its POS tag. A token that
# occurs more than once keeps the tag assigned at its last occurrence.
tag_dict = {}
for lemma_line in lemmatized_sentences:
    tag_dict.update(pos_tag(word_tokenize(lemma_line)))

In [12]:
# Display the token -> POS tag mapping for inspection.
tag_dict

{'batman': 'NN',
 'be': 'VB',
 'an': 'DT',
 'american': 'JJ',
 'superhero': 'NN',
 'the': 'DT',
 'joker': 'NN',
 'a': 'DT',
 'supervillain': 'NN',
 'that': 'WDT',
 'embody': 'VBP',
 'idea': 'NN',
 'of': 'IN',
 'anarchy': 'NN',
 'and': 'CC',
 'chaos': 'NN',
 'fight': 'VBD',
 'battle': 'NN',
 'for': 'IN',
 'gotham': 'JJ',
 '’': 'NNP',
 's': 'NN',
 'soul': 'NN',
 'like': 'IN',
 'them': 'PRP',
 ',': ',',
 'bruce': 'VB',
 'wayne': 'NN',
 'billionaire': 'NN',
 'also': 'RB',
 'from': 'IN',
 'city': 'NN',
 'he': 'PRP',
 'hide': 'VB',
 'behind': 'IN',
 'face': 'NN',
 'these': 'DT',
 'two': 'CD',
 '?': '.'}

In [13]:
def valid_context_word(word):
    """Tell whether `word` may be used as a context word.

    A word qualifies when the first letter of its tag in `tag_dict`,
    compared case-insensitively, is 'j', 'n' or 'v'.

    Raises:
        KeyError: if `word` has no entry in `tag_dict`.
    """
    tag_initial = tag_dict[word].lower()[0]
    return tag_initial in ('j', 'n', 'v')

def process_token_subarray(sub, window_limit):
    """Collect the leading valid context words of `sub`.

    Args:
        sub: Sequence of tokens, scanned from its first element.
        window_limit: Bound on the number of words kept. Because the check is
            `<=`, up to `window_limit + 1` words are returned.
            NOTE(review): confirm whether the extra word is intended.

    Returns:
        A list of the accepted tokens, in the order they appear in `sub`.
    """
    kept = []
    for candidate in sub:
        is_valid = valid_context_word(candidate)
        # len(kept) is the number of words accepted so far.
        if is_valid and len(kept) <= window_limit:
            kept.append(candidate)
    return kept

In [14]:
def process_list(word_list):
    """Add every word of `word_list` to the module-level `context_words` set."""
    context_words.update(word_list)

In [15]:
word_window = 5
context_words = set()
# Start from empty context lists so that re-running this cell does not append
# the same context words a second time.
target_v_context = {target: [] for target in targets}

# enumerate() supplies the true position of each sentence and token;
# list.index() only ever finds the first occurrence, which is wrong as soon as
# a sentence or a target word is repeated.
for sentence_no, sentence in enumerate(lemmatized_sentences):
    if sentence != '':
        print('\n# i: {}\nSentence: {}'.format(sentence_no, sentence))

    tokens = sentence.split(' ')
    for position, token in enumerate(tokens):
        if token not in targets:
            continue

        print('Target: {}'.format(token))
        context_words.add(token)

        # Split at this occurrence of the target; `right` starts with the
        # target itself.
        left = tokens[:position]
        right = tokens[position:]

        # Scan the left side outwards from the target so the words nearest to
        # it are kept, then restore sentence order.
        processed_left = process_token_subarray(left[::-1], word_window)[::-1]
        processed_right = process_token_subarray(right, word_window)

        captured = processed_left + processed_right
        process_list(captured)

        print('Captured context words : {}'.format(captured))
        # Extend this target's context list with the words just captured.
        target_v_context[token] = target_v_context[token] + captured


# i: 0
Sentence: batman be an american superhero
Target: batman
Captured context words : ['batman', 'be', 'american', 'superhero']

# i: 1
Sentence: the joker be a supervillain that embody the idea of anarchy and chaos
Target: joker
Captured context words : ['joker', 'be', 'supervillain', 'embody', 'idea', 'anarchy']

# i: 2
Sentence: the joker and batman fight the battle for gotham ’ s soul
Target: joker
Captured context words : ['joker', 'batman', 'fight', 'battle', 'gotham', '’']
Target: batman
Captured context words : ['joker', 'batman', 'fight', 'battle', 'gotham', '’', 's']

# i: 3
Sentence: like them , bruce wayne , an american billionaire be also from gotham city
Target: wayne
Captured context words : ['bruce', 'wayne', 'american', 'billionaire', 'be', 'gotham', 'city']

# i: 4
Sentence: be he hide behind the face of these two ?


In [16]:
# Display the set of all captured context words (targets included).
context_words

{'american',
 'anarchy',
 'batman',
 'battle',
 'be',
 'billionaire',
 'bruce',
 'city',
 'embody',
 'fight',
 'gotham',
 'idea',
 'joker',
 's',
 'superhero',
 'supervillain',
 'wayne',
 '’'}

### Zero out target-to-target matches (for example, batman–batman: 0)

In [17]:
# Remove every occurrence of a target from its own context list, so a target
# never counts as its own context word. The list is rebuilt with a filter
# instead of being popped while it is iterated: popping during iteration
# shifts the remaining items and skips the element after each removal, so
# adjacent repeats of the target would survive.
for target, context in target_v_context.items():
    # Slice assignment keeps the same list object in the dict.
    context[:] = [cw for cw in context if cw != target]

### Build the vectors

In [18]:
import collections

# One row per context word, with one zero-initialised column per feature.
features = targets + ['batman * wayne', 'joker * wayne']
vector = {cw: dict.fromkeys(features, 0) for cw in context_words}

# Fill each target column with how often the context word was captured for
# that target.
for target, captured in target_v_context.items():
    counts = collections.Counter(captured)
    for cw, row in vector.items():
        row[target] = counts[cw]

# Print every row for inspection.
for cw, row in vector.items():
    print(cw)
    print(row)
    print()

superhero
{'batman': 1, 'wayne': 0, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

billionaire
{'batman': 0, 'wayne': 1, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

idea
{'batman': 0, 'wayne': 0, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

battle
{'batman': 1, 'wayne': 0, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

’
{'batman': 1, 'wayne': 0, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

wayne
{'batman': 0, 'wayne': 0, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

joker
{'batman': 1, 'wayne': 0, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

be
{'batman': 1, 'wayne': 1, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

s
{'batman': 1, 'wayne': 0, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

batman
{'batman': 0, 'wayne': 0, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

american
{'batman': 1, 'wayne': 1, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

gotham
{'batman': 1, 'wayne': 1, 'joker': 1, 'batma

In [19]:
def batman_wayne(batman, wayne):
    """Return the product of the `batman` and `wayne` values."""
    product = batman * wayne
    return product

def joker_wayne(joker, wayne):
    """Return the product of the `joker` and `wayne` values."""
    product = joker * wayne
    return product

In [20]:
# Fill the two product columns of every row from its per-target counts.
for row in vector.values():
    row['batman * wayne'] = batman_wayne(row['batman'], row['wayne'])
    row['joker * wayne'] = joker_wayne(row['joker'], row['wayne'])

### Count vector dims

In [21]:
# Column totals: for every feature, add up its value over all context words.
dim_count = {
    feature: sum(row[feature] for row in vector.values())
    for feature in features
}

dim_count

{'batman': 9, 'wayne': 6, 'joker': 10, 'batman * wayne': 3, 'joker * wayne': 2}

### Compute Cosines

In [22]:
import math

def cosine(uv, u, v):
    """Return uv / (sqrt(u) * sqrt(v)).

    This is the cosine similarity of two vectors when `uv` is their dot
    product and `u` and `v` are their squared Euclidean norms.

    Args:
        uv: Numerator (dot product of the two vectors).
        u: Non-negative value whose square root is the first vector's norm.
        v: Non-negative value whose square root is the second vector's norm.

    Returns:
        The quotient as a float, or 0.0 when either `u` or `v` is 0 (a vector
        with no entries has no direction) instead of raising
        ZeroDivisionError.
    """
    denominator = math.sqrt(u) * math.sqrt(v)
    if denominator == 0:
        return 0.0
    return uv / denominator

In [23]:
# Similarity score between the 'batman' and 'wayne' columns, from the totals in dim_count.
# NOTE(review): cosine() takes the square root of its 2nd and 3rd arguments, so
# they act as squared norms; dim_count holds plain sums of counts, which equal
# sums of squares only while every count is 0 or 1 -- confirm this is intended.
w_bat = cosine(dim_count['batman * wayne'], dim_count['batman'], dim_count['wayne'])
w_bat

0.4082482904638631

In [24]:
# Similarity score between the 'joker' and 'wayne' columns, from the totals in dim_count.
w_joker = cosine(dim_count['joker * wayne'], dim_count['joker'], dim_count['wayne'])
w_joker

0.25819888974716115

## So, who's Wayne actually? 

In [25]:
# The target with the smaller distance (1 - similarity) to wayne wins; a tie
# goes to the Joker.
if (1 - w_bat) < (1 - w_joker):
    print('Wayne is Batman!')
else:
    print('Wayne is the Joker!')

Wayne is Batman!
