# Lab 11 - Distributional Semantics

## Task 2 

### Prepare Corpus

In [1]:
targets = ['batman', 'wayne', 'joker']

In [2]:
text = 'Batman is an American superhero. The secret identity of Batman is Bruce Wayne, an American billionaire from Gotham City. The Joker is a supervillain that embodies the ideas of anarchy and chaos. The Joker and Batman fight the battle for Gotham’s soul.'

In [3]:
corpus = text.lower()
corpus

'batman is an american superhero. the secret identity of batman is bruce wayne, an american billionaire from gotham city. the joker is a supervillain that embodies the ideas of anarchy and chaos. the joker and batman fight the battle for gotham’s soul.'

### Init NLTK

In [4]:
import nltk

try:
    nltk.download('wordnet')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
except:
    pass

[nltk_data] Downloading package wordnet to /Users/shawon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/shawon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shawon/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

### Lemmatize

In [6]:
lemmatizer = WordNetLemmatizer()

In [7]:
def lemmatize_sentence(sentence):
    tagged = pos_tag(word_tokenize(sentence))
    lemmas = []

    for word, tag in tagged:
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'n', 'v', 'r'] else None
        if not wntag:
            lemmas.append(word)
        else:
            lemmas.append(lemmatizer.lemmatize(word, wntag))

    return ' '.join(lemmas)

In [8]:
sentences = corpus.split('.')
sentences

['batman is an american superhero',
 ' the secret identity of batman is bruce wayne, an american billionaire from gotham city',
 ' the joker is a supervillain that embodies the ideas of anarchy and chaos',
 ' the joker and batman fight the battle for gotham’s soul',
 '']

In [9]:
lemmatized_sentences = []

for sentence in sentences:
    lemmatized_sentences.append(lemmatize_sentence(sentence))

In [10]:
lemmatized_sentences

['batman be an american superhero',
 'the secret identity of batman be bruce wayne , an american billionaire from gotham city',
 'the joker be a supervillain that embody the idea of anarchy and chaos',
 'the joker and batman fight the battle for gotham ’ s soul',
 '']

### Target by context matrix

In [11]:
target_v_context = {}
for target in targets:
    target_v_context[target] = []

target_v_context

{'batman': [], 'wayne': [], 'joker': []}

Create context words list

In [12]:
# build pos tag dictionary
tag_dict = {}
for sentence in lemmatized_sentences:
    tagged = pos_tag(word_tokenize(sentence))

    for word, tag in tagged:
        tag_dict[word] = tag

In [13]:
tag_dict

{'batman': 'NN',
 'be': 'VB',
 'an': 'DT',
 'american': 'JJ',
 'superhero': 'NN',
 'the': 'DT',
 'secret': 'JJ',
 'identity': 'NN',
 'of': 'IN',
 'bruce': 'VBN',
 'wayne': 'NN',
 ',': ',',
 'billionaire': 'NN',
 'from': 'IN',
 'gotham': 'JJ',
 'city': 'NN',
 'joker': 'NN',
 'a': 'DT',
 'supervillain': 'NN',
 'that': 'WDT',
 'embody': 'VBP',
 'idea': 'NN',
 'anarchy': 'NN',
 'and': 'CC',
 'chaos': 'NN',
 'fight': 'VBD',
 'battle': 'NN',
 'for': 'IN',
 '’': 'NNP',
 's': 'NN',
 'soul': 'NN'}

In [14]:
lemmatized_sentences

['batman be an american superhero',
 'the secret identity of batman be bruce wayne , an american billionaire from gotham city',
 'the joker be a supervillain that embody the idea of anarchy and chaos',
 'the joker and batman fight the battle for gotham ’ s soul',
 '']

In [15]:
def valid_context_word(word):
    return tag_dict[word].lower()[0] in ['j', 'n', 'v']

In [16]:
def process_token_subarray(sub, window_limit):
    ret = []
    i = 0
    for s in sub:
        if valid_context_word(s) and i <= window_limit:
            ret.append(s)
            i = i + 1
    return ret

In [17]:
def process_list(word_list):
    for word in word_list:
        context_words.add(word)

In [18]:
word_window = 3
context_words = set()

for sentence in lemmatized_sentences:
    if sentence != '':
        print('\n# i: {}\nSentence: {}'.format(lemmatized_sentences.index(sentence), sentence))

    tokens = sentence.split(' ')
    for token in tokens:
        if token in targets:
            print('Target: {}'.format(token))
            context_words.add(token)

            i = tokens.index(token)

            # split the list into left and right and then process them
            left = tokens[:i]
            right = tokens[i:]

            processed_left = process_token_subarray(left, word_window)
            processed_right = process_token_subarray(right, word_window)

            process_list(processed_left + processed_right)

            print('Captured context words : {}'.format(processed_left + processed_right))
            # update the vector
            target_v_context[token] = target_v_context[token] + processed_left + processed_right


# i: 0
Sentence: batman be an american superhero
Target: batman
Captured context words : ['batman', 'be', 'american', 'superhero']

# i: 1
Sentence: the secret identity of batman be bruce wayne , an american billionaire from gotham city
Target: batman
Captured context words : ['secret', 'identity', 'batman', 'be', 'bruce', 'wayne']
Target: wayne
Captured context words : ['secret', 'identity', 'batman', 'be', 'wayne', 'american', 'billionaire', 'gotham']

# i: 2
Sentence: the joker be a supervillain that embody the idea of anarchy and chaos
Target: joker
Captured context words : ['joker', 'be', 'supervillain', 'embody']

# i: 3
Sentence: the joker and batman fight the battle for gotham ’ s soul
Target: joker
Captured context words : ['joker', 'batman', 'fight', 'battle']
Target: batman
Captured context words : ['joker', 'batman', 'fight', 'battle', 'gotham']


In [19]:
context_words

{'american',
 'batman',
 'battle',
 'be',
 'billionaire',
 'bruce',
 'embody',
 'fight',
 'gotham',
 'identity',
 'joker',
 'secret',
 'superhero',
 'supervillain',
 'wayne'}

In [20]:
len(context_words)

15

In [21]:
target_v_context

{'batman': ['batman',
  'be',
  'american',
  'superhero',
  'secret',
  'identity',
  'batman',
  'be',
  'bruce',
  'wayne',
  'joker',
  'batman',
  'fight',
  'battle',
  'gotham'],
 'wayne': ['secret',
  'identity',
  'batman',
  'be',
  'wayne',
  'american',
  'billionaire',
  'gotham'],
 'joker': ['joker',
  'be',
  'supervillain',
  'embody',
  'joker',
  'batman',
  'fight',
  'battle']}

Set target to target matches as 0, for example `batman-batman` : 0

In [22]:
for target in target_v_context.keys():
    for cw in target_v_context[target]:
        if cw == target:
            i = target_v_context[target].index(cw)
            target_v_context[target].pop(i)

target_v_context

{'batman': ['be',
  'american',
  'superhero',
  'secret',
  'identity',
  'be',
  'bruce',
  'wayne',
  'joker',
  'fight',
  'battle',
  'gotham'],
 'wayne': ['secret',
  'identity',
  'batman',
  'be',
  'american',
  'billionaire',
  'gotham'],
 'joker': ['be', 'supervillain', 'embody', 'batman', 'fight', 'battle']}

Build the vector

In [23]:
import collections

vector = {}
features = targets + ['batman * wayne', 'joker * wayne']

for cw in context_words:
    vector[cw] = {}
    for f in features:
        vector[cw][f] = 0


for target in target_v_context.keys():
    c = collections.Counter(target_v_context[target])
    
    for v in vector.keys():
        vector[v][target] = c[v]

for v in vector.keys():
    print(v)
    print(vector[v])
    print()

american
{'batman': 1, 'wayne': 1, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

be
{'batman': 2, 'wayne': 1, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

identity
{'batman': 1, 'wayne': 1, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

bruce
{'batman': 1, 'wayne': 0, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

fight
{'batman': 1, 'wayne': 0, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

batman
{'batman': 0, 'wayne': 1, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

battle
{'batman': 1, 'wayne': 0, 'joker': 1, 'batman * wayne': 0, 'joker * wayne': 0}

gotham
{'batman': 1, 'wayne': 1, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

superhero
{'batman': 1, 'wayne': 0, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

billionaire
{'batman': 0, 'wayne': 1, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

joker
{'batman': 1, 'wayne': 0, 'joker': 0, 'batman * wayne': 0, 'joker * wayne': 0}

embody
{'batman': 0, 'wayne': 0, 'joke

In [24]:
def batman_wayne(batman, wayne):
    return batman * wayne

def joker_wayne(joker, wayne):
    return joker * wayne

In [25]:
for v in vector.keys():
    vector[v]['batman * wayne'] = batman_wayne(vector[v]['batman'], vector[v]['wayne'])
    vector[v]['joker * wayne'] = joker_wayne(vector[v]['joker'], vector[v]['wayne'])

Count vector dims

In [26]:
dim_count = {}

for f in features:
    dim_count[f] = 0

for v in vector.keys():
    for d in dim_count.keys():
        dim_count[d] = dim_count[d] + vector[v][d]

dim_count

{'batman': 12, 'wayne': 7, 'joker': 6, 'batman * wayne': 6, 'joker * wayne': 2}

Compute Cosines

In [27]:
import math

def cosine(uv, u, v):
    return uv / (math.sqrt(u) * math.sqrt(v))

wayne is batman

In [28]:
w_bat = cosine(dim_count['batman * wayne'], dim_count['batman'], dim_count['wayne'])
w_bat

0.6546536707079772

wayne is joker

In [29]:
w_joker = cosine(dim_count['joker * wayne'], dim_count['joker'], dim_count['wayne'])
w_joker

0.3086066999241838

So, who's Wayne actually?

The lesser the value of `1-cosine` is, the higher the similarity will be.

In [30]:
print('Wayne is Batman!') if (1 - w_bat) < (1 - w_joker) else print('Wayne is the Joker!')

Wayne is Batman!


## Task 3

In [31]:
!head 'Lab 11/GoogleNews-subset'

Batman	0.17089844	-0.15527344	-0.06591797	0.08886719	0.052978516	-0.048583984	0.19140625	0.08203125	0.20507812	-0.052490234	0.12695312	0.087402344	0.23535156	-0.10058594	-0.30078125	-0.006225586	-0.17480469	-0.18554688	-0.010192871	0.11425781	-0.024780273	0.012756348	0.265625	0.048095703	-0.31640625	-0.05859375	0.09814453	0.123046875	-0.0002975464	-0.18652344	0.10449219	-0.14257812	-0.052490234	0.14160156	0.072265625	0.29101562	0.14453125	0.24023438	-0.16894531	0.5546875	0.35742188	-0.18847656	0.103027344	0.11621094	-0.087402344	-0.18652344	-0.004119873	-0.15625	-0.018066406	0.19921875	-0.25585938	-0.013000488	-0.15332031	0.19335938	-0.040527344	0.075683594	0.12402344	-0.13378906	-0.16015625	-0.20507812	0.030029297	0.24316406	0.020141602	0.0043945312	-0.028076172	0.07373047	-0.5625	-0.13769531	-0.1484375	-0.061035156	0.0036315918	0.11621094	-0.12988281	-0.02355957	-0.30078125	-0.059814453	0.31640625	0.07080078	0.09082031	-0.27929688	-0.100097656	-0.13769531	-0.5546875	-0.48632812	-0.02

In [32]:
file_name = 'Lab 11/GoogleNews-subset'

"""
Batman : 0
Wayne: 1
Joker: 2
"""

vec = {}

with open(file_name, 'r') as f:
    for line in f.readlines():
        t = line.split()
        target = t[0]
        vec[target] = [ float(x) for x in t[1:] ]

vec

{'Batman': [0.17089844,
  -0.15527344,
  -0.06591797,
  0.08886719,
  0.052978516,
  -0.048583984,
  0.19140625,
  0.08203125,
  0.20507812,
  -0.052490234,
  0.12695312,
  0.087402344,
  0.23535156,
  -0.10058594,
  -0.30078125,
  -0.006225586,
  -0.17480469,
  -0.18554688,
  -0.010192871,
  0.11425781,
  -0.024780273,
  0.012756348,
  0.265625,
  0.048095703,
  -0.31640625,
  -0.05859375,
  0.09814453,
  0.123046875,
  -0.0002975464,
  -0.18652344,
  0.10449219,
  -0.14257812,
  -0.052490234,
  0.14160156,
  0.072265625,
  0.29101562,
  0.14453125,
  0.24023438,
  -0.16894531,
  0.5546875,
  0.35742188,
  -0.18847656,
  0.103027344,
  0.11621094,
  -0.087402344,
  -0.18652344,
  -0.004119873,
  -0.15625,
  -0.018066406,
  0.19921875,
  -0.25585938,
  -0.013000488,
  -0.15332031,
  0.19335938,
  -0.040527344,
  0.075683594,
  0.12402344,
  -0.13378906,
  -0.16015625,
  -0.20507812,
  0.030029297,
  0.24316406,
  0.020141602,
  0.0043945312,
  -0.028076172,
  0.07373047,
  -0.5625,
  -

In [33]:
vec.keys()

dict_keys(['Batman', 'Wayne', 'Joker'])

In [34]:
import numpy as np

batman = np.array(vec['Batman'])
wayne = np.array(vec['Wayne'])
joker = np.array(vec['Joker'])

In [35]:
def get_cosine_sim(u, v):
    prod = u.dot(v)
    sim = prod / np.linalg.norm(u) * np.linalg.norm(v) 

    return sim

In [36]:
get_cosine_sim(batman, wayne)

1.3051724299819152

In [37]:
get_cosine_sim(joker, wayne)

0.4830780820827692

Still holds!