In [1]:
import numpy as np
from gensim.models import KeyedVectors

### Part 1: Synonyms

In this section, your goal is to answer questions in the following format:
```
What is a synonym for warrior?
a) soldier
b) sailor
c) pirate
d) spy
```
You are given a word and a list of candidate choices as input. Your objective is to return the choice that you believe is the synonym. To accomplish this, you'll first implement two similarity metrics: Euclidean distance and cosine similarity. Then, you'll use these metrics to answer the multiple-choice questions.
Specifically, you will implement the following three functions:

1. `euclidean_distance()`: Calculate the Euclidean distance between two vectors.

1. `cosine_similarity()`: Calculate the cosine similarity between two vectors.

1. `find_synonym()`: Given a word, a list of four candidate choices, and a specified similarity metric, return the word that you think is the synonym. The function takes comparison_metric as a parameter: if its value is euc_dist, you'll use Euclidean distance as the similarity metric; if its value is cosine_sim, you'll use cosine similarity as the metric.

In [2]:
def load_synonym_qs(file_path):
    synonym_qs = []
    with open(file_path, 'r') as file:
        next(file)  # Skip the header line
        for line in file:
            parts = line.strip().split('\t')
            word = parts[0]
            choices = parts[1].split(',')
            true_answer = parts[2]
            synonym_qs.append({
                "word": word,
                "choices": choices,
                "true_answer": true_answer
            })
    return synonym_qs

In [3]:
embeddings = KeyedVectors.load_word2vec_format('./w2v-negative300.txt', binary=False)
embeddings.get_vector('warrior')

array([ 0.453125  ,  0.2421875 ,  0.10058594,  0.16992188,  0.24414062,
        0.41210938,  0.10400391, -0.16699219, -0.01623535,  0.03015137,
        0.07275391, -0.22265625, -0.02807617, -0.04443359, -0.03735352,
       -0.01434326, -0.21582031,  0.00872803, -0.0300293 , -0.31445312,
       -0.12695312,  0.03613281,  0.07470703, -0.13085938, -0.14550781,
        0.02563477, -0.28320312, -0.08984375,  0.12695312, -0.23046875,
        0.48828125,  0.14648438,  0.14648438,  0.34765625,  0.0402832 ,
        0.04907227,  0.13085938,  0.06591797, -0.14160156, -0.05957031,
        0.21386719, -0.18261719,  0.23632812,  0.15234375,  0.19140625,
       -0.17578125, -0.14746094, -0.08007812,  0.22167969, -0.12353516,
       -0.19140625,  0.24121094, -0.15722656, -0.05786133, -0.07177734,
        0.15820312, -0.07519531, -0.05371094, -0.06738281,  0.15722656,
       -0.31640625,  0.16699219, -0.17285156,  0.11523438, -0.06445312,
        0.00927734,  0.12255859,  0.00811768, -0.21777344,  0.21

In [4]:
def cosine_similarity(v1, v2):
    '''
    Calculates and returns the cosine similarity between vectors v1 and v2
    Arguments:
        v1 (np.array), v2 (np.array): vectors
    Returns:
        cosine_sim (float): the cosine similarity between v1, v2
    '''
    cosine_sim = 0
    cosine_sim = (v1 @ v2) / (np.sqrt(sum((v1**2))) * np.sqrt(sum(v2**2)))
    return cosine_sim

def euclidean_distance(v1, v2):
    '''
    Calculates and returns the euclidean distance between v1 and v2

    Arguments:
        v1 (np.array), v2 (np.array): vectors

    Returns:
        euclidean_dist (float): the euclidean distance between v1, v2
    '''
    euclidean_dist = 0
    euclidean_dist = np.sqrt(sum((v1-v2)**2))
    return euclidean_dist

def find_synonym(word, choices, embeddings, comparison_metric):
    scores = []
    word_vector = embeddings.get_vector(word)
    for choice in choices:
        choice_vector = embeddings.get_vector(choice)
        if comparison_metric == 'cosine_sim':
            score = cosine_similarity(word_vector, choice_vector)
        else:
            score = euclidean_distance(word_vector, choice_vector)
        scores.append(score)

    if comparison_metric == 'cosine_sim':
        answer_index = np.argmax(scores)
    else:
        answer_index = np.argmin(scores)

    return choices[answer_index]

class Part1_Runner():
    def __init__(self, find_synonym):
        self.find_synonym = find_synonym
        
        # load embeddings
        self.embeddings = KeyedVectors.load_word2vec_format('./w2v-negative300.txt', binary=False)

        # load questions
        self.synonym_qs = load_synonym_qs('./synonyms.csv')
    
    def evaluate(self, print_q=True):
        print('Part 1: Synonyms')
        print('-----------------')
        acc_euc_dist = self.get_synonym_acc('euc_dist', self.embeddings, self.synonym_qs, print_q)
        acc_cosine_sim = self.get_synonym_acc('cosine_sim', self.embeddings, self.synonym_qs, print_q)
        print('accuracy using euclidean distance: %.5f' % acc_euc_dist)
        print('accuracy using cosine similarity : %.5f' % acc_cosine_sim)
        return acc_euc_dist, acc_cosine_sim
    
    def get_synonym_acc(self, comparison_metric, embeddings, synonym_qs, print_q=False):
        '''
        Helper function to compute synonym answering accuracy
        '''
        if print_q:
            metric_str = 'cosine similarity' if comparison_metric == 'cosine_sim' else 'euclidean distance'
            print('Answering part 1 using %s as the comparison metric...' % metric_str)
        
        n_correct = 0
        for i, item in enumerate(synonym_qs):
            w = item['word']
            choices = item['choices']
            answer = item['true_answer']
            
            ans = self.find_synonym(w, choices, embeddings, comparison_metric)
            if ans == answer:
                n_correct += 1
            
            if print_q:
                print('%d. What is a synonym for %s?' % (i+1, w))
                print('Choices: %s' % ', '.join(choices))
                print('You answered: %s' % ans)
                print('Correct answer is: %s\n' % answer)
        acc = n_correct / len(synonym_qs)
        return acc

You should get an accuracy of 80% with euclidean distance and 90% with cosine distance.

In [5]:
part1 = Part1_Runner(find_synonym)
part1.evaluate(True)  # To only print the scores, pass in False as an argument

Part 1: Synonyms
-----------------
Answering part 1 using euclidean distance as the comparison metric...
1. What is a synonym for gullible?
Choices: unrealistic, naive, complicated, wary
You answered: naive
Correct answer is: naive

2. What is a synonym for counter?
Choices: parry, agree, hold, run
You answered: run
Correct answer is: parry

3. What is a synonym for feeble?
Choices: reinforced, weak, damage, break
You answered: weak
Correct answer is: weak

4. What is a synonym for administer?
Choices: give, steal, spray, box
You answered: give
Correct answer is: give

5. What is a synonym for betray?
Choices: trust, inform, table, deceive
You answered: trust
Correct answer is: deceive

6. What is a synonym for scour?
Choices: search, allow, gaze, gather
You answered: search
Correct answer is: search

7. What is a synonym for clean?
Choices: bare, tidy, rummage, pop
You answered: tidy
Correct answer is: tidy

8. What is a synonym for abscond?
Choices: escape, rally, relinquish, flash
Y

(0.8, 0.9)

### Part 2: Analogies
In this section, your goal is to answer questions of the form:
```
man is to king as woman is to ?  
  a) princess  
  b) queen  
  c) wife  
  d) ruler
```
In other words, you are trying to find the word `bb` that completes the analogy **a:b → aa:bb**. You will take three words, `a`, `b`, `aa`, and a list of candidate choices as input and return the choice that you think best completes the analogy.

One of the fascinating properties of word embeddings is their ability to capture relational meanings. In fact, for the analogy **man:king → woman:queen** mentioned above, we have the following relationship:
`vector('king')` - `vector('man')` + `vector('woman')`.


This resulting vector is close to `vector('queen')`. When completing these analogies, make sure to follow the same logical order as the example above to align with the test set. 
Specifically, you will implement the following function:

`find_analogy_word()`: Given words `a`, `b`, and `aa`, find the best word in a list of candidate choices that completes the analogy (using cosine similarity as your similarity metric).

In [6]:
def load_analogy_qs(file_path):
    analogy_qs = []
    with open(file_path, 'r') as file:
        next(file)  # Skip the header line
        for line in file:
            parts = line.strip().split('\t')
            analogy = parts[0].split(',')
            choices = parts[1].split(',')
            analogy_qs.append({
                "analogy": analogy,
                "choices": choices
            })
    return analogy_qs

analogy_qs = load_analogy_qs('./analogies.csv')

In [7]:
def find_analogy_word(a, b, aa, choices, embeddings):
    '''
    Find the word bb that completes the analogy: a:b -> aa:bb
    A classic example would be: man:king -> woman:queen

    Note: use cosine similarity as your similarity metric

    Arguments:
        a, b, aa (str): words in the analogy described above
        choices (List[str]): list of strings for possible answer
        embeddings (Dict[str, np.array]): map of words to their embeddings

    Returns:
        answer: the word bb that completes the analogy
    '''
    a_vec = embeddings.get_vector(a)
    b_vec = embeddings.get_vector(b)
    aa_vec = embeddings.get_vector(aa)
    analogy_vec = b_vec - a_vec + aa_vec

    max_similarity = 0
    answer = None
    
    for choice in choices:
        choice_vec = embeddings.get_vector(choice)
        similarity = cosine_similarity(analogy_vec, choice_vec)
        if similarity > max_similarity:
            max_similarity = similarity
            answer = choice
    
    return answer

In [8]:
class Part2_Runner():
    def __init__(self, find_analogy_word):        
        self.find_analogy_word = find_analogy_word

        # load embeddings
        self.embeddings = KeyedVectors.load_word2vec_format("./w2v-negative300.txt", binary=False)

        # load questions
        self.analogy_qs = load_analogy_qs('./analogies.csv')

    def evaluate(self, print_q=False):
        '''
        Calculates accuracy on part 2.
        '''
        print ('Part 2: Analogies')
        print ('------------------')

        n_correct = 0
        for i, item in enumerate(self.analogy_qs):
            a, b, aa, true_bb = item['analogy']
            choices = item['choices']
            ans = self.find_analogy_word(a, b, aa, choices, self.embeddings)
            if ans == true_bb: n_correct += 1

            if print_q:
                print ('%d. %s is to %s as %s is to ___?' % (i+1, a, b, aa))
                print ('    a) %s\n    b) %s\n    c) %s\n    d) %s' % tuple(choices))
                print ('You answered: %s' % ans)
                print('Correct answer is: %s\n' % true_bb)

        acc = n_correct / len(self.analogy_qs)
        print ('accuracy: %.5f' % acc)
        print (' ')
        return acc


You should get an accuracy of 72%.

In [9]:
part2 = Part2_Runner(find_analogy_word)
part2.evaluate(True)  # To only print the scores, pass in False as an argument

Part 2: Analogies
------------------
1. king is to queen as man is to ___?
    a) wife
    b) woman
    c) head
    d) ruler
You answered: woman
Correct answer is: woman

2. dog is to puppy as cat is to ___?
    a) kitten
    b) puppy
    c) feline
    d) mouse
You answered: puppy
Correct answer is: kitten

3. father is to son as mother is to ___?
    a) girl
    b) wife
    c) daughter
    d) queen
You answered: daughter
Correct answer is: daughter

4. listen is to hear as look is to ___?
    a) taste
    b) see
    c) feel
    d) think
You answered: see
Correct answer is: see

5. doctor is to hospital as lawyer is to ___?
    a) court
    b) restaurant
    c) museum
    d) library
You answered: court
Correct answer is: court

6. good is to great as bad is to ___?
    a) better
    b) worse
    c) okay
    d) sad
You answered: sad
Correct answer is: worse

7. stove is to kitchen as tub is to ___?
    a) closet
    b) bedroom
    c) bathroom
    d) pantry
You answered: bathroom
Correct

0.72

### Part 3: Sentence Similarity

For this section, your goal is to answer questions of the form:
```
True/False: the following two sentences are semantically similar:
  1. he later learned that the incident was caused by the concorde's sonic boom
  2. he later found out the alarming incident had been caused by concorde's powerful sonic boom
```
Take in 2 sentences as input, and output either true or false. To do this, create a sentence embedding that represents each sentence in vector form, then apply cosine similarity to compute the similarity between the two sentence embeddings. If they have a high enough similarity, output "True" and otherwise "False".

To accomplish this, first turn each sentence into a single vector embedding. There are a few different this can be done and for this assignment, we will consider two approaches:

1. **Simple sum**: Sum the word embeddings of each individual word in the sentence. This resulting vector is the sentence embedding vector.
1. **Sum with POS weighting**: Take a weighted sum of the individual word vectors, where the weighting depends on the part of speech (POS) of that given word. Each POS (verb, noun, adjective, etc) has a different scalar weight associated with it. We multiply each word vector by the scalar weight associated with its part of speech, then sum these weighted vectors.

Implement the following 2 functions:

1. `get_embedding()`: given a sentence (string), return the sentence embedding (vector). The function also takes in the parameter `use_POS`:
if `use_POS` is false (regular case), use sum of the word embeddings for each word in the sentence (ignoring words that do not appear in our vocabulary).
if use_POS is true, use a weighted sum, where each word is weighted by a scalar that depends on its part of speech tag.
1. `get_similarity()`: given two sentences, find the cosine similarity between their corresponding sentence embeddings.

Helpful hints:

We’ve given you a map `POS_weights` that maps part of speech tags to their associated weight. For example, `POS_weights['NN'] = 0.8` (where NN is the POS tag for noun).

**You should skip words that are not in the embeddings or have a POS tag that is not in POS_weights.**

To get a list of all the words in the sentence, use nltk's word_tokenize function.

```
>>> sentence = "this is a sentence"
>>> word_tokens = word_tokenize(sentence)
>>> word_tokens
['this', 'is', 'a', 'sentence']
```

To get the POS tags for each word in a sentence, you can use `nltk.pos_tag`. To use it, you provide a list of words in a sentence, and it returns a list of tuples, where the first element is the word and the second is its corresponding POS tag. 

Make sure that you pass in the entire sentence to a single call to `nltk.pos_tag`; do not call `nltk.pos_tag` separately on each word in the sentence. This is because some words can be multiple parts of speech (for example, "back" can be a noun or a verb). Passing in the entire sentence allows for more context to figure out what POS tag a word should have.

```
>>> tagged_words = nltk.pos_tag(word_tokens)
>>> tagged_words
[('this', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sentence', 'NN')]`
```

In [10]:
# You will use nltk for tokenizing and tagging
import nltk
from nltk.tokenize import word_tokenize

In [11]:
# Run this cell to download the nltk tagger
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\royef\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\royef\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
def load_sentence_sim_qs(filename):
    '''
    input line:
        label   s1  s2
    
    returns list of tuples, each of the form:
        (label, s1, s2)
    '''
    samples = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            label_str, s1, s2 = line.split('\t')
            label = int(label_str)
            samples.append((label, s1.strip(), s2.strip()))
    return samples

In [13]:
def get_embedding(s, embeddings, use_POS=False, POS_weights=None):
    '''
    Returns vector embedding for a given sentence.

    Arguments:
        s (str): sentence
        embeddings (Dict[str, np.array]): map of words (strings) to their embeddings (np.array)
        use_POS (bool): flag indicating whether to use POS weightings when
            calculating the sentence embedding
        POS_weights (Dict[str, float]): map of part of speech tags (strings) to their weights (floats),
            it is only to be used if the use_POS flag is true

    Returns:
        embed (np.array): vector embedding of sentence s
    '''
    embed = np.zeros(embeddings.vector_size)
    tokens = word_tokenize(s)
    
    if use_POS:
        tagged_tokens = nltk.pos_tag(tokens)
        for word, pos in tagged_tokens:
            if word in embeddings:
                pos_tag = pos[:2]  # Use only the first two characters of the POS tag
                weight = POS_weights.get(pos_tag, 1.0)
                embed += weight * embeddings[word]
    else:
        for word in tokens:
            if word in embeddings:
                embed += embeddings[word]
    
    return embed

def get_similarity(s1, s2, embeddings, use_POS, POS_weights=None):
    '''
    Given 2 sentences and the embeddings dictionary, convert the sentences
    into sentence embeddings and return the cosine similarity between them.

    Arguments:
        s1, s2 (str): sentences
        embeddings (Dict[str, np.array]): map of words (strings) to their embeddings (np.array)
        use_POS (bool): flag indicating whether to use POS weightings when
            calculating the sentence embedding
        POS_weights (Dict[str, float]): map of part of speech tags (strings) to their weights (floats),
            it is only to be used if the use_POS flag is true

    Returns:
        similarity (float): cosine similarity of the two sentence embeddings
    '''
    similarity = 0
    embed1 = get_embedding(s1, embeddings, use_POS, POS_weights)
    embed2 = get_embedding(s2, embeddings, use_POS, POS_weights)
    
    similarity = cosine_similarity(embed1, embed2)
    
    return similarity

In [14]:
class Part3_Runner():
    def __init__(self, get_similarity):        
        self.get_similarity = get_similarity

        # load embeddings
        self.embeddings = KeyedVectors.load_word2vec_format("./w2v-negative300.txt", binary=False)

        # load questions
        self.sentence_sim_qs = load_sentence_sim_qs('./sentences.csv')

    def evaluate(self, print_q=False):
        '''
        Calculates accuracy of part 3.
        '''
        print ('Part 3: Sentence similarity!')
        print ('----------------------------')

        acc_base = self.get_sentence_sim_accuracy(self.embeddings, self.sentence_sim_qs, use_POS=False, print_q=print_q)
        acc_POS = self.get_sentence_sim_accuracy(self.embeddings, self.sentence_sim_qs, use_POS=True, print_q=print_q)

        print ('accuracy (regular): %.5f' % acc_base)
        print ('accuracy with POS weighting: %.5f' % acc_POS)
        print (' ')
        return acc_base, acc_POS

    def get_sentence_sim_accuracy(self, embeddings, sentence_sim_qs, use_POS, print_q=False):
        '''
        Helper function to compute sentence similarity classification accuracy.
        '''
        THRESHOLD = 0.95
        POS_weights = self.load_pos_weights_map() if use_POS else None
        if print_q:
            type_str = 'with POS weighting' if use_POS else 'regular'
            print ('Answering part 3 (%s)...' % type_str)

        n_correct = 0
        for i, (label, s1, s2) in enumerate(sentence_sim_qs):
            sim = self.get_similarity(s1, s2, embeddings, use_POS, POS_weights)
            pred = 1 if sim > THRESHOLD else 0
            if pred == label: n_correct += 1

            if print_q:
                print ('%d. True/False: the following two sentences are semantically similar:' % (i+1))
                print ('     1. %s' % s1)
                print ('     2. %s' % s2)
                print ('You answered: %r' % (True if pred == 1 else False))
                print ('Correct Answer: %r\n' % (True if label == 1 else False))
                
        acc = n_correct / len(sentence_sim_qs)
        return acc

    def load_pos_weights_map(self):
        '''
        Helper that loads the POS tag weights for part 3
        '''
        d = {}
        with open("./pos_weights.txt") as f:
            for line in f:
                pos, weight = line.split()
                d[pos] = float(weight)
        return d

You should get an accuracy of 63% without POS weights, and 63.5% with.

In [15]:
part3 = Part3_Runner(get_similarity)
part3.evaluate(True) # To only print the scores, pass in False as an argument

Part 3: Sentence similarity!
----------------------------
Answering part 3 (regular)...
1. True/False: the following two sentences are semantically similar:
     1. one woman is measuring another woman's ankle.
     2. a woman measures another woman's ankle.
You answered: False
Correct Answer: True

2. True/False: the following two sentences are semantically similar:
     1. a man is cutting an onion.
     2. a man cuts an onion.
You answered: False
Correct Answer: True

3. True/False: the following two sentences are semantically similar:
     1. a young woman is putting stickers all over her face.
     2. a woman is applying stickers to her face.
You answered: False
Correct Answer: True

4. True/False: the following two sentences are semantically similar:
     1. a woman is dancing in the rain.
     2. a woman dances in the rain out side.
You answered: False
Correct Answer: True

5. True/False: the following two sentences are semantically similar:
     1. the man is kissing and huggin

73. True/False: the following two sentences are semantically similar:
     1. i think it's fine to ask this question.
     2. i think it is okay to ask the question.
You answered: True
Correct Answer: True

74. True/False: the following two sentences are semantically similar:
     1. there's not a lot you can do about that.
     2. i'm afraid there's not really a lot you can do.
You answered: False
Correct Answer: True

75. True/False: the following two sentences are semantically similar:
     1. you answered your own question.
     2. you've answered your own question already.
You answered: False
Correct Answer: True

76. True/False: the following two sentences are semantically similar:
     1. can you do this?
     2. can you do it?
You answered: True
Correct Answer: True

77. True/False: the following two sentences are semantically similar:
     1. how do you do that?
     2. how to do that?
You answered: False
Correct Answer: True

78. True/False: the following two sentences are se

(0.63, 0.64)