### Download the Required Resources


##### Import the libraries

In [96]:
import gensim.downloader as g1
from transformers import BertModel, BertTokenizer 
import torch
import numpy as np
from nltk.corpus import wordnet as wn
import random
import pandas as pd


#### Practice the APis

##### Practice Word2Vec

In [13]:
w2vModel = g1.load("word2vec-google-news-300");

In [11]:
sim = w2vModel.most_similar('board')
vec1 = w2vModel.get_vector("board") 
vec2 = w2vModel.get_vector("committee") 
print(sim)
print(vec1)
print(vec2)

[('Board', 0.673071563243866), ('directors', 0.6475343704223633), ('trustees', 0.6403145790100098), ('baord', 0.5922820568084717), ('Trustees', 0.5866842269897461), ('Governing_Board', 0.5753400325775146), ('Theresa_Colaizzi', 0.5700287818908691), ('Jane_Gallucci', 0.5602066516876221), ('boards', 0.5594165325164795), ('Pat_Deutschman', 0.5546656250953674)]
[-0.14453125 -0.25976562 -0.01611328 -0.01074219 -0.01281738 -0.34765625
  0.10839844  0.00340271  0.07080078  0.04199219  0.0456543  -0.14160156
 -0.03808594 -0.19335938 -0.30273438  0.09619141  0.0703125  -0.11425781
 -0.02709961  0.01306152 -0.09863281  0.22070312  0.00118256  0.1328125
  0.02783203  0.14453125 -0.21386719  0.30664062 -0.20117188 -0.29101562
  0.07080078 -0.07861328 -0.07958984 -0.06738281  0.17675781 -0.23730469
  0.171875    0.31445312  0.13378906 -0.12109375 -0.09423828  0.13671875
  0.0390625  -0.09619141  0.07666016 -0.12695312  0.19140625 -0.04907227
  0.04589844  0.21679688 -0.00778198  0.08886719  0.055664

##### Practice Bert

In [4]:
# Load pre-trained BERT model and tokenizer 
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [5]:
def pre_process_sentence(sentences: list):
    inputs = tokenizer(sentences, padding=True, return_tensors="pt")
    return inputs

In [6]:
def get_bert_embeddings(sentences, model):
    """
    Get BERT embeddings for a list of sentences.
    
    Args:
    sentences (list): A list of strings, each representing a sentence.
    model_name (str): The name of the pre-trained BERT model to use.
    
    Returns:
    numpy.ndarray: A 3D array of shape (num_sentences, max_sentence_length, embedding_size)
                   containing the embeddings for each token in each sentence.
    """
   
    # Tokenize the input sentences and convert to PyTorch tensors
    inputs = pre_process_sentence(sentences)
    
    # Forward pass through the BERT model to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the hidden states (embeddings) of the last layer
    last_hidden_states = outputs.last_hidden_state
    
    # Convert tensor to numpy array
    embeddings = last_hidden_states.numpy()
    
    return embeddings

In [7]:
# Check an example with two sentences
# Example usage:
sentences = ["This dog is big", "This dog is lovely"]
embeddings = get_bert_embeddings(sentences, model)
print(embeddings.shape)  # Should print (2, max_sentence_length_of_the_longest_sent, 768)

(2, 6, 768)


In [8]:
# Print each vector for each token in each sentence
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")
    for j, token in enumerate(tokenizer.tokenize(sentence)):
        print(f"  Token {j+1}: {token}")
        print(f"  Vector: {embeddings[i][j]}")
    print("-----")

Sentence 1: This dog is big
  Token 1: this
  Vector: [-1.22677162e-01  1.71388537e-01  1.38660222e-01 -1.25295550e-01
 -1.23378061e-01 -3.15118462e-01  3.81148607e-01  3.66328418e-01
 -1.53208971e-01 -2.16480389e-01  2.66019776e-02 -8.57386291e-02
 -1.29341215e-01  1.15167096e-01  1.53766591e-02  6.82867318e-02
 -1.28409401e-01  3.77842188e-01  1.31885767e-01  1.34913558e-02
 -1.12264499e-01 -2.79022753e-01  1.36035625e-02 -2.86049068e-01
  7.36279879e-04  8.30132291e-02 -5.26445471e-02 -1.37320757e-01
  1.02473967e-01  1.85041785e-01  2.88437396e-01  8.71096849e-02
 -3.10474727e-02  1.65917277e-01  5.35704121e-02 -1.11818470e-01
  1.77219182e-01  2.41990462e-02  3.89830582e-03  1.45159423e-01
  2.20687799e-02  1.54288813e-01  3.11056860e-02  6.85374960e-02
 -2.19790176e-01 -1.80895552e-01 -2.30779195e+00 -8.68935883e-02
  5.66399544e-02 -3.27564716e-01  1.84363574e-01 -2.74686664e-02
  1.11568775e-02  3.47044647e-01  1.00632526e-01  1.01709679e-01
 -2.12604910e-01  5.01872480e-01 -3.

### Part 1
One “objective” way of analyzing the similarities generated from a word embedding is to use existing human encoded knowledge. To this end, WordNet serves as a possible source for comparison.
For instance, one can pick a synset S, and look at all the words that are associated with S, and calculate the average similarity (of their corresponding vectors) between pairs of such words. Ideally these values should be high, as these words are supposedly “similar” (have a sense that is “the same”).
Once can also argue that if two synsets S1, S2 are “far apart” (e.g. having low path similarity), then if we pick a word associated with S1, and another associated with S2, their corresponding vectors’ similarity should be low.
To see if that is true, you should implement the following function and put it in “hw2.py” (when I say similarity below, I mean the (cosine)-similarity between the vectors (in the embedding) corresponds to the words.

#### Synonym Set Similarities Implementation
Where model is the model that is loaded from Word2Vec, and sset is a synset from
WordNet
- What the function returns depends on how many words are associated with sset
- If sset has only one word, you should return an empty list
- If sset has two words, you should return a list of one number, which is the similarity of the two vectors in the model corresponding to the two words
- If sset has three or more words, then you should return a list of 4 numbers: [avg, sd, min, max]
- avg is the average similarity between words in sset
- sd is the standard deviation of the similarities between words in sset
- min, max are the minimum and maximum similarity respectively


In [101]:
def synsetSimValue(model_wv, words):
    """
    Calculate similarity statistics for a list of words using pre-trained word vectors.

    Args:
    model_wv (gensim.models.keyedvectors.KeyedVectors): The word vectors from a Word2Vec model.
    words (list): A list of words (strings).

    Returns:
    list: A list containing similarity statistics (average, standard deviation, minimum, maximum).
    """

    # Check if the input is a list, if not but a string of len 1 convert it to list one word
    if not isinstance(words, list):
        if isinstance(words, str):
            words = [words]
        else:
            raise ValueError("The input must be a list of words or a single word.")
        
     # Filter words that are not in the model's vocabulary
    words = [word for word in words if word in model.key_to_index]
    # print(f"{len(words)} words out of {len(words)} are in the vocabulary.")

    # Check the number of words in the synset
    num_words = len(words)

    if num_words == 1:
        return []
    elif num_words == 2:
        return [model.similarity(words[0], words[1])]
    else:
        # Calculate the similarity between all pairs of words in the synset
        sim_values = []
        for i in range(num_words):
            for j in range(i+1, num_words):
                sim_values.append(model_wv.similarity(words[i], words[j]))

        # Calculate and return the statistics
         # Calculate statistics
        avg = np.mean(sim_values)
        sd = np.std(sim_values)
        min_sim = np.min(sim_values)
        max_sim = np.max(sim_values)

        return [avg, sd, min_sim, max_sim]

##### Test synsetsim Function.

In [42]:
def test_with_one_word(model, word):
    ''' Test with one word'''
    print("Test with 1 word, expect []")
    try:
        assert synsetSimValue(model, word) == []
        print("Pass")
        print(f"function returned {synsetSimValue(model, word)}")
    except AssertionError as e:
        print(f"Failed with error {e}")

def test_with_two_words(model, word1, word2):
    ''' 
    If sset has two words, you should return a list of one number, which is the similarity of the two vectors in the model corresponding to the two words
    '''
    print("Test with 2 words, expect [number]")
    try:
        assert len(synsetSimValue(model, [word1, word2])) == 1
        print("Passed........\n")
        print(f"similarity between {word1} and {word2} is {synsetSimValue(model, [word1, word2])}")
    except AssertionError as e:
        print(f"Failed with error {e}")

def test_with_more_than_two_words(model, words):
    """
    Test the synsetSimValue function with a list of more than two words.

    Args:
    model: The Word2Vec model.
    words (list): A list of words.

    Prints the result of the test and the statistics if the test passes.
    """
    print("Test with more than 2 words, expect [average, standard deviation, minimum, maximum]")
    stats = synsetSimValue(model, words)
    try:
        assert len(stats) == 4
        print("Passed, printing statistics........\n")
        avg, sd, min_sim, max_sim = stats
        print(f"Average: {avg:.4f}")
        print(f"Standard deviation: {sd:.4f}")
        print(f"Minimum: {min_sim:.4f}")
        print(f"Maximum: {max_sim:.4f}")
    except AssertionError:
        print("Failed")


In [36]:
model = w2vModel
# Example usage
test_words = ['dog', 'canine', 'puppy']

In [37]:
print("Test with one word")
test_with_one_word(model, test_words[0])

Test with one word
Test with 1 word, expect []
1 words out of 1 are in the vocabulary.
Pass
1 words out of 1 are in the vocabulary.
function returned []


In [43]:
print("Test with two words")
test_with_two_words(model, test_words[0], test_words[1])

Test with two words
Test with 2 words, expect [number]
2 words out of 2 are in the vocabulary.
Passed........

2 words out of 2 are in the vocabulary.
similarity between dog and canine is [0.69182897]


In [44]:
print("Test with more than two words")
test_with_more_than_two_words(model, test_words)

Test with more than two words
Test with more than 2 words, expect [average, standard deviation, minimum, maximum]
3 words out of 3 are in the vocabulary.
Passed, printing statistics........

Average: 0.6843
Standard deviation: 0.1064
Minimum: 0.5503
Maximum: 0.8106


#### Cross Synonym Set Implementation
- Where model is the model that is loaded from Word2Vec, and sset1, sset2 are synset
sfrom WordNet
- What the function returns depends on how many words are associated with sset1 and
sset2
- If both sset1 and sset2 has one word, you should return a list of one number,
which is the similarity of the two vectors in the model corresponding to the two
words
- Otherwise you should return a list of 4 numbers: [avg, sd, min, max]
- avg is the average similarity between pair of words, one from sset1 and the other from sset2
- sd is the standard deviation of the similarities described above
- min, max are the minimum and maximum similarity respectively as
described above.

In [102]:
def crossSynsetSimValue(model, words1, words2):
    """
    Calculate similarity statistics for pairs of words from two different sets using a Word2Vec model.

    Args:
    model (gensim.models.Word2Vec): The Word2Vec model.
    words1 (list): A list of words from the first set.
    words2 (list): A list of words from the second set.

    Returns:
    list: A list containing similarity statistics (average, standard deviation, minimum, maximum).
          If both sets have only one word, returns a list with the similarity between the two words.
    """
    # Filter words that are not in the model's vocabularry
    words1 = [word for word in words1 if word in model.key_to_index]
    words2 = [word for word in words2 if word in model.key_to_index]
    # print(f"{len(words1)} words out of {len(words1)} are in the vocabulary.")
    # print(f"{len(words2)} words out of {len(words2)} are in the vocabulary.")

    # return similarity between two words if both sets have only one word
    if len(words1) == 1 and len(words2) == 1:
        return [model.similarity(words1[0], words2[0])]
    else:
        # Calculate similarities between all pairs of words, one from each set
        similarities = []
        for word1 in words1:
            for word2 in words2:
                similarities.append(model.similarity(word1, word2))

        # Calculate statistics
        avg = np.mean(similarities)
        sd = np.std(similarities)
        min_sim = np.min(similarities)
        max_sim = np.max(similarities)

        return [avg, sd, min_sim, max_sim]

##### Test Cross Synset Sample Sentences

In [53]:
def test_crossSyn_with_one_word(model, sset1, sset2):
    ''' Test with one word'''
    print("Test with 1 word, expect []")
    sim = crossSynsetSimValue(model, sset1, sset2)
    try:
        assert len(sim) == 1
        print("Pass")
        print(f"function returned simarilary between {sset1[0]} and {sset2[0]} as: {sim}")
    except AssertionError as e:
        print(f"Failed with error {e}")

def test_crossSyn_statistic(model, sset1, sset2):
    ''' Test with more than 1 words in each set'''
    print("Test with more than 2 words, expect [average, standard deviation, minimum, maximum]")
    stats = crossSynsetSimValue(model, sset1, sset2)
    try:
        assert len(stats) == 4
        print("Passed, printing statistics........\n")
        avg, sd, min_sim, max_sim = stats
        print(f"Average: {avg:.4f}")
        print(f"Standard deviation: {sd:.4f}")
        print(f"Minimum: {min_sim:.4f}")
        print(f"Maximum: {max_sim:.4f}")
    except AssertionError as e:
        print(f"Failed with error {e}")

In [74]:
sset1 = ['dog', 'cat', 'puppy']
sset2 = ['apple', 'orange', 'tomato']

In [76]:
print("Test with one word")
test_crossSyn_with_one_word(model, ['cat'], ['dog'])

Test with one word
Test with 1 word, expect []
1 words out of 1 are in the vocabulary.
1 words out of 1 are in the vocabulary.
Pass
function returned simarilary between cat and dog as: [0.76094574]


In [77]:
print("Test with more than 2 words")
test_crossSyn_statistic(model, sset1, sset2)

Test with more than 2 words
Test with more than 2 words, expect [average, standard deviation, minimum, maximum]
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
Passed, printing statistics........

Average: 0.1773
Standard deviation: 0.0520
Minimum: 0.0791
Maximum: 0.2515


#### Pick Synset from NLTK WordNet API
Pick 32 synsets from the noun hypernyum-hyponym tree, eight of them from level 4 (assume the root is level-0), eight from level 6, eight from level 8, and eight from level 10. You can either do this manually, or you can write a program using nltk’s wordnet api to get those. For synsets


This code defines a function get_synsets_at_level that retrieves a specified number of synsets at a given level in the noun hypernym-hyponym tree. It then uses this function to get 8 synsets each at levels 4, 6, 8, and 10, and combines them into a list of 32 synsets. Finally, it prints out the selected synsets along with their definitions.

In [91]:
def get_synsets_at_level(model_wv, level, num_synsets, min_words=3):
    """
    Get a specified number of synsets at a given level in the noun hypernym-hyponym tree,
    ensuring that the synsets contain words present in the Word2Vec model's vocabulary.

    Args:
    model_wv (gensim.models.keyedvectors.KeyedVectors): The word vectors from a Word2Vec model.
    level (int): The level in the tree (0 is the root).
    num_synsets (int): The number of synsets to retrieve at the given level.
    min_words (int): Minimum number of words in each synset that must be in the model's vocabulary.

    Returns:
    list: A list of synsets at the specified level.
    """
    synsets = []
    current_level_synsets = [wn.synset('entity.n.01')]  # Start with the root

    # Traverse the tree to the specified level
    for _ in range(level):
        next_level_synsets = []
        for s in current_level_synsets:
            next_level_synsets.extend(s.hyponyms())
        current_level_synsets = next_level_synsets

    # Filter synsets that have at least 'min_words' words in the model's vocabulary
    current_level_synsets = [
        s for s in current_level_synsets
        if len([word for word in get_words_from_synset(s) if word in model_wv.key_to_index]) >= min_words
    ]

    # Shuffle the synsets to get a random sample
    random.shuffle(current_level_synsets)

    # Select the first 'num_synsets' synsets, ensuring they don't share the same parent/grandparent
    selected_synsets = []
    for s in current_level_synsets:
        if len(selected_synsets) >= num_synsets:
            break
        if not any(s.lowest_common_hypernyms(ss)[0] in [ss, s] for ss in selected_synsets):
            selected_synsets.append(s)

    return selected_synsets


##### Download Wordnet

In [83]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tango.tew/nltk_data...


True

##### Test Synset picks

In [92]:
# Get synsets at different levels
level_4_synsets = get_synsets_at_level(model, 4, 8)  # Get 8 synsets at level 4 (less specific than level 6)
print("Level 4 Synsets:")
for i, sset in enumerate(level_4_synsets):
    print(f"  Synset {i + 1}: {sset} - {sset.definition()}")

level_6_synsets = get_synsets_at_level(model, 6, 8)  # Get 8 synsets at level 6 (less specific than level 8)
print("\nLevel 6 Synsets:")
for i, sset in enumerate(level_6_synsets):
    print(f"  Synset {i + 1}: {sset} - {sset.definition()}")

level_8_synsets = get_synsets_at_level(model, 8, 8)  # Get 8 synsets at level 8 (less specific than level 10)
print("\nLevel 8 Synsets:")
for i, sset in enumerate(level_8_synsets):
    print(f"  Synset {i + 1}: {sset} - {sset.definition()}")

level_10_synsets = get_synsets_at_level(model, 10, 8)  # Get 8 synsets at level 10
print("\nLevel 10 Synsets:")
for i, sset in enumerate(level_10_synsets):
    print(f"  Synset {i + 1}: {sset} - {sset.definition()}")


Level 4 Synsets:
  Synset 1: Synset('negativity.n.02') - characterized by habitual skepticism and a disagreeable tendency to deny or oppose or resist suggestions or commands
  Synset 2: Synset('adversary.n.01') - someone who offers opposition
  Synset 3: Synset('guidance.n.01') - something that provides direction or advice as to a decision or course of action
  Synset 4: Synset('juju.n.02') - a charm superstitiously believed to embody magical powers
  Synset 5: Synset('eloquence.n.01') - powerful and effective language
  Synset 6: Synset('curdling.n.01') - the process of forming semisolid lumps in a liquid
  Synset 7: Synset('being.n.01') - the state or fact of existing
  Synset 8: Synset('column.n.04') - anything that approximates the shape of a column or tower

Level 6 Synsets:
  Synset 1: Synset('insulation.n.01') - the state of being isolated or detached
  Synset 2: Synset('dunce.n.01') - a stupid person; these words are used to express a low opinion of someone's intelligence
  Syn

##### 2. For each synset, pass it to synsetSimValue(model, sset) to collect the results.

In [95]:
# Define a function to extract words from a synset
def get_words_from_synset(synset):
    return [lemma.name() for lemma in synset.lemmas()]

# Collect results for each synset
results = {}
for level, synsets in zip([4, 6, 8, 10], [level_4_synsets, level_6_synsets, level_8_synsets, level_10_synsets]):
    results[level] = []
    for synset in synsets:
        words = get_words_from_synset(synset)
        sim_values = synsetSimValue(model, words)
        results[level].append(sim_values)

# Print the results
for level, sim_values in results.items():
    print(f"\nLevel {level} Synsets Similarity Values:")
    for i, values in enumerate(sim_values):
        print(f"  Synset {i}: {values}")

3 words out of 3 are in the vocabulary.
5 words out of 5 are in the vocabulary.
4 words out of 4 are in the vocabulary.
4 words out of 4 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
11 words out of 11 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
5 words out of 5 are in the vocabulary.
4 words out of 4 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
5 words out of 5 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
4 words out of 4 are in the vocabulary.
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary

##### 3. Use a table like below to present the results:

In [99]:
def create_synset_similarity_table(level_synsets, model):
    # Define the columns for the DataFrame
    columns = ['Synset level', 'Synset ID', 'Words in that synset', 'Average similarity', 'Standard Deviation', 'Minimum', 'Maximum']
    data = []

    for level, synsets in level_synsets.items():
        for synset in synsets:
            words = get_words_from_synset(synset)
            # Filter words that are in the model's vocabulary
            words_in_vocab = [word for word in words if word in model.key_to_index]
            sim_values = synsetSimValue(model, words_in_vocab)
            if sim_values:
                avg, sd, min_sim, max_sim = sim_values
            else:
                avg, sd, min_sim, max_sim = [0, 0, 0, 0]
            data.append([level, synset.name(), ', '.join(words_in_vocab), avg, sd, min_sim, max_sim])

    # Create the DataFrame
    df = pd.DataFrame(data, columns=columns)
    return df

In [104]:
# Assuming you have the Word2Vec model and the level_synsets dictionary
level_synsets = {
    4: level_4_synsets,
    6: level_6_synsets,
    8: level_8_synsets,
    10: level_10_synsets
}

similarity_table = create_synset_similarity_table(level_synsets, model)
similarity_table

Unnamed: 0,Synset level,Synset ID,Words in that synset,Average similarity,Standard Deviation,Minimum,Maximum
0,4,negativity.n.02,"negativity, negativeness, negativism",0.624178,0.07497,0.567598,0.730119
1,4,adversary.n.01,"adversary, antagonist, opponent, opposer, resi...",0.335335,0.119968,0.211449,0.569245
2,4,guidance.n.01,"guidance, counsel, counseling, direction",0.191985,0.067588,0.05628,0.254502
3,4,juju.n.02,"juju, voodoo, hoodoo, fetish",0.302415,0.135633,0.101258,0.563533
4,4,eloquence.n.01,"eloquence, fluency, smoothness",0.398152,0.052708,0.349843,0.471468
5,4,curdling.n.01,"curdling, clotting, coagulation",0.413783,0.124798,0.319528,0.590134
6,4,being.n.01,"being, beingness, existence",0.196695,0.072245,0.135599,0.298162
7,4,column.n.04,"column, tower, pillar",0.16914,0.058829,0.095845,0.239878
8,6,insulation.n.01,"insulation, insularity, detachment",0.16979,0.11035,0.072386,0.324086
9,6,dunce.n.01,"dunce, dunderhead, numskull, blockhead, bonehe...",0.343477,0.180263,0.033977,0.609986


####4 Cross Synset Similarity
##### 4. Consider the synsets you selected in step 1. For each level, form 8 pairs of synsets (each synset participate in two pairs).

##### 5. For each pair, pass it to crossSynsetSimValue(model, sset1, sset2) and collect the results

##### 6. Use a table like below to present the results (no need to list the individual words in this table):

##### 7. Use the two tables, to verify/contradict the arguments made in the first 3 paragraphs in the section. You may want to do additional analysis of the numbers to achieve it. Your argument may say “the results depends on the level of the synset....”