### Download the Required Resources


##### Import the libraries

In [2]:
import gensim.downloader as g1
from transformers import BertModel, BertTokenizer 
import torch
import numpy as np
from nltk.corpus import wordnet as wn
import random
import pandas as pd
import json 
from scipy.spatial.distance import cosine
from itertools import combinations


  from .autonotebook import tqdm as notebook_tqdm


#### Practice the APis

##### Practice Word2Vec

In [11]:
w2vModel = g1.load("word2vec-google-news-300");
model = w2vModel

In [11]:
sim = w2vModel.most_similar('board')
vec1 = w2vModel.get_vector("board") 
vec2 = w2vModel.get_vector("committee") 
print(sim)
print(vec1)
print(vec2)

[('Board', 0.673071563243866), ('directors', 0.6475343704223633), ('trustees', 0.6403145790100098), ('baord', 0.5922820568084717), ('Trustees', 0.5866842269897461), ('Governing_Board', 0.5753400325775146), ('Theresa_Colaizzi', 0.5700287818908691), ('Jane_Gallucci', 0.5602066516876221), ('boards', 0.5594165325164795), ('Pat_Deutschman', 0.5546656250953674)]
[-0.14453125 -0.25976562 -0.01611328 -0.01074219 -0.01281738 -0.34765625
  0.10839844  0.00340271  0.07080078  0.04199219  0.0456543  -0.14160156
 -0.03808594 -0.19335938 -0.30273438  0.09619141  0.0703125  -0.11425781
 -0.02709961  0.01306152 -0.09863281  0.22070312  0.00118256  0.1328125
  0.02783203  0.14453125 -0.21386719  0.30664062 -0.20117188 -0.29101562
  0.07080078 -0.07861328 -0.07958984 -0.06738281  0.17675781 -0.23730469
  0.171875    0.31445312  0.13378906 -0.12109375 -0.09423828  0.13671875
  0.0390625  -0.09619141  0.07666016 -0.12695312  0.19140625 -0.04907227
  0.04589844  0.21679688 -0.00778198  0.08886719  0.055664

##### Practice Bert

In [4]:
# Load pre-trained BERT model and tokenizer 
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model_bert = BertModel.from_pretrained(model_name)

In [5]:
def pre_process_sentence(sentences: list):
    inputs = tokenizer(sentences, padding=True, return_tensors="pt")
    return inputs

In [6]:
def get_bert_embeddings(sentences, model):
    """
    Get BERT embeddings for a list of sentences.
    
    Args:
    sentences (list): A list of strings, each representing a sentence.
    model_name (str): The name of the pre-trained BERT model to use.
    
    Returns:
    numpy.ndarray: A 3D array of shape (num_sentences, max_sentence_length, embedding_size)
                   containing the embeddings for each token in each sentence.
    """
   
    # Tokenize the input sentences and convert to PyTorch tensors
    inputs = pre_process_sentence(sentences)
    
    # Forward pass through the BERT model to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the hidden states (embeddings) of the last layer
    last_hidden_states = outputs.last_hidden_state
    
    # Convert tensor to numpy array
    embeddings = last_hidden_states.numpy()
    
    return embeddings

In [7]:
# Check an example with two sentences
# Example usage:
sentences = ["This dog is big", "This dog is lovely"]
embeddings = get_bert_embeddings(sentences, model_bert)
print(embeddings.shape)  # Should print (2, max_sentence_length_of_the_longest_sent, 768)

(2, 6, 768)


In [8]:
# Print each vector for each token in each sentence
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")
    for j, token in enumerate(tokenizer.tokenize(sentence)):
        print(f"  Token {j+1}: {token}")
        print(f"  Vector: {embeddings[i][j]}")
    print("-----")

Sentence 1: This dog is big
  Token 1: this
  Vector: [-1.22677162e-01  1.71388537e-01  1.38660222e-01 -1.25295550e-01
 -1.23378061e-01 -3.15118462e-01  3.81148607e-01  3.66328418e-01
 -1.53208971e-01 -2.16480389e-01  2.66019776e-02 -8.57386291e-02
 -1.29341215e-01  1.15167096e-01  1.53766591e-02  6.82867318e-02
 -1.28409401e-01  3.77842188e-01  1.31885767e-01  1.34913558e-02
 -1.12264499e-01 -2.79022753e-01  1.36035625e-02 -2.86049068e-01
  7.36279879e-04  8.30132291e-02 -5.26445471e-02 -1.37320757e-01
  1.02473967e-01  1.85041785e-01  2.88437396e-01  8.71096849e-02
 -3.10474727e-02  1.65917277e-01  5.35704121e-02 -1.11818470e-01
  1.77219182e-01  2.41990462e-02  3.89830582e-03  1.45159423e-01
  2.20687799e-02  1.54288813e-01  3.11056860e-02  6.85374960e-02
 -2.19790176e-01 -1.80895552e-01 -2.30779195e+00 -8.68935883e-02
  5.66399544e-02 -3.27564716e-01  1.84363574e-01 -2.74686664e-02
  1.11568775e-02  3.47044647e-01  1.00632526e-01  1.01709679e-01
 -2.12604910e-01  5.01872480e-01 -3.

### Part 1
One “objective” way of analyzing the similarities generated from a word embedding is to use existing human encoded knowledge. To this end, WordNet serves as a possible source for comparison.
For instance, one can pick a synset S, and look at all the words that are associated with S, and calculate the average similarity (of their corresponding vectors) between pairs of such words. Ideally these values should be high, as these words are supposedly “similar” (have a sense that is “the same”).
Once can also argue that if two synsets S1, S2 are “far apart” (e.g. having low path similarity), then if we pick a word associated with S1, and another associated with S2, their corresponding vectors’ similarity should be low.
To see if that is true, you should implement the following function and put it in “hw2.py” (when I say similarity below, I mean the (cosine)-similarity between the vectors (in the embedding) corresponds to the words.

#### Synonym Set Similarities Implementation
Where model is the model that is loaded from Word2Vec, and sset is a synset from
WordNet
- What the function returns depends on how many words are associated with sset
- If sset has only one word, you should return an empty list
- If sset has two words, you should return a list of one number, which is the similarity of the two vectors in the model corresponding to the two words
- If sset has three or more words, then you should return a list of 4 numbers: [avg, sd, min, max]
- avg is the average similarity between words in sset
- sd is the standard deviation of the similarities between words in sset
- min, max are the minimum and maximum similarity respectively


In [7]:
def synsetSimValue(model_wv, words):
    """
    Calculate similarity statistics for a list of words using pre-trained word vectors.

    Args:
    model_wv (gensim.models.keyedvectors.KeyedVectors): The word vectors from a Word2Vec model.
    words (list): A list of words (strings).

    Returns:
    list: A list containing similarity statistics (average, standard deviation, minimum, maximum).
    """

    # Check if the input is a list, if not but a string of len 1 convert it to list one word
    if not isinstance(words, list):
        if isinstance(words, str):
            words = [words]
        else:
            raise ValueError("The input must be a list of words or a single word.")
        
     # Filter words that are not in the model's vocabulary
    words = [word for word in words if word in model_wv.key_to_index]
    # print(f"{len(words)} words out of {len(words)} are in the vocabulary.")

    # Check the number of words in the synset
    num_words = len(words)

    if num_words == 1:
        return []
    elif num_words == 2:
        return [model_wv.similarity(words[0], words[1])]
    else:
        # Calculate the similarity between all pairs of words in the synset
        sim_values = []
        for i in range(num_words):
            for j in range(i+1, num_words):
                sim_values.append(model_wv.similarity(words[i], words[j]))

        # Calculate and return the statistics
         # Calculate statistics
        avg = np.mean(sim_values)
        sd = np.std(sim_values)
        min_sim = np.min(sim_values)
        max_sim = np.max(sim_values)

        return [avg, sd, min_sim, max_sim]

##### Test synsetsim Function.

In [10]:
def test_with_one_word(model, word):
    ''' Test with one word'''
    print("Test with 1 word, expect []")
    try:
        assert synsetSimValue(model, word) == []
        print("Pass")
        print(f"function returned {synsetSimValue(model, word)}")
    except AssertionError as e:
        print(f"Failed with error {e}")

def test_with_two_words(model, word1, word2):
    ''' 
    If sset has two words, you should return a list of one number, which is the similarity of the two vectors in the model corresponding to the two words
    '''
    print("Test with 2 words, expect [number]")
    try:
        assert len(synsetSimValue(model, [word1, word2])) == 1
        print("Passed........\n")
        print(f"similarity between {word1} and {word2} is {synsetSimValue(model, [word1, word2])}")
    except AssertionError as e:
        print(f"Failed with error {e}")

def test_with_more_than_two_words(model, words):
    """
    Test the synsetSimValue function with a list of more than two words.

    Args:
    model: The Word2Vec model.
    words (list): A list of words.

    Prints the result of the test and the statistics if the test passes.
    """
    print("Test with more than 2 words, expect [average, standard deviation, minimum, maximum]")
    stats = synsetSimValue(model, words)
    try:
        assert len(stats) == 4
        print("Passed, printing statistics........\n")
        avg, sd, min_sim, max_sim = stats
        print(f"Average: {avg:.4f}")
        print(f"Standard deviation: {sd:.4f}")
        print(f"Minimum: {min_sim:.4f}")
        print(f"Maximum: {max_sim:.4f}")
    except AssertionError:
        print("Failed")


In [36]:
model = w2vModel
# Example usage
test_words = ['dog', 'canine', 'puppy']

In [37]:
print("Test with one word")
test_with_one_word(model, test_words[0])

Test with one word
Test with 1 word, expect []
1 words out of 1 are in the vocabulary.
Pass
1 words out of 1 are in the vocabulary.
function returned []


In [43]:
print("Test with two words")
test_with_two_words(model, test_words[0], test_words[1])

Test with two words
Test with 2 words, expect [number]
2 words out of 2 are in the vocabulary.
Passed........

2 words out of 2 are in the vocabulary.
similarity between dog and canine is [0.69182897]


In [44]:
print("Test with more than two words")
test_with_more_than_two_words(model, test_words)

Test with more than two words
Test with more than 2 words, expect [average, standard deviation, minimum, maximum]
3 words out of 3 are in the vocabulary.
Passed, printing statistics........

Average: 0.6843
Standard deviation: 0.1064
Minimum: 0.5503
Maximum: 0.8106


#### Cross Synonym Set Implementation
- Where model is the model that is loaded from Word2Vec, and sset1, sset2 are synset
sfrom WordNet
- What the function returns depends on how many words are associated with sset1 and
sset2
- If both sset1 and sset2 has one word, you should return a list of one number,
which is the similarity of the two vectors in the model corresponding to the two
words
- Otherwise you should return a list of 4 numbers: [avg, sd, min, max]
- avg is the average similarity between pair of words, one from sset1 and the other from sset2
- sd is the standard deviation of the similarities described above
- min, max are the minimum and maximum similarity respectively as
described above.

In [12]:
def crossSynsetSimValue(model, words1, words2):
    """
    Calculate similarity statistics for pairs of words from two different sets using a Word2Vec model.

    Args:
    model (gensim.models.Word2Vec): The Word2Vec model.
    words1 (list): A list of words from the first set.
    words2 (list): A list of words from the second set.

    Returns:
    list: A list containing similarity statistics (average, standard deviation, minimum, maximum).
          If both sets have only one word, returns a list with the similarity between the two words.
    """
    # Filter words that are not in the model's vocabularry
    words1 = [word for word in words1 if word in model.key_to_index]
    words2 = [word for word in words2 if word in model.key_to_index]
    # print(f"{len(words1)} words out of {len(words1)} are in the vocabulary.")
    # print(f"{len(words2)} words out of {len(words2)} are in the vocabulary.")

    # return similarity between two words if both sets have only one word
    if len(words1) == 1 and len(words2) == 1:
        return [model.similarity(words1[0], words2[0])]
    else:
        # Calculate similarities between all pairs of words, one from each set
        similarities = []
        for word1 in words1:
            for word2 in words2:
                similarities.append(model.similarity(word1, word2))

        # Calculate statistics
        avg = np.mean(similarities)
        sd = np.std(similarities)
        min_sim = np.min(similarities)
        max_sim = np.max(similarities)

        return [avg, sd, min_sim, max_sim]

##### Test Cross Synset Sample Sentences

In [53]:
def test_crossSyn_with_one_word(model, sset1, sset2):
    ''' Test with one word'''
    print("Test with 1 word, expect []")
    sim = crossSynsetSimValue(model, sset1, sset2)
    try:
        assert len(sim) == 1
        print("Pass")
        print(f"function returned simarilary between {sset1[0]} and {sset2[0]} as: {sim}")
    except AssertionError as e:
        print(f"Failed with error {e}")

def test_crossSyn_statistic(model, sset1, sset2):
    ''' Test with more than 1 words in each set'''
    print("Test with more than 2 words, expect [average, standard deviation, minimum, maximum]")
    stats = crossSynsetSimValue(model, sset1, sset2)
    try:
        assert len(stats) == 4
        print("Passed, printing statistics........\n")
        avg, sd, min_sim, max_sim = stats
        print(f"Average: {avg:.4f}")
        print(f"Standard deviation: {sd:.4f}")
        print(f"Minimum: {min_sim:.4f}")
        print(f"Maximum: {max_sim:.4f}")
    except AssertionError as e:
        print(f"Failed with error {e}")

In [74]:
sset1 = ['dog', 'cat', 'puppy']
sset2 = ['apple', 'orange', 'tomato']

In [76]:
print("Test with one word")
test_crossSyn_with_one_word(model, ['cat'], ['dog'])

Test with one word
Test with 1 word, expect []
1 words out of 1 are in the vocabulary.
1 words out of 1 are in the vocabulary.
Pass
function returned simarilary between cat and dog as: [0.76094574]


In [77]:
print("Test with more than 2 words")
test_crossSyn_statistic(model, sset1, sset2)

Test with more than 2 words
Test with more than 2 words, expect [average, standard deviation, minimum, maximum]
3 words out of 3 are in the vocabulary.
3 words out of 3 are in the vocabulary.
Passed, printing statistics........

Average: 0.1773
Standard deviation: 0.0520
Minimum: 0.0791
Maximum: 0.2515


#### Pick Synset from NLTK WordNet API
Pick 32 synsets from the noun hypernyum-hyponym tree, eight of them from level 4 (assume the root is level-0), eight from level 6, eight from level 8, and eight from level 10. You can either do this manually, or you can write a program using nltk’s wordnet api to get those. For synsets


This code defines a function get_synsets_at_level that retrieves a specified number of synsets at a given level in the noun hypernym-hyponym tree. It then uses this function to get 8 synsets each at levels 4, 6, 8, and 10, and combines them into a list of 32 synsets. Finally, it prints out the selected synsets along with their definitions.

In [55]:
# Define a function to extract words from a synset
def get_words_from_synset(synset):
    return [lemma.name() for lemma in synset.lemmas()]

def get_synset_level(synset):
    """
    Get the level of a synset in WordNet.

    Args:
    synset (nltk.corpus.reader.wordnet.Synset): A WordNet synset.

    Returns:
    int: The level of the synset in the WordNet hierarchy.
    """
    level = 0
    while synset.hypernyms():
        synset = synset.hypernyms()[0]
        level += 1
    return level

def get_synsets_at_level(model_wv, level, num_synsets, min_words=3, seed=42):
    """
    Get a specified number of synsets at a given level in the noun hypernym-hyponym tree,
    ensuring that the synsets contain words present in the Word2Vec model's vocabulary and are at the correct level.

    Args:
    model_wv (gensim.models.keyedvectors.KeyedVectors): The word vectors from a Word2Vec model.
    level (int): The level in the tree (0 is the root).
    num_synsets (int): The number of synsets to retrieve at the given level.
    min_words (int): Minimum number of words in each synset that must be in the model's vocabulary.
    seed (int): The seed for the random number generator.

    Returns:
    list: A list of synsets at the specified level.
    """
    synsets = []
    all_synsets = list(wn.all_synsets('n'))  # Get all noun synsets

    # Filter synsets that are at the correct level and have at least 'min_words' words in the model's vocabulary
    filtered_synsets = [
        s for s in all_synsets
        if get_synset_level(s) == level and
        len([word for word in get_words_from_synset(s) if word in model_wv.key_to_index]) >= min_words
    ]

    # Set the seed for random number generator
    random.seed(seed)

    # Shuffle the synsets to get a random sample
    random.shuffle(filtered_synsets)

    # Select the first 'num_synsets' synsets
    selected_synsets = filtered_synsets[:num_synsets]

    return selected_synsets

##### Download Wordnet

In [15]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tango.tew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

##### Test Synset picks

In [30]:
# Get synsets at different levels
level_4_synsets = get_synsets_at_level(model, 4, 8)  # Get 8 synsets at level 4 (less specific than level 6)
print("Level 4 Synsets:")
for i, sset in enumerate(level_4_synsets):
    print(f"  Synset {i + 1}: {sset} - {sset.definition()}")

level_6_synsets = get_synsets_at_level(model, 6, 8)  # Get 8 synsets at level 6 (less specific than level 8)
print("\nLevel 6 Synsets:")
for i, sset in enumerate(level_6_synsets):
    print(f"  Synset {i + 1}: {sset} - {sset.definition()}")

level_8_synsets = get_synsets_at_level(model, 8, 8)  # Get 8 synsets at level 8 (less specific than level 10)
print("\nLevel 8 Synsets:")
for i, sset in enumerate(level_8_synsets):
    print(f"  Synset {i + 1}: {sset} - {sset.definition()}")

level_10_synsets = get_synsets_at_level(model, 10, 8)  # Get 8 synsets at level 10
print("\nLevel 10 Synsets:")
for i, sset in enumerate(level_10_synsets):
    print(f"  Synset {i + 1}: {sset} - {sset.definition()}")


Level 4 Synsets:
  Synset 1: Synset('guidance.n.01') - something that provides direction or advice as to a decision or course of action
  Synset 2: Synset('hazard.n.01') - a source of danger; a possibility of incurring loss or misfortune
  Synset 3: Synset('dissenter.n.01') - a person who dissents from some established policy
  Synset 4: Synset('narrative.n.01') - a message that tells the particulars of an act or occurrence or course of events; presented in writing or drama or cinema or as a radio or television program
  Synset 5: Synset('package.n.01') - a collection of things wrapped or boxed together
  Synset 6: Synset('beginning.n.02') - the time at which something is supposed to begin
  Synset 7: Synset('inhabitant.n.01') - a person who inhabits a particular place
  Synset 8: Synset('set.n.05') - an unofficial association of people or groups

Level 6 Synsets:
  Synset 1: Synset('dilatation.n.01') - the state of being stretched beyond normal dimensions
  Synset 2: Synset('generator

In [34]:
print(get_synset_level(wn.synset('knockout.n.02')))  # Should print 5

10


##### 2. For each synset, pass it to synsetSimValue(model, sset) to collect the results.

In [35]:
# Collect results for each synset
results = {}
for level, synsets in zip([4, 6, 8, 10], [level_4_synsets, level_6_synsets, level_8_synsets, level_10_synsets]):
    results[level] = []
    for synset in synsets:
        words = get_words_from_synset(synset)
        sim_values = synsetSimValue(model, words)
        results[level].append(sim_values)

# Print the results
for level, sim_values in results.items():
    print(f"\nLevel {level} Synsets Similarity Values:")
    for i, values in enumerate(sim_values):
        print(f"  Synset {i}: {values}")


Level 4 Synsets Similarity Values:
  Synset 0: [0.19198452, 0.0675881, 0.05628027, 0.2545016]
  Synset 1: [0.36985362, 0.110029414, 0.20530821, 0.54964393]
  Synset 2: [0.23820046, 0.09675737, 0.053950276, 0.38784942]
  Synset 3: [0.5221036, 0.124665916, 0.3561672, 0.68530774]
  Synset 4: [0.28917667, 0.08000256, 0.17092088, 0.41898152]
  Synset 5: [0.18865238, 0.13750823, -0.031129314, 0.5463147]
  Synset 6: [0.46770832, 0.13531648, 0.29098988, 0.6575424]
  Synset 7: [0.08376061, 0.05216451, 0.008357173, 0.1701282]

Level 6 Synsets Similarity Values:
  Synset 0: [0.61335826, 0.04073567, 0.5802395, 0.6707398]
  Synset 1: [0.14698185, 0.11449183, 0.013768709, 0.29329577]
  Synset 2: [0.5057629, 0.082199104, 0.401117, 0.68773746]
  Synset 3: [0.42021513, 0.062443297, 0.33730286, 0.5239381]
  Synset 4: [0.19750372, 0.14555739, 0.024629625, 0.40799326]
  Synset 5: [0.05846564, 0.085492596, -0.01168602, 0.17882062]
  Synset 6: [0.42099822, 0.10362398, 0.33835047, 0.5671262]
  Synset 7: [0.

##### 3. Use a table like below to present the results:

In [36]:
def create_synset_similarity_table(level_synsets, model):
    # Define the columns for the DataFrame
    columns = ['Synset level', 'Synset ID', 'Words in that synset', 'Average similarity', 'Standard Deviation', 'Minimum', 'Maximum']
    data = []

    for level, synsets in level_synsets.items():
        for synset in synsets:
            words = get_words_from_synset(synset)
            # Filter words that are in the model's vocabulary
            words_in_vocab = [word for word in words if word in model.key_to_index]
            sim_values = synsetSimValue(model, words_in_vocab)
            if sim_values:
                avg, sd, min_sim, max_sim = sim_values
            else:
                avg, sd, min_sim, max_sim = [0, 0, 0, 0]
            data.append([level, synset.name(), ', '.join(words_in_vocab), avg, sd, min_sim, max_sim])

    # Create the DataFrame
    df = pd.DataFrame(data, columns=columns)
    return df

In [37]:
# Assuming you have the Word2Vec model and the level_synsets dictionary
level_synsets = {
    4: level_4_synsets,
    6: level_6_synsets,
    8: level_8_synsets,
    10: level_10_synsets
}

print(f"level 4 synsets: {level_4_synsets}")
similarity_table = create_synset_similarity_table(level_synsets, model)
similarity_table

level 4 synsets: [Synset('guidance.n.01'), Synset('hazard.n.01'), Synset('dissenter.n.01'), Synset('narrative.n.01'), Synset('package.n.01'), Synset('beginning.n.02'), Synset('inhabitant.n.01'), Synset('set.n.05')]


Unnamed: 0,Synset level,Synset ID,Words in that synset,Average similarity,Standard Deviation,Minimum,Maximum
0,4,guidance.n.01,"guidance, counsel, counseling, direction",0.191985,0.067588,0.05628,0.254502
1,4,hazard.n.01,"hazard, jeopardy, peril, risk, endangerment",0.369854,0.110029,0.205308,0.549644
2,4,dissenter.n.01,"dissenter, dissident, protester, objector, con...",0.2382,0.096757,0.05395,0.387849
3,4,narrative.n.01,"narrative, narration, story, tale",0.522104,0.124666,0.356167,0.685308
4,4,package.n.01,"package, bundle, packet, parcel",0.289177,0.080003,0.170921,0.418982
5,4,beginning.n.02,"beginning, commencement, first, outset, start,...",0.188652,0.137508,-0.031129,0.546315
6,4,inhabitant.n.01,"inhabitant, habitant, dweller, denizen",0.467708,0.135316,0.29099,0.657542
7,4,set.n.05,"set, circle, band, lot",0.083761,0.052165,0.008357,0.170128
8,6,dilatation.n.01,"dilatation, distension, distention",0.613358,0.040736,0.580239,0.67074
9,6,generator.n.03,"generator, source, author",0.146982,0.114492,0.013769,0.293296


In [47]:
print(get_synset_level(wn.synset('bobbin.n.01')))  # Should print 5

10


#### Cross Synset Similarity
##### 4. Consider the synsets you selected in step 1. For each level, form 8 pairs of synsets (each synset participate in two pairs).

In [68]:
# def form_synset_pairs(level_synsets, model):
#     '''
#     Form pairs of synsets at each level in the noun hypernym-hyponym tree.

#     Args:
#     level_synsets (dict): A dictionary containing synsets at different levels in the noun hypernym-hyponym tree.
#     model (gensim.models.Word2Vec): The Word2Vec model.

#     Returns:
#     dict: A dictionary containing pairs of synsets at each level.
#     '''
#     synset_pairs = {}
#     for level, synsets in level_synsets.items():
#         # Form pairs of synsets
#         pairs = []
#         for i in range(len(synsets)):
#             pair1 = (synsets[i], synsets[(i + 1) % len(synsets)])
#             pair2 = (synsets[i], synsets[(i - 1) % len(synsets)])
#             pairs.append(pair1)
#             pairs.append(pair2)
#         # Remove duplicate pairs by converting to set and back to list
#         pairs = list(set([tuple(sorted([s1.name(), s2.name()])) for s1, s2 in pairs]))
#         synset_pairs[level] = pairs
#     return synset_pairs

def form_synset_pairs(level_synsets, model):
    '''
    Form pairs of synsets at each level in the noun hypernym-hyponym tree.

    Args:
    level_synsets (dict): A dictionary containing synsets at different levels in the noun hypernym-hyponym tree.
    model (gensim.models.Word2Vec): The Word2Vec model.

    Returns:
    dict: A dictionary containing pairs of synset names (without extension) at each level.
    '''
    synset_pairs = {}
    for level, synsets in level_synsets.items():
        # Form pairs of synsets
        pairs = []
        for i in range(len(synsets)):
            pair1 = (synsets[i].name().split('.')[0], synsets[(i + 1) % len(synsets)].name().split('.')[0])
            pair2 = (synsets[i].name().split('.')[0], synsets[(i - 1) % len(synsets)].name().split('.')[0])
            pairs.append(pair1)
            pairs.append(pair2)
        # Remove duplicate pairs by converting to set and back to list
        pairs = list(set([tuple(sorted(pair)) for pair in pairs]))
        synset_pairs[level] = pairs
    return synset_pairs


In [69]:
# Adjust the display code
synset_pairs = form_synset_pairs(level_synsets, model)

# Print the pairs for each level
for level, pairs in synset_pairs.items():
    print(f"\nLevel {level} Synset Pairs:")
    for i, (synset1, synset2) in enumerate(pairs):
        print(f"  Pair {i + 1}: ({synset1}, {synset2})")




Level 4 Synset Pairs:
  Pair 1: (inhabitant, set)
  Pair 2: (beginning, package)
  Pair 3: (dissenter, hazard)
  Pair 4: (guidance, hazard)
  Pair 5: (beginning, inhabitant)
  Pair 6: (dissenter, narrative)
  Pair 7: (guidance, set)
  Pair 8: (narrative, package)

Level 6 Synset Pairs:
  Pair 1: (coupling, obscenity)
  Pair 2: (curriculum_vitae, pass)
  Pair 3: (generator, whiner)
  Pair 4: (obscenity, whiner)
  Pair 5: (coupling, pass)
  Pair 6: (dilatation, generator)
  Pair 7: (dilatation, hybrid)
  Pair 8: (curriculum_vitae, hybrid)

Level 8 Synset Pairs:
  Pair 1: (presumption, wallet)
  Pair 2: (dry_dock, fatherland)
  Pair 3: (centrifuge, wallet)
  Pair 4: (fatherland, nitroglycerin)
  Pair 5: (dry_dock, sun_parlor)
  Pair 6: (centrifuge, sun_parlor)
  Pair 7: (nitroglycerin, sled)
  Pair 8: (presumption, sled)

Level 10 Synset Pairs:
  Pair 1: (dishwasher_detergent, myocardial_infarction)
  Pair 2: (myocardial_infarction, whipping)
  Pair 3: (mannequin, sanitation)
  Pair 4: (

##### 5. For each pair, pass it to crossSynsetSimValue(model, sset1, sset2) and collect the results

In [70]:
# Calculate similarity statistics for each pair of synsets
cross_synset_results = {}
for level, pairs in synset_pairs.items():
    cross_synset_results[level] = []
    for words1, words2 in pairs:
        sim_values = crossSynsetSimValue(model, words1, words2)
        cross_synset_results[level].append(sim_values)

# Print the results
for level, results in cross_synset_results.items():
    print(f"Level {level} Cross-Synset Similarity Statistics:")
    for i, (pair, stats) in enumerate(zip(synset_pairs[level], results), start=1):
        words1, words2 = pair  # Unpack the pair into two lists of words
        print(f"  Pair {i}:")
        print(f"    Words 1: {words1}")
        print(f"    Words 2: {words2}")
        avg, sd, min_sim, max_sim = stats
        print(f"    Similarity Stats: avg: {avg:.4f}, standard_deviation: {sd:.4f}, min: {min_sim:.4f}, max: {max_sim:.4f}")

Level 4 Cross-Synset Similarity Statistics:
  Pair 1:
    Words 1: inhabitant
    Words 2: set
    Similarity Stats: avg: 0.4081, standard_deviation: 0.1961, min: 0.2115, max: 1.0000
  Pair 2:
    Words 1: beginning
    Words 2: package
    Similarity Stats: avg: 0.3856, standard_deviation: 0.1986, min: 0.1507, max: 1.0000
  Pair 3:
    Words 1: dissenter
    Words 2: hazard
    Similarity Stats: avg: 0.4224, standard_deviation: 0.1687, min: 0.2537, max: 1.0000
  Pair 4:
    Words 1: guidance
    Words 2: hazard
    Similarity Stats: avg: 0.4596, standard_deviation: 0.1395, min: 0.2537, max: 1.0000
  Pair 5:
    Words 1: beginning
    Words 2: inhabitant
    Similarity Stats: avg: 0.5036, standard_deviation: 0.2277, min: 0.2115, max: 1.0000
  Pair 6:
    Words 1: dissenter
    Words 2: narrative
    Similarity Stats: avg: 0.4227, standard_deviation: 0.2267, min: 0.1278, max: 1.0000
  Pair 7:
    Words 1: guidance
    Words 2: set
    Similarity Stats: avg: 0.3531, standard_deviation: 0

##### 6. Use a table like below to present the results (no need to list the individual words in this table):

In [73]:
# def create_cross_synset_similarity_table(synset_pairs, cross_synset_results, level_synsets):
#     # Define the columns for the DataFrame
#     columns = ['Synset level', 'Synset ID for sset1', 'Synset ID for sset2', 'Average similarity', 'Standard Deviation', 'Minimum', 'Maximum']
#     data = []

#     for level, results in cross_synset_results.items():
#         for i, (pair, stats) in enumerate(zip(synset_pairs[level], results), start=1):
#             words1, words2 = pair
#             # Find the synset IDs for the words in the pair
#             synset1 = [synset.name() for synset in level_synsets[level] if set(words1).issubset(set(get_words_from_synset(synset)))]
#             synset2 = [synset.name() for synset in level_synsets[level] if set(words2).issubset(set(get_words_from_synset(synset)))]
#             synset1_id = synset1[0] if synset1 else "N/A"
#             synset2_id = synset2[0] if synset2 else "N/A"
#             avg, sd, min_sim, max_sim = stats
#             data.append([level, synset1_id, synset2_id, avg, sd, min_sim, max_sim])

#     # Create the DataFrame
#     df = pd.DataFrame(data, columns=columns)
#     return df

def create_cross_synset_similarity_table(synset_pairs, cross_synset_results, level_synsets):
    # Define the columns for the DataFrame
    columns = ['Synset level', 'Synset ID for sset1', 'Synset ID for sset2', 'Average similarity', 'Standard Deviation', 'Minimum', 'Maximum']
    data = []

    for level, results in cross_synset_results.items():
        for i, (pair, stats) in enumerate(zip(synset_pairs[level], results), start=1):
            words1, words2 = pair
            # Find the synset IDs for the words in the pair
            synset1_id = [synset.name() for synset in level_synsets[level] if synset.name().split('.')[0] == words1][0]
            synset2_id = [synset.name() for synset in level_synsets[level] if synset.name().split('.')[0] == words2][0]
            avg, sd, min_sim, max_sim = stats
            data.append([level, synset1_id, synset2_id, avg, sd, min_sim, max_sim])

    # Create the DataFrame
    df = pd.DataFrame(data, columns=columns)
    return df


In [74]:
cross_synset_similarity_table = create_cross_synset_similarity_table(synset_pairs, cross_synset_results, level_synsets)
cross_synset_similarity_table

Unnamed: 0,Synset level,Synset ID for sset1,Synset ID for sset2,Average similarity,Standard Deviation,Minimum,Maximum
0,4,inhabitant.n.01,set.n.05,0.408121,0.196072,0.21153,1.0
1,4,beginning.n.02,package.n.01,0.38559,0.198553,0.150729,1.0
2,4,dissenter.n.01,hazard.n.01,0.422412,0.168732,0.253688,1.0
3,4,guidance.n.01,hazard.n.01,0.459631,0.13947,0.253688,1.0
4,4,beginning.n.02,inhabitant.n.01,0.503638,0.227697,0.21153,1.0
5,4,dissenter.n.01,narrative.n.01,0.422688,0.226678,0.127753,1.0
6,4,guidance.n.01,set.n.05,0.353062,0.168004,0.21153,1.0
7,4,narrative.n.01,package.n.01,0.342242,0.156917,0.114713,1.0
8,6,coupling.n.03,obscenity.n.02,0.443409,0.170745,0.21153,1.0
9,6,curriculum_vitae.n.01,pass.n.09,0.319829,0.099261,0.013744,0.5105


##### 7. Use the two tables, to verify/contradict the arguments made in the first 3 paragraphs in the section. You may want to do additional analysis of the numbers to achieve it. Your argument may say “the results depends on the level of the synset....”

**Conclusion**:

Based on the analysis of the same-word and cross-synset similarities, we can draw the following conclusions:

**Same-Word Similarity**: The average similarity within synsets is generally high, which supports the hypothesis that words within the same synset are semantically related. For example, at index 0, level 4, synsets such as telegrapher.n.01 show an average similarity of 0.584288, indicating strong semantic similarity among the words in the synset.

**Cross-Synset Similarity**: The average similarity between different synsets is typically lower than the same-word similarity, which aligns with the expectation that words from different synsets are less semantically related. For instance, the average similarity between telegrapher.n.01 and juju.n.02 at index 0, level 4 is 0.040873, which is significantly lower than the similarities observed within synsets.

Exceptions and Variability: Despite the general trends, there are exceptions where different synsets exhibit higher-than-expected similarity. For example, the average similarity between nonsense.n.01 and bagatelle.n.02 at level 4 is 0.311770, which is unusually high for cross-synset similarity. This suggests that the semantic relationship between synsets can be complex and not always predictable.

In conclusion, the analysis largely supports the initial arguments regarding the semantic similarities within and across synsets. Words within the same synset tend to have higher average similarity, reflecting their semantic relatedness. In contrast, words from different synsets generally show lower similarity, indicating lesser semantic relatedness. However, the presence of exceptions and variability in the data highlights the complexity of semantic relationships in natural language.

### Part 2: Comparing BERT within synsets


We can ask the similar questions for BERT instead of Word2Vec. However, since BERT is a contextualized embedding, there is no single vector associated with each word. Instead, you have to submit a sentence to BERT and it will return the vector corresponds to each word.

#### Part1 Check if same word in a different sentences would return the same vector.


In [75]:
# Load pre-trained BERT model and tokenizer 
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model_bert = BertModel.from_pretrained(model_name)

##### Helper Functions for Bert Data preprocessing

In [89]:
# def get_bert_embeddings(sentence, model, tokenizer):
#     """
#     Get BERT embeddings for a single sentence.
#     """
#     # Tokenize the input sentence and convert to PyTorch tensors
#     inputs = tokenizer(sentence, padding=True, return_tensors="pt")
#     tokenized_sentence = tokenizer.tokenize(sentence)

#     # Forward pass through the BERT model to get embeddings
#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Get the hidden states (embeddings) of the last layer
#     last_hidden_states = outputs.last_hidden_state

#     # Convert tensor to numpy array
#     embeddings = last_hidden_states.numpy()

#     return embeddings, tokenized_sentence

# def getBertVector(model, word, sentences):
#     """
#     Generate BERT vectors for a word in a list of sentences.
#     """
#     vectors = []
#     for sentence in sentences:
#         # Get BERT embeddings and tokenized sentence
#         embeddings, tokenized_sentence = get_bert_embeddings(sentence, model, tokenizer)

#         # Find the position of the word or its subwords in the tokenized sentence
#         word_positions = [i for i, token in enumerate(tokenized_sentence) if word in token]

#         # If the word or its subwords are not in the sentence, add an empty list
#         if not word_positions:
#             vectors.append([])
#             continue

#         # Extract the embedding for the first occurrence of the word or its subwords
#         word_vector = embeddings[0, word_positions[0], :]
#         vectors.append(word_vector)

#     return vectors

from transformers import BertTokenizer, BertModel
import torch

def get_bert_embeddings(sentence, model):
    """
    Get BERT embeddings for a single sentence.
    """
    # Initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the input sentence and convert to PyTorch tensors
    inputs = tokenizer(sentence, padding=True, return_tensors="pt")
    tokenized_sentence = tokenizer.tokenize(sentence)

    # Forward pass through the BERT model to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the hidden states (embeddings) of the last layer
    last_hidden_states = outputs.last_hidden_state

    # Convert tensor to numpy array
    embeddings = last_hidden_states.numpy()

    return embeddings, tokenized_sentence

def genBERTVector(model, word, sentences):
    """
    Generate BERT vectors for a word in a list of sentences.
    """
    vectors = []
    for sentence in sentences:
        # Get BERT embeddings and tokenized sentence
        # embeddings, tokenized_sentence = get_bert_embeddings(sentence, model)

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Tokenize the input sentence and convert to PyTorch tensors
        inputs = tokenizer(sentence, padding=True, return_tensors="pt")
        tokenized_sentence = tokenizer.tokenize(sentence)

        # Forward pass through the BERT model to get embeddings
        with torch.no_grad():
            outputs = model(**inputs)

        # Get the hidden states (embeddings) of the last layer
        last_hidden_states = outputs.last_hidden_state

        # Convert tensor to numpy array
        embeddings = last_hidden_states.numpy()


        # Find the position of the word or its subwords in the tokenized sentence
        word_positions = [i for i, token in enumerate(tokenized_sentence) if word in token]

        # If the word or its subwords are not in the sentence, add an empty list
        if not word_positions:
            vectors.append([])
            continue

        # Extract the embedding for the first occurrence of the word or its subwords
        word_vector = embeddings[0, word_positions[0], :]
        vectors.append(word_vector)

    return vectors

    
    
def get_synset_level(synset):
    """
    Get the level of a synset in the WordNet hierarchy.

    Args:
    synset (nltk.corpus.reader.wordnet.Synset): A WordNet synset.

    Returns:
    int: The level of the synset in the hierarchy.
    """
    level = 0
    while synset.hypernyms():
        synset = synset.hypernyms()[0]
        level += 1
    return level


##### Test Part Two:
Consider a word that is in multiple sentences, will the same word’s (that have the same sense)
have the same (or similar) embedded vectors?

In [80]:
sentences_dogs = ["This dog is barking", "This dog is barking loudly", "How are you today?", "My dog is a big dog"]
vectors_dog = genBERTVector(model_bert, "dog", sentences_dogs)
for i, vec in enumerate(vectors_dog):
    print(f"V{i+1} = {vec.tolist() if len(vec) > 0 else '[]'}")

V1 = [-0.3924993872642517, 0.013372927904129028, 0.1901414841413498, -0.4516221880912781, 0.6767682433128357, 0.49663418531417847, 0.6628692150115967, 1.434213638305664, -0.33909350633621216, -0.24653920531272888, -0.11542430520057678, -1.1893110275268555, -0.12006565183401108, 0.21110528707504272, 0.03133716434240341, 0.5490267872810364, 0.08786246925592422, 0.28771302103996277, 0.4951881170272827, 0.4616583585739136, -0.13183458149433136, -0.5924156904220581, -0.29464057087898254, 0.3969518542289734, 0.28486084938049316, 0.20589354634284973, 0.37705016136169434, 0.4584537744522095, -0.008960023522377014, -0.18570205569267273, -0.4013621509075165, 0.2506321966648102, -0.05282978340983391, 0.5774250030517578, 0.016261544078588486, -0.08254845440387726, 0.21561606228351593, -0.5328277349472046, 0.035012386739254, 0.4629247188568115, -0.33626776933670044, -0.3762643039226532, 0.29860714077949524, -0.48116761445999146, -0.43893375992774963, -0.6458852887153625, -0.021414972841739655, -0.0

##### 2. For each word, call the genBERTVector() function to retrieve the vector for each word in the sentences.

In [83]:
# Load the JSON file
with open('sentences_by_synset.json', 'r') as file:
    data = json.load(file)

In [87]:
# Function to generate BERT vectors for each word in the sentences
def generate_vectors(data, model):
    vectors = {}
    for synset, words_dict in data.items():
        vectors[synset] = {}
        for word, sentences in words_dict.items():
            vectors[synset][word] = genBERTVector(model, word, sentences)
    return vectors

In [88]:
# Call the function to generate vectors
vectors = generate_vectors(data, model_bert, tokenizer)

# Example: Print the vectors for the word 'narrative' in the synset 'narrative.n.01'
print(vectors['narrative.n.01']['narrative'])


[array([-1.12604544e-01, -9.33351666e-02, -4.01862383e-01, -2.42557839e-01,
        3.76543194e-01, -4.25543785e-01,  2.35960081e-01,  4.08625960e-01,
       -6.15531027e-01,  1.49725974e-01, -1.23684831e-01, -7.66852081e-01,
       -2.54453402e-02,  1.45650253e-01,  3.25838476e-01,  5.57404041e-01,
       -3.62933911e-02, -1.36008337e-01, -6.25536621e-01, -4.03050631e-01,
       -4.63696003e-01, -2.84207106e-01, -1.01359546e-01,  2.40284801e-01,
        7.90434420e-01,  8.69410336e-02,  3.84409636e-01, -8.72452259e-02,
       -6.68790191e-03, -2.98979431e-01,  2.22948864e-01, -1.62265331e-01,
       -4.10168320e-01,  1.02834761e+00,  3.23746830e-01,  3.45036238e-02,
       -3.96756865e-02, -1.19332537e-01, -2.35320136e-01, -1.31071657e-01,
       -5.10532521e-02, -3.11366647e-01,  6.55820251e-01,  3.46194237e-01,
       -2.11990625e-03, -2.63051331e-01,  2.42024705e-01, -4.17015292e-02,
        4.59078372e-01, -1.57421008e-01, -4.67293680e-01,  6.37058973e-01,
        6.32328033e-01, 

##### 3. Calculate the pairwise cosine similarity of each pair of vector. Record the average, standard deviation, min and max similarity value

Calculate the cosine similarities and return the stats for each pair of vectors of each word embedded vectors.

In [90]:

def calculate_cosine_similarity(vectors):
    similarities = []
    for i in range(len(vectors)):
        for j in range(i + 1, len(vectors)):
            if len(vectors[i]) > 0 and len(vectors[j]) > 0:
                similarity = 1 - cosine(vectors[i], vectors[j])
                similarities.append(similarity)
    
    if len(similarities) > 0:
        avg_similarity = np.mean(similarities)
        std_dev_similarity = np.std(similarities)
        min_similarity = np.min(similarities)
        max_similarity = np.max(similarities)
        return avg_similarity, std_dev_similarity, min_similarity, max_similarity
    else:
        return 0, 0, 0, 0

In [102]:
synset_stats = {}

for synset, words in vectors.items():
    synset_stats[synset] = {}
    for word, vecs in words.items():
        if vecs:  # Check if there are any vectors for this word
            avg_similarity, std_dev_similarity, min_similarity, max_similarity = calculate_cosine_similarity(vecs)
            synset_stats[synset][word] = {
                'Average similarity': avg_similarity,
                'Standard deviation': std_dev_similarity,
                'Minimum similarity': min_similarity,
                'Maximum similarity': max_similarity
            }

# Print the stats for each synset and word
for synset, words_stats in synset_stats.items():
    print(f"\nSynset: {synset}")
    for word, stats in words_stats.items():
        print(f"  Word: {word}")
        for stat_name, stat_value in stats.items():
            print(f"    {stat_name}: {stat_value}")


Synset: narrative.n.01
  Word: narrative
    Average similarity: 0.5137321949005127
    Standard deviation: 0.21643478646201697
    Minimum similarity: 0.28049299120903015
    Maximum similarity: 0.8692008852958679
  Word: narration
    Average similarity: 0.5676038960615793
    Standard deviation: 0.161876625418017
    Minimum similarity: 0.38016167283058167
    Maximum similarity: 0.8438411951065063
  Word: story
    Average similarity: 0.5570192684729894
    Standard deviation: 0.11659466822751499
    Minimum similarity: 0.3718770742416382
    Maximum similarity: 0.6755340099334717
  Word: tale
    Average similarity: 0.5661893089612325
    Standard deviation: 0.13611079653857722
    Minimum similarity: 0.40922173857688904
    Maximum similarity: 0.7945946455001831

Synset: package.n.01
  Word: package
    Average similarity: 0.4680553525686264
    Standard deviation: 0.16898095149378142
    Minimum similarity: 0.2804751992225647
    Maximum similarity: 0.7805398106575012
  Word: b

##### 4. Present the result in a table below. Each row should correspond to a word

Display the table for all the extracted vectors of similar word in different sentences in the same synset.

In [101]:
def display_synset_statistics(synset_stats):
    """
    Present the statistics for each synset in a table.

    Args:
    synset_stats (dict): A dictionary where keys are synset IDs and values are dictionaries of words and their statistics.

    Returns:
    pandas.DataFrame: A DataFrame containing the statistics for each word in each synset.
    """
    data = []
    for synset_id, words_stats in synset_stats.items():
        # Find the level of the synset
        level = get_synset_level(wn.synset(synset_id))
        for word, stats in words_stats.items():
            data.append([
                level,
                synset_id,
                word,
                stats['Average similarity'],
                stats['Standard deviation'],
                stats['Minimum similarity'],
                stats['Maximum similarity']
            ])

    # Create a DataFrame to display the results
    df = pd.DataFrame(data, columns=['Synset level', 'Synset ID', 'Word', 'Average Similarity', 'Standard Deviation', 'Minimum', 'Maximum'])
    return df


In [103]:
# # Example usage
# synset_stats = {
#     "narrative.n.01": {
#         "narrative": {
#             'Average similarity': 0.5137,
#             'Standard deviation': 0.2164,
#             'Minimum similarity': 0.2805,
#             'Maximum similarity': 0.8692
#         },
#         "narration": {
#             'Average similarity': 0.5676,
#             'Standard deviation': 0.1619,
#             'Minimum similarity': 0.3802,
#             'Maximum similarity': 0.8438
#         }
#     }
# }

df = display_synset_statistics(synset_stats)
df

Unnamed: 0,Synset level,Synset ID,Word,Average Similarity,Standard Deviation,Minimum,Maximum
0,4,narrative.n.01,narrative,0.513732,0.216435,0.280493,0.869201
1,4,narrative.n.01,narration,0.567604,0.161877,0.380162,0.843841
2,4,narrative.n.01,story,0.557019,0.116595,0.371877,0.675534
3,4,narrative.n.01,tale,0.566189,0.136111,0.409222,0.794595
4,4,package.n.01,package,0.468055,0.168981,0.280475,0.78054
5,4,package.n.01,bundle,0.530151,0.163049,0.291323,0.730898
6,4,package.n.01,packet,0.582036,0.122606,0.478238,0.849265
7,4,package.n.01,parcel,0.467618,0.234115,0.266685,0.804487
8,6,obscenity.n.02,obscenity,0.0,0.0,0.0,0.0
9,6,obscenity.n.02,smut,0.0,0.0,0.0,0.0


In [1]:
# Example usage
file_path = "sentences_by_synset.json"

# Get synsets at different levels
level_4_synsets = get_synsets_at_level(w2vModel, 4, 8)  # Get 8 synsets at level 4 (less specific than level 6)
level_6_synsets = get_synsets_at_level(w2vModel, 6, 8)  # Get 8 synsets at level 6 (less specific than level 8)
level_8_synsets = get_synsets_at_level(w2vModel, 8, 8)  # Get 8 synsets at level 8 (less specific than level 10)
level_10_synsets = get_synsets_at_level(w2vModel, 10, 8)  # Get 8 synsets at level 10

# Assuming you have the Word2Vec model and the level_synsets dictionary
level_synsets = {
    4: level_4_synsets,
    6: level_6_synsets,
    8: level_8_synsets,
    10: level_10_synsets
}

# Load the sentences from the JSON file
with open("sentences_by_synset.json", "r") as f:
    sentences_by_synset = json.load(f)

# Generate BERT vectors for each word in the sentences of each synset
synset_vectors = {}
for synset, sentences in sentences_by_synset.items():
    word = synset.split('.')[0]  # Extract the word from the synset ID
    vectors = genBERTVector(model_bert, word, sentences)
    synset_vectors[synset] = vectors
    # print(f"{synset}: {len(sentences)} sentences")

# Calculate statistics for each synset and display them
statistics_df = display_synset_statistics(synset_vectors, level_synsets)
statistics_df

NameError: name 'get_synsets_at_level' is not defined

In [123]:
print(f"get level for synset telegrapher.n.01 {get_synset_level(wn.synset('backbone.n.02'))}")

get level for synset telegrapher.n.01 8


##### Calculate Cosine for Cross Synset words
Now consider each pair of words in the same synset. Calculate the cosine similarity of all pair of vectors, where the two vectors corresponds to the different word.
For example, let say for the above case, the four vectors corresponds the word layer is l1, l2, l3, l4. And the four vectors corresponds to bed is b1, b2, b3, b4
Then you should calculate the cosine similarity of (v, w) where v is one of the l’s and w is one of the b’s. So you should get 16 numbers.
If you synset has more than 2 words, than do the same for every pair of words.
You should once again, calculate average, standard deviation, mean and max for all the cases.

In [226]:
def get_word_vectors(model, tokenizer, word, sentence):
    """
    Generate BERT vectors for a word using a given sentence.

    Args:
    model (transformers.BertModel): A BERT model.
    tokenizer (transformers.BertTokenizer): A BERT tokenizer.
    word (str): A word.
    sentence (str): A sentence containing the word.

    Returns:
    list: A list containing the vector for the word.
    """
    vectors = getBertVector(model, tokenizer, word, [sentence])
    return {word: vectors}


import numpy as np
from scipy.spatial.distance import cosine

def calculate_cross_word_similarity(synset_vectors):
    """
    Calculate the cosine similarity for each pair of vectors corresponding to different words in the same synset.

    Args:
    synset_vectors (dict): A dictionary where keys are words and values are lists of vectors for each word.

    Returns:
    dict: A dictionary containing the statistics (average, standard deviation, minimum, maximum) for each word pair.
    """
    similarities = []
    word_pairs = combinations(synset_vectors.keys(), 2)
    
    for word1, word2 in word_pairs:
        vectors1 = synset_vectors[word1]
        vectors2 = synset_vectors[word2]
        for vec1 in vectors1:
            for vec2 in vectors2:
                # Convert vec1 and vec2 to NumPy arrays if they are not already
                vec1 = np.array(vec1) if not isinstance(vec1, np.ndarray) else vec1
                vec2 = np.array(vec2) if not isinstance(vec2, np.ndarray) else vec2

                if vec1.size > 0 and vec2.size > 0:  # Ensure both vectors are not empty
                    similarity = 1 - cosine(vec1, vec2)
                    similarities.append(similarity)
    
    # Calculate statistics
    if similarities:
        avg_similarity = np.mean(similarities)
        std_deviation = np.std(similarities)
        min_similarity = np.min(similarities)
        max_similarity = np.max(similarities)
    else:
        avg_similarity = std_deviation = min_similarity = max_similarity = 0
    
    return {
        "average": avg_similarity,
        "standard_deviation": std_deviation,
        "minimum": min_similarity,
        "maximum": max_similarity
    }



In [227]:
# Load the JSON data
with open('cross_words_synsets.json', 'r') as file:
    synset_data = json.load(file)

# Initialize a dictionary to store the vectors for each synset
synset_vectors = {}

# Iterate over each synset in the JSON
for synset, words_sentences in synset_data.items():
    # Initialize a dictionary to store vectors for each word in the synset
    word_vectors = {}
    for word, sentences in words_sentences.items():
        print(f"Processing word {word} in synset {synset}")
        # Generate vectors for the word using its associated sentences
        vectors = get_word_vectors(model_bert, tokenizer, word, sentences)
        word_vectors[word] = vectors[word]  # Store the vectors for the word
    synset_vectors[synset] = word_vectors  # Store the vectors for the synset




Processing word nonsense in synset nonsense.n.01
Processing word bunk in synset nonsense.n.01
Processing word nonsensicality in synset nonsense.n.01
Processing word meaninglessness in synset nonsense.n.01
Processing word hokum in synset nonsense.n.01
Processing word narrative in synset narrative.n.01
Processing word narration in synset narrative.n.01
Processing word story in synset narrative.n.01
Processing word tale in synset narrative.n.01
Processing word garbage in synset garbage.n.01
Processing word refuse in synset garbage.n.01
Processing word food_waste in synset garbage.n.01
Processing word scraps in synset garbage.n.01
Processing word chaos in synset chaos.n.01
Processing word pandemonium in synset chaos.n.01
Processing word bedlam in synset chaos.n.01
Processing word topsy-turvydom in synset chaos.n.01
Processing word topsy-turvyness in synset chaos.n.01
Processing word plaza in synset plaza.n.01
Processing word place in synset plaza.n.01
Processing word piazza in synset plaza

In [228]:
# Calculate the cosine similarity statistics for each synset
synset_similarity_stats = {}
for synset, vectors in synset_vectors.items():
    print(f" vector shape is {vectors.keys()}")
    similarity_stats = calculate_cross_word_similarity(vectors)
    synset_similarity_stats[synset] = similarity_stats

# Print the results
for synset, stats in synset_similarity_stats.items():
    print(f"Synset: {synset}")
    print(f"Average similarity: {stats['average']}")
    print(f"Standard deviation: {stats['standard_deviation']}")
    print(f"Minimum similarity: {stats['minimum']}")
    print(f"Maximum similarity: {stats['maximum']}")
    print()

 vector shape is dict_keys(['nonsense', 'bunk', 'nonsensicality', 'meaninglessness', 'hokum'])
 vector shape is dict_keys(['narrative', 'narration', 'story', 'tale'])
 vector shape is dict_keys(['garbage', 'refuse', 'food_waste', 'scraps'])
 vector shape is dict_keys(['chaos', 'pandemonium', 'bedlam', 'topsy-turvydom', 'topsy-turvyness'])
 vector shape is dict_keys(['plaza', 'place', 'piazza'])
 vector shape is dict_keys(['backbone', 'grit', 'guts', 'moxie', 'sand', 'gumption'])
 vector shape is dict_keys(['hammer', 'pound', 'hammering', 'pounding'])
 vector shape is dict_keys(['woodworker', 'woodsman', 'woodman'])
Synset: nonsense.n.01
Average similarity: 0.7156285643577576
Standard deviation: 0.0
Minimum similarity: 0.7156285643577576
Maximum similarity: 0.7156285643577576

Synset: narrative.n.01
Average similarity: 0.4968409736951192
Standard deviation: 0.08890851340325828
Minimum similarity: 0.3732214868068695
Maximum similarity: 0.6119920015335083

Synset: garbage.n.01
Average sim

In [239]:
def display_similarity_table(similarity_stats):
    """
    Display a table of similarity statistics.

    Args:
    similarity_stats (dict): A dictionary containing the statistics for each synset.
    """
    # Initialize lists to store data
    synsets = []
    synset_ids = []
    words = []
    averages = []
    standard_deviations = []
    minimums = []
    maximums = []
    
    # Iterate over the similarity statistics and extract data
    for synset, stats in similarity_stats.items():
        synsets.append(synset)
        synset_ids.append(synset)
        words_in_synset = ', '.join(stats.get('Words in Synset', []))
        words.append(words_in_synset)
        averages.append(stats.get('average', ''))
        standard_deviations.append(stats.get('standard_deviation', ''))
        minimums.append(stats.get('minimum', ''))
        maximums.append(stats.get('maximum', ''))
    
    # Create a DataFrame from the extracted data
    df = pd.DataFrame({
        'Synset': synsets,
        'Synset ID': synset_ids,
        'Words in Synset': words,
        'Average': averages,
        'Standard Deviation': standard_deviations,
        'Minimum': minimums,
        'Maximum': maximums,
    })
    
    # Display the DataFrame
    display(df)

In [241]:
# Example usage
# synset_similarity_stats = {
#     "nonsense.n.01": {
#         "Average similarity": 0.7156285643577576,
#         "Standard deviation": 0.0,
#         "Minimum similarity": 0.7156285643577576,
#         "Maximum similarity": 0.7156285643577576
#     },
#     # Add more synsets here...
# }
# for synset, stats in synset_similarity_stats.items():
#     stats['Words in Synset'] = synset_vectors[synset].keys()
for synset, vectors in synset_vectors.items():
    similarity_stats = calculate_cross_word_similarity(vectors)
    synset_similarity_stats[synset] = similarity_stats
    display_similarity_table(synset_similarity_stats)

Unnamed: 0,Synset,Synset ID,Words in Synset,Average,Standard Deviation,Minimum,Maximum
0,nonsense.n.01,nonsense.n.01,,0.715629,0.0,0.715629,0.715629
1,narrative.n.01,narrative.n.01,,0.496841,0.088909,0.373221,0.611992
2,garbage.n.01,garbage.n.01,,0.282857,0.0,0.282857,0.282857
3,chaos.n.01,chaos.n.01,,0.0,0.0,0.0,0.0
4,plaza.n.01,plaza.n.01,,0.358596,0.074092,0.283067,0.459257
5,backbone.n.02,backbone.n.02,,0.365023,0.073182,0.247944,0.463982
6,hammer.n.08,hammer.n.08,,0.482632,0.078773,0.350948,0.606158
7,woodworker.n.01,woodworker.n.01,,0.0,0.0,0.0,0.0


Unnamed: 0,Synset,Synset ID,Words in Synset,Average,Standard Deviation,Minimum,Maximum
0,nonsense.n.01,nonsense.n.01,,0.715629,0.0,0.715629,0.715629
1,narrative.n.01,narrative.n.01,,0.496841,0.088909,0.373221,0.611992
2,garbage.n.01,garbage.n.01,,0.282857,0.0,0.282857,0.282857
3,chaos.n.01,chaos.n.01,,0.0,0.0,0.0,0.0
4,plaza.n.01,plaza.n.01,,0.358596,0.074092,0.283067,0.459257
5,backbone.n.02,backbone.n.02,,0.365023,0.073182,0.247944,0.463982
6,hammer.n.08,hammer.n.08,,0.482632,0.078773,0.350948,0.606158
7,woodworker.n.01,woodworker.n.01,,0.0,0.0,0.0,0.0


Unnamed: 0,Synset,Synset ID,Words in Synset,Average,Standard Deviation,Minimum,Maximum
0,nonsense.n.01,nonsense.n.01,,0.715629,0.0,0.715629,0.715629
1,narrative.n.01,narrative.n.01,,0.496841,0.088909,0.373221,0.611992
2,garbage.n.01,garbage.n.01,,0.282857,0.0,0.282857,0.282857
3,chaos.n.01,chaos.n.01,,0.0,0.0,0.0,0.0
4,plaza.n.01,plaza.n.01,,0.358596,0.074092,0.283067,0.459257
5,backbone.n.02,backbone.n.02,,0.365023,0.073182,0.247944,0.463982
6,hammer.n.08,hammer.n.08,,0.482632,0.078773,0.350948,0.606158
7,woodworker.n.01,woodworker.n.01,,0.0,0.0,0.0,0.0


Unnamed: 0,Synset,Synset ID,Words in Synset,Average,Standard Deviation,Minimum,Maximum
0,nonsense.n.01,nonsense.n.01,,0.715629,0.0,0.715629,0.715629
1,narrative.n.01,narrative.n.01,,0.496841,0.088909,0.373221,0.611992
2,garbage.n.01,garbage.n.01,,0.282857,0.0,0.282857,0.282857
3,chaos.n.01,chaos.n.01,,0.0,0.0,0.0,0.0
4,plaza.n.01,plaza.n.01,,0.358596,0.074092,0.283067,0.459257
5,backbone.n.02,backbone.n.02,,0.365023,0.073182,0.247944,0.463982
6,hammer.n.08,hammer.n.08,,0.482632,0.078773,0.350948,0.606158
7,woodworker.n.01,woodworker.n.01,,0.0,0.0,0.0,0.0


Unnamed: 0,Synset,Synset ID,Words in Synset,Average,Standard Deviation,Minimum,Maximum
0,nonsense.n.01,nonsense.n.01,,0.715629,0.0,0.715629,0.715629
1,narrative.n.01,narrative.n.01,,0.496841,0.088909,0.373221,0.611992
2,garbage.n.01,garbage.n.01,,0.282857,0.0,0.282857,0.282857
3,chaos.n.01,chaos.n.01,,0.0,0.0,0.0,0.0
4,plaza.n.01,plaza.n.01,,0.358596,0.074092,0.283067,0.459257
5,backbone.n.02,backbone.n.02,,0.365023,0.073182,0.247944,0.463982
6,hammer.n.08,hammer.n.08,,0.482632,0.078773,0.350948,0.606158
7,woodworker.n.01,woodworker.n.01,,0.0,0.0,0.0,0.0


Unnamed: 0,Synset,Synset ID,Words in Synset,Average,Standard Deviation,Minimum,Maximum
0,nonsense.n.01,nonsense.n.01,,0.715629,0.0,0.715629,0.715629
1,narrative.n.01,narrative.n.01,,0.496841,0.088909,0.373221,0.611992
2,garbage.n.01,garbage.n.01,,0.282857,0.0,0.282857,0.282857
3,chaos.n.01,chaos.n.01,,0.0,0.0,0.0,0.0
4,plaza.n.01,plaza.n.01,,0.358596,0.074092,0.283067,0.459257
5,backbone.n.02,backbone.n.02,,0.365023,0.073182,0.247944,0.463982
6,hammer.n.08,hammer.n.08,,0.482632,0.078773,0.350948,0.606158
7,woodworker.n.01,woodworker.n.01,,0.0,0.0,0.0,0.0


Unnamed: 0,Synset,Synset ID,Words in Synset,Average,Standard Deviation,Minimum,Maximum
0,nonsense.n.01,nonsense.n.01,,0.715629,0.0,0.715629,0.715629
1,narrative.n.01,narrative.n.01,,0.496841,0.088909,0.373221,0.611992
2,garbage.n.01,garbage.n.01,,0.282857,0.0,0.282857,0.282857
3,chaos.n.01,chaos.n.01,,0.0,0.0,0.0,0.0
4,plaza.n.01,plaza.n.01,,0.358596,0.074092,0.283067,0.459257
5,backbone.n.02,backbone.n.02,,0.365023,0.073182,0.247944,0.463982
6,hammer.n.08,hammer.n.08,,0.482632,0.078773,0.350948,0.606158
7,woodworker.n.01,woodworker.n.01,,0.0,0.0,0.0,0.0


Unnamed: 0,Synset,Synset ID,Words in Synset,Average,Standard Deviation,Minimum,Maximum
0,nonsense.n.01,nonsense.n.01,,0.715629,0.0,0.715629,0.715629
1,narrative.n.01,narrative.n.01,,0.496841,0.088909,0.373221,0.611992
2,garbage.n.01,garbage.n.01,,0.282857,0.0,0.282857,0.282857
3,chaos.n.01,chaos.n.01,,0.0,0.0,0.0,0.0
4,plaza.n.01,plaza.n.01,,0.358596,0.074092,0.283067,0.459257
5,backbone.n.02,backbone.n.02,,0.365023,0.073182,0.247944,0.463982
6,hammer.n.08,hammer.n.08,,0.482632,0.078773,0.350948,0.606158
7,woodworker.n.01,woodworker.n.01,,0.0,0.0,0.0,0.0


In [156]:
# Generate BERT vectors for each word in the sentences of each synset
synset_word_vectors = {}
for synset, sentences in sentences_by_synset.items():
    words = get_words_from_synset(wn.synset(synset))
    word_vectors = get_word_vectors(model_bert, tokenizer, words, sentences)
    synset_word_vectors[synset] = word_vectors

In [173]:
# Load the JSON file with the test sentences
with open('./sentences_by_synset.json', 'r') as f:
    synset_sentences = json.load(f)

# Example usage
synsets = list(synset_sentences.keys())
synset_word_vectors = get_synset_word_vectors(model_bert, tokenizer, synsets, synset_sentences)

# Display the output
for synset in synset_word_vectors:
    print(f"synset: {synset}")
    for i, vec in enumerate(synset_word_vectors[synset]):
        print(f"  V{i+1} = {vec.tolist() if len(vec) > 0 else '[]'}")

synset: narrative.n.01
  V1 = [0.38875812292099, -0.13178572058677673, -0.11787796020507812, -0.3684121370315552, 0.5088706612586975, 0.3659471869468689, -0.005975520238280296, 0.15390896797180176, -0.060689277946949005, -0.5568908452987671, 0.3363109827041626, -0.5056047439575195, 0.1714106947183609, 0.3412712812423706, 0.713886559009552, 0.763067364692688, 0.642367959022522, 0.02222200483083725, -0.290627658367157, -0.455409973859787, -0.20023810863494873, -0.3004964590072632, -0.18695124983787537, 0.4546849727630615, 0.12459041178226471, 0.45067787170410156, 0.2225061058998108, -0.340934157371521, -0.3550286889076233, 0.08937957137823105, 0.5565313100814819, -0.11073529720306396, -0.5534806251525879, -0.41657763719558716, 0.2593547999858856, -0.2384353131055832, -0.5306353569030762, 0.39153796434402466, -0.5655820369720459, -0.5833818912506104, -0.28644227981567383, -0.2368888258934021, 0.5619072914123535, -0.0061843544244766235, 0.18694612383842468, -0.597137451171875, 0.1084817349

In [153]:
# Example usage
file_path = "sentences_by_synset.json"

# Get synsets at different levels
level_4_synsets = get_synsets_at_level(w2vModel, 4, 8)  # Get 8 synsets at level 4 (less specific than level 6)
level_6_synsets = get_synsets_at_level(w2vModel, 6, 8)  # Get 8 synsets at level 6 (less specific than level 8)
level_8_synsets = get_synsets_at_level(w2vModel, 8, 8)  # Get 8 synsets at level 8 (less specific than level 10)
level_10_synsets = get_synsets_at_level(w2vModel, 10, 8)  # Get 8 synsets at level 10



# Load the sentences from the JSON file
with open("sentences_by_synset.json", "r") as f:
    sentences_by_synset = json.load(f)

# Generate BERT vectors for each word in the sentences of each synset
synset_vectors = {}
for synset, sentences in sentences_by_synset.items():
    print(f"synset: {synset}")
    # word = synset.split('.')[0]  # Extract the word from the synset ID
    
    word = get_words_from_synset(wn.synset(synset))
    print(f"word: {word}")
    vectors = getBertVector(model_bert, tokenizer, word[0], sentences)
    synset_vectors[synset] = vectors

synset: narrative.n.01
word: ['narrative', 'narration', 'story', 'tale']
synset: telegrapher.n.01
word: ['telegrapher', 'telegraphist', 'telegraph_operator']
synset: garbage.n.01
word: ['garbage', 'refuse', 'food_waste', 'scraps']
synset: chaos.n.01
word: ['chaos', 'pandemonium', 'bedlam', 'topsy-turvydom', 'topsy-turvyness']
synset: plaza.n.01
word: ['plaza', 'place', 'piazza']
synset: backbone.n.02
word: ['backbone', 'grit', 'guts', 'moxie', 'sand', 'gumption']
synset: hammer.n.08
word: ['hammer', 'pound', 'hammering', 'pounding']
synset: soil_profile.n.01
word: ['soil_profile']


In [257]:
def get_synsets_at_leve_for_testing(level, num_synsets=10):
    # Initialize a queue for BFS with the root synsets (i.e., synsets without hypernyms)
    queue = [(synset, 0) for synset in wn.all_synsets() if not synset.hypernyms()]
    synsets_at_level = []

    # Perform BFS
    while queue and len(synsets_at_level) < num_synsets:
        current_synset, current_level = queue.pop(0)
        
        # Check if we have reached the desired level
        if current_level == level:
            synsets_at_level.append(current_synset)
        elif current_level < level:
            # Enqueue children (hyponyms) of the current synset
            for hyponym in current_synset.hyponyms():
                queue.append((hyponym, current_level + 1))

    return synsets_at_level

# Example usage
level = 10
synsets = get_synsets_at_leve_for_testing(level, num_synsets=8)
for synset in synsets:
    print(f"Level {level}: {synset}")


Level 10: Synset('soil_profile.n.01')
Level 10: Synset('fulsomeness.n.02')
Level 10: Synset('sanctimoniousness.n.01')
Level 10: Synset('backbone.n.02')
Level 10: Synset('fairness.n.01')
Level 10: Synset('right.n.07')
Level 10: Synset('frugality.n.01')
Level 10: Synset('providence.n.04')


In [259]:
print(get_words_from_synset(wn.synset('right.n.07')))

['right', 'rightfulness']


In [211]:

# print(f"synset_vectors: {synset_vectors.items()}")

for synset in synset_vectors.items():
    print(f"{synset[0]}: sentences")
    word = get_words_from_synset(wn.synset("bagatelle.n.02"))
    print(f"word: {word}")

telegrapher.n.01: sentences
word: ['bagatelle', 'fluff', 'frippery', 'frivolity']
narrative.n.01: sentences
word: ['bagatelle', 'fluff', 'frippery', 'frivolity']
garbage.n.01: sentences
word: ['bagatelle', 'fluff', 'frippery', 'frivolity']
chaos.n.01: sentences
word: ['bagatelle', 'fluff', 'frippery', 'frivolity']
plaza.n.01: sentences
word: ['bagatelle', 'fluff', 'frippery', 'frivolity']
backbone.n.02: sentences
word: ['bagatelle', 'fluff', 'frippery', 'frivolity']
hammer.n.08: sentences
word: ['bagatelle', 'fluff', 'frippery', 'frivolity']


In [249]:
sentence = "The children built sandcastles on the beach during their vacation."
tokenized_sentence = tokenizer.tokenize(sentence)
print(tokenized_sentence)

['the', 'children', 'built', 'sand', '##castle', '##s', 'on', 'the', 'beach', 'during', 'their', 'vacation', '.']


In [261]:
def get_synset_level(synset):
    """
    Get the level of a synset in WordNet.

    Args:
    synset (nltk.corpus.reader.wordnet.Synset): A WordNet synset.

    Returns:
    int: The level of the synset in the WordNet hierarchy.
    """
    level = 0
    while synset.hypernyms():
        synset = synset.hypernyms()[0]
        level += 1
    return level

In [267]:
get_synset_level(wn.synset('scotch.n.03'))

WordNetError: Lemma 'scotch' with part of speech 'n' only has 2 senses