In [1]:
from scipy.spatial.distance import cosine
from gensim.models import KeyedVectors
from sklearn.preprocessing import normalize

### This program creates an interactive demo where a user can:
> 1. Find cosine similarity between two words
> 2. Add two words (e.g., Woman + King = Queen)
> 3. Add and subtract words (e.g., Tokyo + England - Japan = London)

To proceed, run the cell that calls `interactive_demo()` and choose Option 1, 2, or 3.
Enter '4' to exit.

Before proceeding, check that you have downloaded a pre-trained model and saved it in your `path_model` directory.
For help on how to do this, consult the repo README.

Enjoy!

T. Mehta, 02/1/25 

In [2]:
# Directory that holds the pre-trained embedding files. The trailing separator
# is part of the string so the file name can be appended directly.
path_model = "Q:\\Oflog\\NLP\\Inputs\\Pretrained_WE_models\\"

# Load the saved Google News Word2Vec vectors from that directory.
word_vectors = KeyedVectors.load(f"{path_model}vectors_word2vec-google-news-300.kv")

In [3]:
def calculate_similarity(word1, word2):
    """Return the cosine similarity between two words.

    Args:
        word1: First word to compare.
        word2: Second word to compare.

    Returns:
        The value produced by ``word_vectors.similarity(word1, word2)``.

    Raises:
        ValueError: If either word is not in the loaded vocabulary. The
            message names the missing word.
    """
    # Check membership up front instead of formatting the caught KeyError:
    # the KeyError raised by the lookup can carry a whole sentence (gensim 4
    # uses "Key '<word>' not present"), which garbled the user-facing message.
    for word in (word1, word2):
        if word not in word_vectors:
            raise ValueError(f"Word '{word}' not found in the vocabulary.")
    return word_vectors.similarity(word1, word2)



def find_closest_word(vector, exclude_words=None, topn=100):
    """Return the nearest vocabulary word to ``vector`` that is not excluded.

    Args:
        vector: Query vector passed to ``word_vectors.most_similar`` as the
            single positive example.
        exclude_words: Optional iterable of words that must not be returned
            (typically the input words of an analogy). ``None`` means no
            exclusions.
        topn: Number of nearest neighbours to request before filtering.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        The first word among the ``topn`` neighbours that is not in
        ``exclude_words``, or ``None`` if every candidate is excluded.
    """
    excluded = set(exclude_words) if exclude_words else set()

    # Neighbours are scanned in the order most_similar returns them.
    candidates = word_vectors.most_similar(positive=[vector], topn=topn)

    return next((word for word, _ in candidates if word not in excluded), None)

    

def word_math(word1, word2, operation, word3=None):
    """Combine word vectors and return the closest vocabulary word.

    Args:
        word1: Base word.
        word2: Word whose vector is added.
        operation: ``"add"`` computes ``word1 + word2``; ``"subtract"``
            computes ``word1 - word3 + word2`` and requires ``word3``.
        word3: Word whose vector is subtracted. Only used by ``"subtract"``.

    Returns:
        The result of ``find_closest_word`` for the normalized result vector,
        with the supplied input words excluded from the candidates.

    Raises:
        ValueError: If ``operation`` is not ``"add"`` or ``"subtract"`` (or
            ``"subtract"`` is requested without ``word3``), or if a required
            word is not in the loaded vocabulary.
    """
    # Validate the request before touching the vocabulary.
    if operation == "add":
        required_words = (word1, word2)
    elif operation == "subtract" and word3:
        required_words = (word1, word2, word3)
    else:
        raise ValueError("Invalid operation. Use 'add' or 'subtract' with appropriate arguments.")

    # Check membership explicitly rather than formatting a caught KeyError:
    # the KeyError raised by the lookup can carry a whole sentence (gensim 4
    # uses "Key '<word>' not present"), which garbled the user-facing message.
    for word in required_words:
        if word not in word_vectors:
            raise ValueError(f"Word '{word}' not found in the vocabulary.")

    vec1 = word_vectors[word1]
    vec2 = word_vectors[word2]
    if operation == "add":
        result_vector = vec1 + vec2
    else:
        result_vector = vec1 - word_vectors[word3] + vec2

    # normalize expects a 2-D array, so reshape to one row and take it back out.
    result_vector = normalize(result_vector.reshape(1, -1))[0]

    # Exclude every supplied input word (skipping an absent word3) so the
    # answer is not simply one of the inputs.
    excluded = [word for word in (word1, word2, word3) if word is not None]
    return find_closest_word(result_vector, exclude_words=excluded)



In [4]:
def interactive_demo():
    """Run the text-menu demo until the user chooses to exit.

    Each round prints the menu, reads a choice with ``input()`` and then:

    * ``1`` - asks for two words and prints their cosine similarity.
    * ``2`` - asks for two words and prints the word closest to their sum.
    * ``3`` - asks for three words and prints the word closest to
      ``base + add - subtract``.
    * ``4`` - prints a goodbye message and returns.

    Any other input prints a hint and asks again. Errors raised while
    computing a result are printed and the menu is shown again, so a typo
    does not end the session.
    """
    menu = (
        "\nOptions:\n"
        "  1. Cosine similarity between two words\n"
        "  2. Add two words\n"
        "  3. Add and subtract words\n"
        "  4. Exit"
    )

    while True:
        print(menu)
        choice = input("Enter your choice (1-4): ").strip()

        if choice == "1":
            word1 = input("Enter the first word: ").strip()
            word2 = input("Enter the second word: ").strip()
            try:
                similarity = calculate_similarity(word1, word2)
                print(f"Cosine Similarity between '{word1}' and '{word2}': {similarity:.4f}")
            except Exception as e:
                print(f"Error: {e}")

        elif choice == "2":
            word1 = input("Enter the first word: ").strip()
            word2 = input("Enter the second word: ").strip()
            try:
                result = word_math(word1, word2, "add")
                print(f"Result of '{word1} + {word2}': {result}")
            except Exception as e:
                print(f"Error: {e}")

        elif choice == "3":
            word1 = input("Enter the base word: ").strip()
            word2 = input("Enter the word to add: ").strip()
            word3 = input("Enter the word to subtract: ").strip()
            try:
                result = word_math(word1, word2, "subtract", word3)
                print(f"Result of '{word1} + {word2} - {word3}': {result}")
            except Exception as e:
                print(f"Error: {e}")

        elif choice == "4":
            # The only way out of the loop.
            print("Exiting demo. Goodbye!")
            break

        else:
            # Fall through to the next round so the user can try again.
            print("Invalid choice. Please enter a number between 1 and 4.")


In [11]:
# Start the interactive demo; it reads the menu choice and the words via input().
interactive_demo()

Result of 'London + India - England': Mumbai
