<a href="https://colab.research.google.com/github/Vengalagagan/NLP/blob/main/2403A52222_NLP_Assignmnet_09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install Gensim Import Libraries**

In [10]:
!pip install gensim




In [11]:
# gensim is used to load and work with pre-trained word embedding models
# It provides Word2Vec, GloVe, FastText implementations
import gensim

# KeyedVectors is specifically used to load pre-trained word embeddings
# without loading the full training model
from gensim.models import KeyedVectors

# numpy is used for numerical operations on vectors
# Word embeddings are stored as numerical arrays
import numpy as np

# sklearn.metrics.pairwise is used to calculate similarity between vectors
# cosine_similarity helps measure semantic similarity between words
from sklearn.metrics.pairwise import cosine_similarity

# matplotlib is used to visualize word embeddings in 2D space
import matplotlib.pyplot as plt

# **Load Word2Vec and find numerical vector representation of words**

In [3]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec model (may take time on first download)
model = api.load("word2vec-google-news-300")

# Print vocabulary size
print("Vocabulary Size:", len(model.key_to_index))

# Display vector for a sample word
word = "king"
vector = model[word]

print("\nWord:", word)
print("Vector length:", len(vector))
print("First 10 values of the vector:\n", vector[:10])

Vocabulary Size: 3000000

Word: king
Vector length: 300
First 10 values of the vector:
 [ 0.12597656  0.02978516  0.00860596  0.13964844 -0.02563477 -0.03613281
  0.11181641 -0.19824219  0.05126953  0.36328125]


# **Load Glove and find numerical vector representation of words**

In [4]:
import gensim.downloader as api

# Load GloVe embeddings (100-dimensional)
model = api.load("glove-wiki-gigaword-100")

# Print vocabulary size
print("Vocabulary Size:", len(model.key_to_index))

# Display vector for a sample word
word = "king"
vector = model[word]

print("\nWord:", word)
print("Vector length:", len(vector))
print("First 10 values of the vector:\n", vector[:10])

Vocabulary Size: 400000

Word: king
Vector length: 100
First 10 values of the vector:
 [-0.32307 -0.87616  0.21977  0.25268  0.22976  0.7388  -0.37954 -0.35307
 -0.84369 -1.1113 ]


# **Word Similarity with Word2Vec**

In [5]:
# Define word pairs for similarity calculation
word_pairs = [
    ("woman", "man"),
    ("king", "queen"),
    ("apple", "fruit"),
    ("car", "automobile"),
    ("doctor", "hospital"),
    ("dog", "cat"),
    ("happy", "sad"),
    ("big", "large"),
    ("run", "walk"),
    ("house", "home")
]

print("Word Similarity Scores (Word2Vec Google News):")
print("---------------------------------------------------")

for w1, w2 in word_pairs:
    try:
        similarity = model.similarity(w1, w2)
        print(f"{w1} - {w2} : {similarity:.4f}")
    except KeyError as e:
        print(f"Could not find one or both words in the vocabulary for '{w1}' and '{w2}': {e}")

Word Similarity Scores (Word2Vec Google News):
---------------------------------------------------
woman - man : 0.8323
king - queen : 0.7508
apple - fruit : 0.5359
car - automobile : 0.6832
doctor - hospital : 0.6901
dog - cat : 0.8798
happy - sad : 0.6801
big - large : 0.7082
run - walk : 0.6683
house - home : 0.6720


### **Word Similarity with GloVe**



In [6]:
import gensim.downloader as api

# Load pre-trained GloVe model (100D)
# If 'model' from the previous GloVe loading step is not available,
# or if the kernel restarts, uncomment the line below:
model = api.load("glove-wiki-gigaword-100")

# Define word pairs for similarity calculation
word_pairs = [
    ("woman", "man"),
    ("king", "queen"),
    ("apple", "fruit"),
    ("car", "automobile"),
    ("doctor", "hospital"),
    ("dog", "cat"),
    ("happy", "sad"),
    ("big", "large"),
    ("run", "walk"),
    ("house", "home")
]

print("Word Similarity Scores (GloVe Wiki Gigaword):")
print("---------------------------------------------------")

for w1, w2 in word_pairs:
    try:
        similarity = model.similarity(w1, w2)
        print(f"{w1} - {w2} : {similarity:.4f}")
    except KeyError as e:
        print(f"Could not find one or both words in the vocabulary for '{w1}' and '{w2}': {e}")

Word Similarity Scores (GloVe Wiki Gigaword):
---------------------------------------------------
woman - man : 0.8323
king - queen : 0.7508
apple - fruit : 0.5359
car - automobile : 0.6832
doctor - hospital : 0.6901
dog - cat : 0.8798
happy - sad : 0.6801
big - large : 0.7082
run - walk : 0.6683
house - home : 0.6720


# **Load Word2Vec and complete Neighbour words**

In [7]:
# Define a list of words for which to find nearest neighbors
chosen_words = ["king", "university", "doctor", "car", "music", "java", "python"]

print("Nearest Neighbor Exploration (Word2Vec Google News):\n")

for word in chosen_words:
    print(f"\nTop similar words for '{word}':\n")
    try:
        # Find the top 5 most similar words for the current word
        similar_words = model.most_similar(word, topn=5)
        for similar_word, score in similar_words:
            print(f"{similar_word} : {score:.4f}")
    except KeyError:
        print(f"Word '{word}' not found in vocabulary.")

Nearest Neighbor Exploration (Word2Vec Google News):


Top similar words for 'king':

prince : 0.7682
queen : 0.7508
son : 0.7021
brother : 0.6986
monarch : 0.6978

Top similar words for 'university':

college : 0.8294
harvard : 0.8156
yale : 0.8114
professor : 0.8104
graduate : 0.7993

Top similar words for 'doctor':

physician : 0.7673
nurse : 0.7522
dr. : 0.7175
doctors : 0.7081
patient : 0.7074

Top similar words for 'car':

vehicle : 0.8631
truck : 0.8598
cars : 0.8372
driver : 0.8186
driving : 0.7813

Top similar words for 'music':

musical : 0.8128
songs : 0.7978
dance : 0.7897
pop : 0.7863
recording : 0.7651

Top similar words for 'java':

sumatra : 0.6642
surabaya : 0.6600
semarang : 0.6302
sulawesi : 0.6134
yogyakarta : 0.6033

Top similar words for 'python':

monty : 0.6886
php : 0.5865
perl : 0.5784
cleese : 0.5447
flipper : 0.5113


# **Load Glove and complete Neighbour words**


In [8]:
import gensim.downloader as api

# Load pre-trained GloVe model (100D)
# If 'model' from previous steps is not available or kernel restarts, uncomment below:
model = api.load("glove-wiki-gigaword-100")

# Define a list of words for which to find nearest neighbors
chosen_words = ["king", "university", "doctor", "car", "music", "java", "python"]

print("Nearest Neighbor Exploration (GloVe Wiki Gigaword):\n")

for word in chosen_words:
    print(f"\nTop similar words for '{word}':\n")
    try:
        # Find the top 5 most similar words for the current word
        similar_words = model.most_similar(word, topn=5)
        for similar_word, score in similar_words:
            print(f"{similar_word} : {score:.4f}")
    except KeyError:
        print(f"Word '{word}' not found in vocabulary.")

Nearest Neighbor Exploration (GloVe Wiki Gigaword):


Top similar words for 'king':

prince : 0.7682
queen : 0.7508
son : 0.7021
brother : 0.6986
monarch : 0.6978

Top similar words for 'university':

college : 0.8294
harvard : 0.8156
yale : 0.8114
professor : 0.8104
graduate : 0.7993

Top similar words for 'doctor':

physician : 0.7673
nurse : 0.7522
dr. : 0.7175
doctors : 0.7081
patient : 0.7074

Top similar words for 'car':

vehicle : 0.8631
truck : 0.8598
cars : 0.8372
driver : 0.8186
driving : 0.7813

Top similar words for 'music':

musical : 0.8128
songs : 0.7978
dance : 0.7897
pop : 0.7863
recording : 0.7651

Top similar words for 'java':

sumatra : 0.6642
surabaya : 0.6600
semarang : 0.6302
sulawesi : 0.6134
yogyakarta : 0.6033

Top similar words for 'python':

monty : 0.6886
php : 0.5865
perl : 0.5784
cleese : 0.5447
flipper : 0.5113


# **Word Analogy with Word2Vec**

In [12]:
import gensim.downloader as api

# Ensure the correct Word2Vec model is loaded, if not already in the kernel state.
# If `model` from the previous Word2Vec loading step is not available,
# or if the kernel restarts, uncomment the line below:
model = api.load("word2vec-google-news-300")

# Analogy 1: King - man + woman = queen
result1 = model.most_similar(
    positive=["king", "woman"],
    negative=["man"],
    topn=5
)

# Analogy 2: Paris - France + India = ? (Capital analogy)
result2 = model.most_similar(
    positive=["paris", "india"],
    negative=["france"],
    topn=5
)

# Analogy 3: Doctor - hospital + school = ? (Profession/Location analogy)
result3 = model.most_similar(
    positive=["doctor", "school"],
    negative=["hospital"],
    topn=5
)

print("\n--- Word Analogies with Word2Vec ---")

print("\nking - man + woman = ?")
print(result1)

print("\nparis - france + india = ?")
print(result2)

print("\ndoctor - hospital + school = ?")
print(result3)


--- Word Analogies with Word2Vec ---

king - man + woman = ?
[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581)]

paris - france + india = ?
[('chennai', 0.5442505478858948), ('delhi', 0.5149926543235779), ('mumbai', 0.5024341344833374), ('hyderabad', 0.49932485818862915), ('gujarat', 0.48732805252075195)]

doctor - hospital + school = ?
[('guidance_counselor', 0.5969594717025757), ('teacher', 0.5755364298820496), ('eighth_grade', 0.5226408243179321), ('schoolers', 0.5168290138244629), ('elementary', 0.5085657238960266)]


# **Load Glove and complete word analogy**

In [13]:
import gensim.downloader as api

# Ensure the correct GloVe model is loaded, if not already in the kernel state.
# If `model` from the previous GloVe loading step is not available,
# or if the kernel restarts, uncomment the line below:
model = api.load("glove-wiki-gigaword-100")

# Analogy 1: King - man + woman = queen
result1 = model.most_similar(
    positive=["king", "woman"],
    negative=["man"],
    topn=5
)

# Analogy 2: Paris - France + India = ? (Capital analogy)
result2 = model.most_similar(
    positive=["paris", "india"],
    negative=["france"],
    topn=5
)

# Analogy 3: Doctor - hospital + school = ? (Profession/Location analogy)
result3 = model.most_similar(
    positive=["doctor", "school"],
    negative=["hospital"],
    topn=5
)

print("\n--- Word Analogies with GloVe ---")

print("\nking - man + woman = ?")
print(result1)

print("\nparis - france + india = ?")
print(result2)

print("\ndoctor - hospital + school = ?")
print(result3)


--- Word Analogies with GloVe ---

king - man + woman = ?
[('queen', 0.7698540687561035), ('monarch', 0.6843381524085999), ('throne', 0.6755736470222473), ('daughter', 0.6594556570053101), ('princess', 0.6520534157752991)]

paris - france + india = ?
[('delhi', 0.8654932975769043), ('mumbai', 0.7718895077705383), ('bombay', 0.7222235798835754), ('dhaka', 0.6891742944717407), ('calcutta', 0.6761991381645203)]

doctor - hospital + school = ?
[('teacher', 0.7837691307067871), ('taught', 0.7343935370445251), ('graduate', 0.7131244540214539), ('student', 0.6969646215438843), ('college', 0.6951188445091248)]
