<a href="https://colab.research.google.com/github/Niharika9948/NLP/blob/main/2403A52234_Assignment_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import required libraries**

In [4]:
!pip install gensim
!pip install matplotlib

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [5]:
# gensim is used to load and work with pre-trained word embedding models
# It provides Word2Vec, GloVe, FastText implementations
import gensim

# KeyedVectors is specifically used to load pre-trained word embeddings
# without loading the full training model
from gensim.models import KeyedVectors

# numpy is used for numerical operations on vectors
# Word embeddings are stored as numerical arrays
import numpy as np

# sklearn.metrics.pairwise is used to calculate similarity between vectors
# cosine_similarity helps measure semantic similarity between words
from sklearn.metrics.pairwise import cosine_similarity

# matplotlib is used to visualize word embeddings in 2D space
import matplotlib.pyplot as plt

# **Load Pre-trained Word2Vec (Google News)**

In [6]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec model (may take time on first download)
model = api.load("word2vec-google-news-300")

# Print vocabulary size
print("Vocabulary Size:", len(model.key_to_index))

# Display vector for a sample word
word = "king"
vector = model[word]

print("\nWord:", word)
print("Vector length:", len(vector))
print("First 10 values of the vector:\n", vector[:10])

Vocabulary Size: 3000000

Word: king
Vector length: 300
First 10 values of the vector:
 [ 0.12597656  0.02978516  0.00860596  0.13964844 -0.02563477 -0.03613281
  0.11181641 -0.19824219  0.05126953  0.36328125]


# **Load Pre-trained GloVe**

In [7]:
import gensim.downloader as api

# Load GloVe embeddings (100-dimensional)
model = api.load("glove-wiki-gigaword-100")

# Print vocabulary size
print("Vocabulary Size:", len(model.key_to_index))

# Display vector for a sample word
word = "king"
vector = model[word]

print("\nWord:", word)
print("Vector length:", len(vector))
print("First 10 values of the vector:\n", vector[:10])

Vocabulary Size: 400000

Word: king
Vector length: 100
First 10 values of the vector:
 [-0.32307 -0.87616  0.21977  0.25268  0.22976  0.7388  -0.37954 -0.35307
 -0.84369 -1.1113 ]


# **Explore Word Similarity**

In [8]:
import gensim.downloader as api

# Load pre-trained GloVe model (100D)
model = api.load("glove-wiki-gigaword-100")

# Define word pairs
word_pairs = [
   ("pen", "pencil"),
("shirt", "pant"),
("bread", "butter"),
("phone", "charger"),
("table", "chair"),
("lock", "key"),
("milk", "coffee"),
("train", "track"),
("book", "page"),
("rain", "umbrella")

]

print("Word Similarity Scores:\n")

for w1, w2 in word_pairs:
    similarity = model.similarity(w1, w2)
    print(f"{w1} - {w2} : {similarity:.4f}")


Word Similarity Scores:

pen - pencil : 0.6101
shirt - pant : 0.3648
bread - butter : 0.7280
phone - charger : 0.2911
table - chair : 0.4925
lock - key : 0.4264
milk - coffee : 0.6388
train - track : 0.5889
book - page : 0.6754
rain - umbrella : 0.1173


# **Nearest Neighbor Exploration**

In [12]:
import gensim.downloader as api

# Load pre-trained GloVe embeddings (100D)
model = api.load("glove-wiki-gigaword-100")

# Choose at least 5 words
chosen_words = ["pen", "phone", "table", "milk", "rain"]

for word in chosen_words:
    print(f"\nTop similar words for '{word}':\n")

    similar_words = model.most_similar(word, topn=5)

    for similar_word, score in similar_words:
        print(f"{similar_word} : {score:.4f}")



Top similar words for 'pen':

pencil : 0.6101
ballpoint : 0.6023
pens : 0.6020
le : 0.5551
ink : 0.5227

Top similar words for 'phone':

telephone : 0.9113
cellphone : 0.8122
phones : 0.8031
mobile : 0.7307
mail : 0.7292

Top similar words for 'table':

tables : 0.8021
place : 0.6582
bottom : 0.6560
room : 0.6544
side : 0.6434

Top similar words for 'milk':

dairy : 0.7613
meat : 0.7482
sugar : 0.7346
yogurt : 0.6954
juice : 0.6947

Top similar words for 'rain':

rains : 0.8024
snow : 0.7520
winds : 0.7494
downpour : 0.7370
fog : 0.7337


# **Word Analogy Tasks**

In [13]:
import gensim.downloader as api

# Load pre-trained Word2Vec (better for analogies)
model = api.load("word2vec-google-news-300")

# Analogy 1
result1 = model.most_similar(
    positive=["milk", "diary"],
    negative=["sugar"],
    topn=5
)

# Analogy 2
result2 = model.most_similar(
    positive=["pen", "pencil"],
    negative=["ink"],
    topn=5
)

# Analogy 3
result3 = model.most_similar(
    positive=["rain", "snow"],
    negative=["fog"],
    topn=5
)

print("\nmilk - sugar + diary = ?")
print(result1)

print("\npen -ink + pencil = ?")
print(result2)

print("\nrain - fog +  =snow?")
print(result3)



milk - sugar + diary = ?
[('diaries', 0.5703131556510925), ('diary_entries', 0.46095454692840576), ('Diary', 0.457302063703537), ('Diaries', 0.41916531324386597), ('Personall_warns', 0.4183608293533325)]

pen -ink + pencil = ?
[('notepad', 0.5710489749908447), ('pens', 0.5317739844322205), ('pencils', 0.5091125965118408), ('ballpoint_pen', 0.5084912180900574), ('quill_pen', 0.47584688663482666)]

rain - fog +  =snow?
[('heavy_rain', 0.6706271767616272), ('snowfall', 0.6643233299255371), ('rains', 0.6313811540603638), ('precipitation', 0.6059595346450806), ('heavy_rainfall', 0.601680338382721)]
