In [None]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import multiprocessing

In [None]:
# Fetch the Punkt tokenizer resources that word_tokenize needs.
# nltk.download returns False when a package cannot be fetched, so the
# success message is only printed when every download actually succeeded.
print("⏳ Downloading NLTK data...")
failed_packages = [
    package for package in ('punkt', 'punkt_tab') if not nltk.download(package)
]
if failed_packages:
    print(f"⚠️ Could not download NLTK data: {', '.join(failed_packages)}")
else:
    print("✅ NLTK data downloaded successfully!")

⏳ Downloading NLTK data...
✅ NLTK data downloaded successfully!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Toy corpus: four short English sentences about word embeddings.
# Each list element is one raw (untokenized) sentence.
corpus = [
    "Word embeddings are a type of word representation.",
    "Word2Vec is a popular word embedding model.",
    "GloVe is another word embedding model.",
    "Both Word2Vec and GloVe capture semantic relationships.",
]

# Confirm the definition and preview the first two sentences.
print("📜 Corpus defined successfully!")
print("Sample sentences:", corpus[0:2])

📜 Corpus defined successfully!
Sample sentences: ['Word embeddings are a type of word representation.', 'Word2Vec is a popular word embedding model.']


In [None]:
# Lower-case and tokenize every sentence of the corpus.
# The try body contains only the tokenizer call: IndexError is a subclass of
# LookupError, so indexing tokenized_corpus inside the try would be mistaken
# for a missing-punkt error and wrongly trigger the fallback.
try:
    tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]
except LookupError:
    # word_tokenize raises LookupError when the punkt resources are missing;
    # fall back to a regex tokenizer (note: it drops punctuation tokens).
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_corpus = [tokenizer.tokenize(sentence.lower()) for sentence in corpus]
    print("⚠️ Used fallback tokenizer (NLTK punkt failed).")
else:
    print("🔡 Tokenization successful!")

# Preview the first tokenized sentence, guarding against an empty corpus.
if tokenized_corpus:
    print("Sample tokenized sentence:", tokenized_corpus[0])
else:
    print("Corpus is empty - nothing was tokenized.")

🔡 Tokenization successful!
Sample tokenized sentence: ['word', 'embeddings', 'are', 'a', 'type', 'of', 'word', 'representation', '.']


In [None]:
# Word2Vec hyperparameters used by the rest of the notebook.
vector_size = 100                        # dimensionality of each word vector
window_size = 5                          # context window size
min_count = 1                            # minimum word count threshold
epochs = 100                             # number of training epochs
workers = multiprocessing.cpu_count()    # one worker per available CPU

# Echo the configuration, one setting per line.
print("⚙️ Word2Vec parameters set:")
print(
    f"- Vector size: {vector_size}",
    f"- Window size: {window_size}",
    f"- Min word count: {min_count}",
    f"- Workers: {workers}",
    f"- Epochs: {epochs}",
    sep="\n",
)

⚙️ Word2Vec parameters set:
- Vector size: 100
- Window size: 5
- Min word count: 1
- Workers: 2
- Epochs: 100


In [None]:
# Train the Word2Vec model on the tokenized corpus with the hyperparameters
# defined above. The model was previously never created in this notebook, so
# this cell failed with NameError after Restart Kernel -> Run All.
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    workers=workers,
    epochs=epochs,
)

# Inspect the learned vocabulary.
vocab = list(word2vec_model.wv.key_to_index.keys())
print(f" Vocabulary size: {len(vocab)} words")
print("Sample words in vocab:", vocab[:5])

# Nearest neighbours of 'word' (only if the token is in the vocabulary).
if 'word' in word2vec_model.wv:
    print("\n Most similar words to 'word':")
    print(word2vec_model.wv.most_similar("word", topn=3))
else:
    print("'word' not in vocabulary.")

# Peek at the first five components of the vector for 'embedding'.
if 'embedding' in word2vec_model.wv:
    print("\n Vector for 'embedding' (first 5 dims):")
    print(word2vec_model.wv["embedding"][:5])
else:
    print("'embedding' not in vocabulary.")

 Vocabulary size: 42 words
Sample words in vocab: ['.', 'word', 'a', 'word2vec', 'by']

 Most similar words to 'word':
[('.', 0.7605836987495422), ('a', 0.7580101490020752), ('representation', 0.7352306246757507)]

 Vector for 'embedding' (first 5 dims):
[ 0.00836866  0.01189717  0.00362558  0.00743285 -0.00138274]


## **GLOVE MODEL**

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-05-08 05:15:31--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-08 05:15:31--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-08 05:15:31--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import numpy as np

In [None]:
def load_glove_embeddings(path):
    """Load word vectors from a GloVe-style text file.

    Each non-blank line is split on whitespace; the first field is taken as
    the word and the remaining fields as the components of its vector.
    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of raising IndexError.

    Args:
        path: Path of the text file to read; it is opened as UTF-8.

    Returns:
        dict mapping each word (str) to a 1-D float32 numpy array. If a word
        occurs on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to float32.
    """
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line: no word and no vector to record.
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

# File holding the GloVe vectors to load, relative to the working directory.
glove_path = 'glove.6B.100d.txt'
# Read the whole file into an in-memory word -> vector dictionary.
glove_embeddings = load_glove_embeddings(path=glove_path)

In [None]:
# Basic sanity checks on the loaded GloVe vectors.
print(f"Loaded {len(glove_embeddings)} word vectors")
print("Vector for 'king':", glove_embeddings['king'][:5])

# Rank candidate words by cosine similarity to 'paris'. A raw dot product
# grows with vector length, so it is not a fair similarity score; dividing by
# both norms keeps the score in [-1, 1] and matches the metric used by
# Word2Vec's most_similar.
paris_vector = glove_embeddings['paris']
paris_norm = np.linalg.norm(paris_vector)
paris_similarities = []
for word in ['france', 'london', 'berlin']:
    candidate_vector = glove_embeddings[word]
    cosine = np.dot(paris_vector, candidate_vector) / (
        paris_norm * np.linalg.norm(candidate_vector)
    )
    # float() turns the numpy scalar into a plain Python float for printing.
    paris_similarities.append((word, float(cosine)))

print("Most similar to 'paris':", sorted(paris_similarities, key=lambda pair: -pair[1]))

Loaded 400000 word vectors
Vector for 'king': [-0.32307 -0.87616  0.21977  0.25268  0.22976]
Most similar to 'paris': [('france', 31.536636), ('london', 29.39053), ('berlin', 23.307777)]
