In [None]:
# Import necessary libraries
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import string

# Download the NLTK resources used below.
# 'punkt' serves older NLTK releases; newer releases have word_tokenize load
# 'punkt_tab' instead, so both are fetched to keep this working across versions.
# 'stopwords' backs stopwords.words('english').
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Toy corpus: five short sentences, one string per document.
# Each entry is preprocessed and fed to Word2Vec as a single sentence.
corpus = [
    "Natural language processing is a field of artificial intelligence.",
    "It enables computers to understand human language.",
    "Word embedding is a representation of words in a dense vector space.",
    "Gensim is a library for training word embeddings in Python.",
    "Machine learning and deep learning techniques are widely used in NLP.",
]

# Preprocess the text: tokenize, then drop non-alphabetic tokens and stopwords
def preprocess_text(text):
    """Return the lowercase content-word tokens of *text*.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token
    is kept only if it is purely alphabetic (``str.isalpha``) and is not in
    NLTK's English stopword list. The ``isalpha`` filter removes punctuation
    tokens, and also any token containing digits or other non-letters.

    Args:
        text: The raw sentence or document to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Build the stopword set once per call: calling stopwords.words() inside
    # the comprehension would rebuild the list and scan it linearly for
    # every single token.
    stop_words = set(stopwords.words('english'))
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

# Apply preprocessing to the corpus: one token list per sentence
processed_corpus = [preprocess_text(sentence) for sentence in corpus]

# Train a Word2Vec model.
# sg=1 selects Skip-gram. min_count=1 is meant to keep words that occur only
# once, which matters for a corpus this small (see the gensim Word2Vec docs).
model = Word2Vec(sentences=processed_corpus, vector_size=100, window=2, min_count=1, sg=1)

# Save the model for future use
model.save("word2vec_model.model")

# Test the model: show the embedding of a word and its nearest neighbours
word = "vector"
if word in model.wv:
    print(f"Embedding for '{word}':\n{model.wv[word]}")

    # Query neighbours only for an in-vocabulary word; most_similar raises
    # KeyError when the word is unknown, so it must stay inside this guard.
    similar_words = model.wv.most_similar(word, topn=2)
    print(f"Words similar to '{word}':")
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity:.4f}")
else:
    # Keep the name defined so later code can rely on it either way.
    similar_words = []
    print(f"'{word}' not found in vocabulary.")


Embedding for 'vector':
[ 0.00180023  0.00704609  0.0029447  -0.00698085  0.00771268 -0.00598893
  0.00899771  0.0029592  -0.00401529 -0.00468899 -0.00441672 -0.00614646
  0.00937874 -0.0026496   0.00777244 -0.00968034  0.00210879 -0.00123361
  0.00754423 -0.0090546   0.00743756 -0.0051058  -0.00601377 -0.00564916
 -0.00337917 -0.0034111  -0.00319566 -0.0074922   0.00070878 -0.00057607
 -0.001684    0.00375713 -0.00762019 -0.00322142  0.00515534  0.00854386
 -0.00980994  0.00719534  0.00530949 -0.0038797   0.00857616 -0.00922199
  0.00724868  0.00536383  0.00129359 -0.00519975 -0.00417865 -0.00335678
  0.00160829  0.0015867   0.00738824  0.00997759  0.00886734 -0.00400645
  0.00964539 -0.00062954  0.00486543  0.00254902 -0.00062981  0.00366745
 -0.00531941 -0.00575668 -0.00760464  0.00190643  0.00652587  0.00088213
  0.00125695  0.0031716   0.00813467 -0.00770006  0.00226075 -0.00747411
  0.00370981  0.00951055  0.00752026  0.00642603  0.00801478  0.00655115
  0.00685668  0.00868209 -0

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
