
# Word2Vec Word Embedding Experiment

This experiment demonstrates how to generate word embeddings using the Word2Vec model for text similarity analysis. We'll use a sample corpus, preprocess the text, train the model, and visualize the resulting embeddings using PCA.


In [None]:

!pip install gensim nltk matplotlib scikit-learn
import nltk
nltk.download('punkt')
nltk.download('stopwords')


In [None]:

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Toy corpus: five one-sentence documents.
# Adjacent string literals are joined by the parser, so each parenthesised
# group below is a single sentence.
corpus = [
    (
        "Machine learning is a method of data analysis "
        "that automates analytical model building."
    ),
    (
        "Artificial intelligence is the simulation "
        "of human intelligence in machines."
    ),
    (
        "Natural language processing enables computers "
        "to understand human language."
    ),
    (
        "Word embeddings are a type of word representation "
        "that allows words with similar meaning "
        "to have a similar representation."
    ),
    (
        "Word2Vec is one of the most popular techniques "
        "to learn word embeddings using shallow neural networks."
    ),
]

# English stop words held in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# Individual punctuation characters, used to recognise tokens that consist
# of nothing but punctuation.
_PUNCTUATION = frozenset(string.punctuation)


def preprocess(text):
    """Lowercase and tokenize *text*, dropping stop words and punctuation.

    Args:
        text: A raw sentence.

    Returns:
        The remaining lowercase tokens, in their original order.
    """
    tokens = word_tokenize(text.lower())
    # `t in string.punctuation` would be a substring test against one long
    # string, so multi-character punctuation tokens (e.g. "``", "''", "--",
    # "...") would pass it. Checking every character rejects them as well.
    return [
        t for t in tokens
        if t not in stop_words and not all(ch in _PUNCTUATION for ch in t)
    ]

# Run every sentence through preprocess(): one token list per sentence.
tokenized_corpus = list(map(preprocess, corpus))
tokenized_corpus  # bare final expression so the notebook displays the value


In [None]:

from gensim.models import Word2Vec

# Fixed seed plus a single worker thread so repeated runs train the same
# way; gensim documents that multi-threaded training introduces ordering
# jitter. (Full determinism across interpreter launches additionally needs
# PYTHONHASHSEED to be set.) Extra threads bring no benefit on a corpus of a
# few sentences.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,  # keep words that occur only once
    workers=1,
    seed=42,
)


In [None]:

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Vocabulary in the model's index order, with the embedding of each word.
words = list(model.wv.index_to_key)
word_vectors = [model.wv[token] for token in words]

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
result = pca.fit_transform(word_vectors)

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(result[:, 0], result[:, 1])

# Label every point with the word it represents.
for token, (x, y) in zip(words, result):
    ax.annotate(token, xy=(x, y))

ax.set_title('Word2Vec Word Embeddings Visualization (PCA)')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.grid(True)
plt.show()
