In [None]:
import nltk
from nltk.corpus import stopwords, gutenberg
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import re
import pandas as pd


In [None]:
# Fetch every NLTK resource this cell relies on. 'gutenberg' holds the corpus
# itself; 'punkt_tab' is the sentence-tokenizer data looked up by newer NLTK
# releases, while older releases use 'punkt' (requesting both is harmless).
for resource in ('gutenberg', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

raw_text = gutenberg.raw('austen-emma.txt')

# Split into sentences BEFORE removing punctuation: the sentence tokenizer
# needs '.', '!' and '?' to find boundaries. Stripping punctuation first would
# collapse the novel into one enormous "sentence", and gensim documents that
# its optimized training routines truncate texts longer than 10000 words.
sentences = sent_tokenize(raw_text)

stop_words = set(stopwords.words('english'))

tokenized_sentences = []
for sent in sentences:
    # Replace (rather than delete) every non-letter with a space so that words
    # joined by dashes or apostrophes ("emma--harriet", "don't") are separated
    # instead of fused into a single bogus token. Leftover fragments such as a
    # stray "s" or "t" are expected to be caught by the stop-word filter below.
    cleaned = re.sub(r'[^a-z\s]', ' ', sent.lower())
    tokens = [word for word in word_tokenize(cleaned) if word not in stop_words]
    # Skip sentences that contained nothing but stop words / punctuation.
    if tokens:
        tokenized_sentences.append(tokens)


In [None]:
# Train word embeddings on the tokenized sentences prepared above.
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size around the target word
    min_count=2,      # minimum corpus frequency for a word to be kept
    workers=4,        # number of training threads
    sg=0,             # 0 = CBOW; set to 1 for Skip-gram
)


In [None]:
# Query the trained vectors through a single handle.
keyed_vectors = model.wv

# Words whose vectors lie closest to 'emma'.
print(keyed_vectors.most_similar('emma'))

# Similarity score between the vectors for 'emma' and 'harriet'.
print(keyed_vectors.similarity('emma', 'harriet'))


In [None]:
# Take the first 100 vocabulary entries and fetch their vectors as one 2-D
# array (indexing KeyedVectors with a list of keys stacks the vectors row-wise),
# which is the input shape TSNE.fit_transform expects.
words = list(model.wv.index_to_key)[:100]
word_vectors = model.wv[words]

# Project the 100-dimensional embeddings down to 2-D; random_state pins the
# layout so the figure is the same on every run.
tsne = TSNE(n_components=2, random_state=0, perplexity=15)
reduced_vectors = tsne.fit_transform(word_vectors)

fig, ax = plt.subplots(figsize=(12, 8))
# One vectorized scatter call instead of one call per word: a single artist
# with a uniform colour, rather than 100 artists with arbitrary cycling colours.
ax.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1])
for word, (x_coord, y_coord) in zip(words, reduced_vectors):
    ax.annotate(word, (x_coord, y_coord))
ax.set_title('t-SNE Visualization of Word Embeddings')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
plt.show()
