# Embeddings & Visualization (Word2Vec, Dendrogram, t‑SNE/UMAP)

Make word meanings visible with embeddings and quick visualizations. Windows‑friendly downloads included.

> Beginner quick start

- Windows: use the Python download cell below (skip the shell one).
- If tokenization errors appear, make sure the NLTK tokenizer data (`punkt`, `punkt_tab`) is downloaded (see the "Ensure NLTK tokenizers" cell).
- Start with the small Word2Vec subset to keep runs fast.

In [None]:
import os
import re
import time
import zipfile
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from tqdm import tqdm
from gensim.models import Word2Vec
from scipy.cluster.hierarchy import dendrogram, linkage
from adjustText import adjust_text
from sklearn.manifold import TSNE
from umap import UMAP

## Get IMDB dataset

In [None]:
# Non-Windows (optional): shell-based download/extract
# !wget -O IMDB.zip "https://github.com/SalvatoreRa/tutorial/blob/main/datasets/IMDB.zip?raw=true"
# !unzip IMDB.zip
# df = pd.read_csv('IMDB Dataset.csv')

In [None]:
def ensure_imdb_csv(csv_name='IMDB Dataset.csv', url='https://raw.githubusercontent.com/SalvatoreRa/tutorial/main/datasets/IMDB.zip'):
    """Return the path of the IMDB CSV, downloading and extracting it if absent.

    The archive is saved as ``IMDB.zip`` in the current working directory and
    extracted there, so a relative ``csv_name`` is resolved against that
    directory.

    Args:
        csv_name: Path of the CSV file expected to exist after extraction.
        url: Location of the zip archive that contains the CSV.

    Returns:
        ``csv_name`` unchanged, once the file exists on disk.

    Raises:
        FileNotFoundError: If the archive was extracted but ``csv_name`` is
            still missing (for example, the archive has a different layout).
    """
    if os.path.exists(csv_name):
        return csv_name
    zip_path = 'IMDB.zip'
    print(f'Downloading {url} -> {zip_path} ...')
    try:
        urllib.request.urlretrieve(url, zip_path)
        print(f'Extracting {zip_path} ...')
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall('.')
    finally:
        # Best-effort cleanup: also runs when the download or extraction
        # fails, so a partial or corrupt archive is not left behind.
        try:
            os.remove(zip_path)
        except OSError:
            pass
    if not os.path.exists(csv_name):
        raise FileNotFoundError(
            f"Expected '{csv_name}' after extracting {url}, but it was not found."
        )
    return csv_name


# Make sure the CSV is on disk (ensure_imdb_csv returns early when the file
# already exists), then load it into a DataFrame.
csv_path = ensure_imdb_csv()
df = pd.read_csv(csv_path)
# Last expression of the cell: previews the first rows of the DataFrame.
df.head()

## Ensure NLTK tokenizers

In [None]:
# Make sure the tokenizer data used by nltk.word_tokenize is available,
# downloading whichever resource is missing.
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
        continue  # already installed, nothing to do
    except LookupError:
        pass

    # nltk.download() normally reports failure by returning False instead of
    # raising, so check the return value as well as catching exceptions.
    try:
        downloaded = nltk.download(resource)
    except Exception:  # best-effort: any failure falls back to a local dir
        downloaded = False

    if not downloaded:
        # Fall back to a project-local data directory and register it on the
        # NLTK search path so later lookups can find the resource.
        local_dir = os.path.join(os.getcwd(), 'nltk_data')
        os.makedirs(local_dir, exist_ok=True)
        if local_dir not in nltk.data.path:
            nltk.data.path.append(local_dir)
        nltk.download(resource, download_dir=local_dir)

## Preprocess + tokenize

In [None]:
def preprocessing_reviews(reviews):
    """Clean raw review strings into lowercase, space-separated words.

    For each review: HTML-style tags are removed, every character that is not
    an ASCII letter or a space is replaced by a space, the text is split on
    whitespace, single-character words are dropped, and the remaining words
    are lowercased and re-joined with single spaces.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A list with one cleaned string per input review, in input order.
    """
    # Compile once instead of re-resolving the patterns for every review.
    tag_pattern = re.compile(r'<[^>]+>')
    non_letter_pattern = re.compile(r'[^a-zA-Z ]')

    processed = []
    for review in tqdm(reviews):
        # Replace tags with a space rather than '' so that words separated
        # only by a tag ("good<br />movie") are not fused into one token.
        review = tag_pattern.sub(' ', review)
        review = non_letter_pattern.sub(' ', review)
        words = review.split()
        processed.append(' '.join(w.lower() for w in words if len(w) > 1))
    return processed


# Clean every raw review, then split each cleaned string into tokens with NLTK.
df['reviews_processed'] = preprocessing_reviews(df['review'])
df['tokens'] = df['reviews_processed'].apply(nltk.word_tokenize)
# Last expression of the cell: previews the DataFrame with the two new columns.
df.head()

## Train Word2Vec (quick subset)
- Use a smaller slice to keep runtime short.

In [None]:
# Record the wall-clock start so the elapsed training time can be reported.
start_time = time.time()
# Train only on the first 5000 tokenized reviews to keep the run short.
tokens_sample = df['tokens'].head(5000).tolist()
# sg=1 selects the skip-gram training algorithm; vectors have 100 dimensions,
# the context window is 5 words, and training uses 4 worker threads.
model = Word2Vec(sentences=tokens_sample, sg=1,
                 vector_size=100, window=5, workers=4)
print(f'Time needed on subset: {(time.time()-start_time)/60:.2f} mins')

## Dendrogram (hierarchical clustering of a few words)

In [None]:
# Cluster a handful of hand-picked words by their embeddings and draw the
# resulting hierarchy as a dendrogram.
all_words = list(model.wv.index_to_key)
highlight_words = ['berlin', 'paris', 'london', 'rome', 'italy',
                   'france', 'germany', 'england', 'movie', 'production', 'good', 'bad']
# Keep only words the model has a vector for. key_to_index is a dict, so each
# membership test is a hash lookup rather than a scan of the vocabulary list.
hw = [w for w in (h.lower() for h in highlight_words)
      if w in model.wv.key_to_index]

if len(hw) < 2:
    # linkage() needs at least two observations; on a small training subset
    # rare words can be missing from the vocabulary.
    print(f'Need at least 2 highlight words in the vocabulary to cluster; found {hw}.')
else:
    vecs = np.array([model.wv[w] for w in hw])
    linked = linkage(vecs, 'ward')
    plt.figure(figsize=(6, 4))
    dendrogram(linked, orientation='top', labels=hw,
               distance_sort='descending', show_leaf_counts=True)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Words')
    plt.ylabel('Euclidean distances')
    plt.xticks(rotation=45)
    plt.show()

## Optional: t‑SNE (slow on many points) — cap to top‑N words

In [None]:
# Limit the projection to the first topN vocabulary entries so t-SNE stays
# fast (presumably the most frequent words, per gensim's default ordering of
# index_to_key — confirm).
topN = 2000
all_words = list(model.wv.index_to_key)[:topN]
# One embedding per selected word, stacked into a single array.
all_vecs = np.array([model.wv[w] for w in all_words])
# Project to 2 components; random_state is fixed so reruns use the same seed.
tsne = TSNE(n_components=2, random_state=0)
Y = tsne.fit_transform(all_vecs)
# Scatter the 2-D coordinates; small, semi-transparent markers reduce overplotting.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=Y[:, 0], y=Y[:, 1], s=6, color='steelblue', alpha=.35)
plt.title('t-SNE of Word2Vec embeddings (top-N)')
plt.show()

## Optional: UMAP (often faster)

In [None]:
# The t-SNE cell is optional, so do not assume it already built the top-N
# embedding matrix: create it here when it is missing. If the t-SNE cell was
# run, its all_vecs (and any custom topN) is reused unchanged.
if 'all_vecs' not in globals():
    topN = 2000
    all_words = list(model.wv.index_to_key)[:topN]
    all_vecs = np.array([model.wv[w] for w in all_words])

# Project to 2 components; random_state is fixed so reruns use the same seed.
um = UMAP(n_components=2, random_state=42)
Y = um.fit_transform(all_vecs)
plt.figure(figsize=(8, 6))
sns.scatterplot(x=Y[:, 0], y=Y[:, 1], s=6, color='darkorange', alpha=.35)
plt.title('UMAP of Word2Vec embeddings (top-N)')
plt.show()