# NLP Playground

Interactive notebook: tokenization, embeddings, 2D visualization, and similarity search.

Embeddings come from `sentence-transformers`; tokenization uses a Hugging Face `transformers` tokenizer (GPT-2 by default), which is loaded separately from the embedding model. The UI is built with `ipywidgets`.

In [None]:
# Install required packages (run once)
!pip install -q sentence-transformers transformers tokenizers scikit-learn matplotlib plotly ipywidgets

# If using JupyterLab you might need to install the ipywidgets labextension separately.
print('Install finished (if output above shows success). Restart kernel if required.')

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, Markdown
import math

# Reached only when every import above succeeded; a missing package raises
# ImportError before this line runs.
print('Imports OK')

In [None]:
# Load the default embedding model and tokenizer.
# Both names live in constants so either one can be swapped in a single place.
EMBED_MODEL_NAME = 'all-MiniLM-L6-v2'  # small, fast, good for demos
TOKENIZER_NAME = 'gpt2'  # loaded on its own, not taken from the embedding model
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
print('Loaded embedding model and tokenizer')

## Utility functions

In [None]:
def tokenize_text(text, model_tokenizer=tokenizer):
    """Tokenize ``text`` and return ``(tokens, ids)``.

    The tokenizer is called directly on the text and its ``'input_ids'``
    entry is used; when that entry is absent, ``model_tokenizer.encode(text)``
    supplies the ids instead. Token strings come from
    ``convert_ids_to_tokens``; if that call raises, each id is rendered with
    ``str`` so a token list is still returned.
    """
    encoding = model_tokenizer(text, return_tensors=None)
    token_ids = encoding.get('input_ids')
    if token_ids is None:
        # No 'input_ids' entry: encode the single string directly.
        token_ids = model_tokenizer.encode(text)

    try:
        token_strings = model_tokenizer.convert_ids_to_tokens(token_ids)
    except Exception:
        # Best-effort fallback: show the ids themselves as strings.
        token_strings = list(map(str, token_ids))
    return token_strings, token_ids

def embed_texts(texts):
    """Encode ``texts`` with the notebook-level ``embed_model``.

    Returns whatever ``embed_model.encode`` produces when it is called with
    ``convert_to_numpy=True``.
    """
    return embed_model.encode(texts, convert_to_numpy=True)

def reduce_dim(embs, method='pca', n_components=2, random_state=42):
    """Project embeddings down to ``n_components`` columns for plotting.

    Parameters
    ----------
    embs : 2-D array with one row per text.
    method : ``'pca'`` selects PCA; any other value selects t-SNE.
    n_components : number of output columns.
    random_state : seed handed to the reducer.

    Returns
    -------
    Array with one row per input row. With two or fewer rows nothing is
    fitted and the first ``n_components`` columns are returned as they are.
    """
    n_samples = embs.shape[0]
    if n_samples <= 2:
        # nothing to reduce
        return embs[:, :n_components]
    if method == 'pca':
        return PCA(n_components=n_components, random_state=random_state).fit_transform(embs)

    # scikit-learn rejects a perplexity that is not strictly below the number
    # of samples, and its default of 30 is larger than the small demo corpus.
    # Clamp it; inputs with more than 30 rows still get the default value.
    perplexity = min(30.0, float(n_samples - 1))
    tsne = TSNE(
        n_components=n_components,
        random_state=random_state,
        init='pca',
        perplexity=perplexity,
    )
    return tsne.fit_transform(embs)

def top_k_sim(query_emb, emb_matrix, k=5):
    """Return indices and scores of the ``k`` rows most similar to the query.

    Cosine similarity between ``query_emb`` and every row of ``emb_matrix``
    is computed, the rows are ranked from highest to lowest score, and the
    first ``k`` positions are returned together with their scores.
    """
    scores = cosine_similarity([query_emb], emb_matrix)[0]
    ranking = np.flip(np.argsort(scores))  # ascending sort, then reversed
    best = ranking[:k]
    return best, scores[best]

## Sample corpus
We'll include a small default corpus you can extend.

In [None]:
# Default documents used by the 2D visualization and the similarity search.
# Kept as a plain list so more strings can be added to it.
sample_corpus = [
    'I love machine learning and natural language processing.',
    'Transformers are great for NLP tasks like translation and summarization.',
    'The quick brown fox jumps over the lazy dog.',
    'PyTorch and TensorFlow are popular deep learning frameworks.',
    'Artificial intelligence will change how we work and learn.'
]

# Embed the corpus once; this matrix has to be recomputed whenever
# sample_corpus changes, otherwise it no longer matches the list.
corpus_embs = embed_texts(sample_corpus)
print('Sample corpus embedded, shape:', corpus_embs.shape)

## UI: Tokenize / Embed / Visualize / Similarity search

In [None]:
# Widgets
# Free-text input shared by the Tokenize and Embed & Visualize buttons.
text_in = widgets.Textarea(value='Enter text here', layout=widgets.Layout(width='80%', height='80px'))
btn_tokenize = widgets.Button(description='Tokenize')
btn_embed = widgets.Button(description='Embed & Visualize')
# The selected value is passed to reduce_dim as its `method` argument.
method_dropdown = widgets.Dropdown(options=['pca', 'tsne'], value='pca', description='Reduce:')
# Number of search hits to show; top_k_sim slices its ranking, so a value
# above the corpus size simply returns every document.
k_slider = widgets.IntSlider(value=3, min=1, max=10, description='Top k:')
query_in = widgets.Text(value='', placeholder='Type an arbitrary query to search the corpus', description='Query:')
btn_query = widgets.Button(description='Search Corpus')

# Shared output area: each click handler clears it and writes its result here.
out = widgets.Output()

def on_tokenize(b):
    """Click handler: show the tokens and token ids of the text-area content."""
    with out:
        out.clear_output()
        source_text = text_in.value
        token_strings, token_ids = tokenize_text(source_text)
        report_lines = (
            f"**Text:** {source_text}",
            f"**Tokens ({len(token_strings)}):** `{token_strings}`",
            f"**Token IDs:** `{token_ids}`",
        )
        for line in report_lines:
            display(Markdown(line))

def on_embed(b):
    """Click handler: embed the input with the corpus and scatter-plot them in 2D.

    The text-area content is placed first, followed by every corpus entry;
    all of them are embedded together, reduced with the method chosen in the
    dropdown, and drawn as one Plotly scatter inside the output area.
    """
    with out:
        out.clear_output()
        input_text = text_in.value
        all_texts = [input_text] + sample_corpus
        coords = reduce_dim(embed_texts(all_texts), method=method_dropdown.value)
        corpus_labels = [f'CORP_{i}' for i in range(len(sample_corpus))]
        plot_data = {
            'label': ['INPUT'] + corpus_labels,
            'text': all_texts,
            'x': coords[:, 0],
            'y': coords[:, 1],
        }
        fig = px.scatter(plot_data, x='x', y='y', hover_name='label', hover_data=['text'])
        fig.update_layout(title='2D visualization of input + corpus (reduced)')
        fig.show()

def on_query(b):
    """Click handler: rank the corpus against the query and list the top matches.

    Prints a prompt and returns early when the query box is empty or only
    whitespace. Otherwise the query is embedded, compared against
    ``corpus_embs``, and the best ``k_slider.value`` documents are rendered
    as Markdown lines of the form ``rank. (score=...) — text``.
    """
    with out:
        out.clear_output()
        q = query_in.value
        if not q.strip():
            print('Enter a query to search the corpus')
            return
        q_emb = embed_texts([q])[0]
        idxs, sims = top_k_sim(q_emb, corpus_embs, k=k_slider.value)
        for rank, (i, s) in enumerate(zip(idxs, sims), start=1):
            # An em dash separates the score from the document text.
            display(Markdown(f"**{rank}.** (score={s:.4f}) — {sample_corpus[i]}"))

# Connect each button to its click handler.
btn_tokenize.on_click(on_tokenize)
btn_embed.on_click(on_embed)
btn_query.on_click(on_query)

# Layout: text area, then the tokenize/embed row, then the search row, with
# the shared output area underneath.
controls = widgets.HBox([btn_tokenize, btn_embed, method_dropdown])
search_row = widgets.HBox([query_in, k_slider, btn_query])
display(text_in, controls, search_row, out)


## Extend the corpus
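You can append to `sample_corpus` and then re-run `corpus_embs = embed_texts(sample_corpus)` so that **Search Corpus** sees the new entries (it uses the cached `corpus_embs`, whereas **Embed & Visualize** re-embeds the corpus on every click). Try pasting paragraphs, questions, or short documents.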