<a href="https://colab.research.google.com/github/Sebastianrix/FullStack-DotNetAPI-ReactFrontend/blob/main/WordEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from pathlib import Path


In [8]:
DATA_DIR = Path("/")
HCA_PATH = DATA_DIR / "HCA.txt"
VOCAB_PATH = DATA_DIR / "HCA_vocabulary.txt"

HCA_PATH.exists(), VOCAB_PATH.exists()

(True, True)

In [9]:
# Load sorted vocabulary list (one word per line)
with open(VOCAB_PATH, "r", encoding="utf-8") as f:
    vocab_list = [line.strip() for line in f if line.strip()]

# Build lookup tables
word_to_id = {w: i for i, w in enumerate(vocab_list)}
id_to_word = {i: w for w, i in word_to_id.items()}

print("Vocab size:", len(vocab_list))
print("Contains ENDPERIOD:", "ENDPERIOD" in word_to_id)

Vocab size: 12273
Contains ENDPERIOD: True


In [10]:
# HCA.txt is space/newline separated words, already preprocessed
with open(HCA_PATH, "r", encoding="utf-8") as f:
    corpus_words = f.read().split()

print("Total tokens:", len(corpus_words))
print("Sample:", corpus_words[:25])

Total tokens: 396672
Sample: ['in', 'the', 'garden', 'all', 'the', 'apple', 'trees', 'were', 'in', 'blossom', 'ENDPERIOD', 'they', 'had', 'hastened', 'to', 'bring', 'forth', 'flowers', 'before', 'they', 'got', 'green', 'leaves', 'and', 'in']


In [11]:
# Optional: verify all words are in vocab (corpus should match the provided vocabulary)
missing = {w for w in set(corpus_words) if w not in word_to_id}
print("Out-of-vocab words:", len(missing))
# Expect 0; if not, you can decide on an UNK policy.

# Convert entire corpus to integer ids
import numpy as np
corpus_ids = np.fromiter((word_to_id[w] for w in corpus_words), dtype=np.int32)

print("corpus_ids shape:", corpus_ids.shape)
print("First 20 ids:", corpus_ids[:20])


Out-of-vocab words: 0
corpus_ids shape: (396672,)
First 20 ids: [ 5531 10857  4474   250 10857   419 11191 11908  5531  1079  3510 10881
  4913  5023 11045  1290  4287  4167   865 10881]


In [14]:
# --- paths ---
VOCAB_PATH = "/HCA_vocabulary.txt"
TEXT_PATH  = "/HCA.txt"

# --- load vocab (fixed order) ---
with open(VOCAB_PATH, "r", encoding="utf-8") as f:
    vocab = [w.strip() for w in f if w.strip()]
word2id = {w:i for i,w in enumerate(vocab)}
id2word = {i:w for w,i in word2id.items()}
V = len(vocab)  # should be 12273 in your set  :contentReference[oaicite:9]{index=9}

# --- load tokens ---
with open(TEXT_PATH, "r", encoding="utf-8") as f:
    # file is space/newline separated tokens incl. ENDPERIOD  :contentReference[oaicite:10]{index=10}
    tokens = f.read().split()

# --- map to ids ---
ids = [word2id[w] for w in tokens if w in word2id]  # all should be in vocab

# --- build sliding-window dataset ---
import numpy as np, tensorflow as tf

n = 10          # context length (tune)
EMB_D = 128     # embedding dim
BATCH = 256     # tune
BUFFER = 10000

X = []
Y = []
for i in range(len(ids) - n):
    X.append(ids[i:i+n])
    Y.append(ids[i+n])
X = np.array(X, dtype=np.int32)
Y = np.array(Y, dtype=np.int32)

ds = tf.data.Dataset.from_tensor_slices((X, Y)).shuffle(BUFFER).batch(BATCH).prefetch(tf.data.AUTOTUNE)


Model 1

In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L

def build_mlp_nextword(V, n, EMB_D, hidden=256):
    inputs = keras.Input(shape=(n,), dtype="int32")
    x = L.Embedding(V, EMB_D, input_length=n)(inputs)   # (batch, n, EMB_D)
    x = L.Flatten()(x)                                  # (batch, n*EMB_D)
    x = L.Dense(hidden, activation="relu")(x)
    x = L.Dropout(0.2)(x)
    outputs = L.Dense(V, activation="softmax")(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["sparse_categorical_accuracy"],
    )
    return model

mlp = build_mlp_nextword(V, n, EMB_D)
mlp.summary()
# mlp.fit(ds, epochs=5)




In [20]:
mlp.fit(ds, epochs=5)

Epoch 1/5
[1m1550/1550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 245ms/step - loss: 6.2917 - sparse_categorical_accuracy: 0.1105
Epoch 2/5
[1m1550/1550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 252ms/step - loss: 5.2540 - sparse_categorical_accuracy: 0.1616
Epoch 3/5
[1m1550/1550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 240ms/step - loss: 4.9138 - sparse_categorical_accuracy: 0.1816
Epoch 4/5
[1m1550/1550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 242ms/step - loss: 4.6244 - sparse_categorical_accuracy: 0.1970
Epoch 5/5
[1m1550/1550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 243ms/step - loss: 4.3454 - sparse_categorical_accuracy: 0.2148


<keras.src.callbacks.history.History at 0x7974d91bb140>

In [27]:
import numpy as np

# Choose a PAD id. If you have '<PAD>' in your vocab, use that. Otherwise 0 is common.
PAD_ID = word2id.get("<PAD>", 0)
UNK_ID = word2id.get("<UNK>", PAD_ID)  # fallback for unknown words

def ids_from_words(words):
    return [word2id.get(w, UNK_ID) for w in words]

def make_ctx_ids(seed_ids, n, pad_id=PAD_ID):
    """Left-pad/truncate to exactly length n."""
    if len(seed_ids) < n:
        seed_ids = [pad_id] * (n - len(seed_ids)) + seed_ids
    else:
        seed_ids = seed_ids[-n:]
    return np.array(seed_ids, dtype=np.int32)[None, :]  # shape (1, n)

def top_k_sample(probs, k=10, temperature=1.0):
    probs = np.asarray(probs, dtype=np.float64)
    # avoid k > vocab
    k = min(k, probs.shape[0])

    if temperature != 1.0:
        logits = np.log(probs + 1e-9) / temperature
        probs = np.exp(logits)
        probs = probs / probs.sum()

    idxs = np.argpartition(probs, -k)[-k:]
    sub = probs[idxs]
    sub = sub / sub.sum()
    return int(np.random.choice(idxs, p=sub))

def generate(model, seed_words, steps=50, k=10, T=1.0):
    ctx_ids = ids_from_words(seed_words)
    out_words = seed_words[:]  # start output with the seed

    for _ in range(steps):
        x = make_ctx_ids(ctx_ids, n=n, pad_id=PAD_ID)  # shape (1, n)
        p = model.predict(x, verbose=0)[0]            # shape (V,)

        wid = top_k_sample(p, k=k, temperature=T)
        out_words.append(id2word[wid])
        ctx_ids.append(wid)  # grow context; make_ctx_ids will keep last n

    return out_words

# Example:
print(" ".join(generate(mlp, ["in", "the", "garden"], steps=30, k=10, T=1.0)))


in the garden the little girl a little man but a great bird had come on my own room and was gone with the old house in the snow ENDPERIOD the wind shone


In [31]:
import numpy as np

# ---- Config pulled from your existing setup ----
# Assumes you already have: mlp (model), n (context length), V (vocab size), word2id, id2word
PAD_ID = word2id.get("<PAD>", 0)
UNK_ID = word2id.get("<UNK>", PAD_ID)

# ---- Token helpers ----
def ids_from_words(words, word2id, unk_id=0):
    return [word2id.get(w, unk_id) for w in words]

def make_ctx_ids(ctx_ids, n, pad_id=0):
    if len(ctx_ids) < n:
        ctx_ids = [pad_id] * (n - len(ctx_ids)) + ctx_ids
    else:
        ctx_ids = ctx_ids[-n:]
    return np.array(ctx_ids, dtype=np.int32)[None, :]  # (1, n)

# ---- Sampling ----
def top_k_sample(probs, k=10, temperature=1.0):
    probs = np.asarray(probs, dtype=np.float64)
    k = max(1, min(k, probs.shape[0]))  # clamp k
    if temperature != 1.0:
        logits = np.log(probs + 1e-9) / temperature
        probs = np.exp(logits); probs /= probs.sum()
    idxs = np.argpartition(probs, -k)[-k:]
    sub = probs[idxs]
    sub = sub / sub.sum()
    return int(np.random.choice(idxs, p=sub))

def greedy_sample(probs):
    return int(np.argmax(probs))

# ---- Special tokens & detokenizer ----
STOP_TOKENS = {"ENDPERIOD"}  # stop early if you want
PUNCT_MAP = {
    "ENDPERIOD": ".",
    "ENDCOMMA": ",",
    "ENDQUESTION": "?",
    "ENDCOLON": ":",
    "ENDSEMICOLON": ";",
    "ENDQUOTE": '"',
    "QUOTE": '"',
}

def detokenize(words):
    out = []
    for w in words:
        if w in PUNCT_MAP:
            if out:
                out[-1] = out[-1] + PUNCT_MAP[w]
            else:
                out.append(PUNCT_MAP[w])
        else:
            out.append(w)
    text = " ".join(out).strip()
    for p in [".", ",", "?", ":", ";"]:
        text = text.replace(" " + p, p)
    text = " ".join(text.split())
    if text:
        text = text[0].upper() + text[1:]
    if text and text[-1] not in ".!?":
        text += "."
    return text

# ---- Core generators (NO retrain needed) ----
def generate_tokens(
    model,
    seed_words,
    n,
    word2id,
    id2word,
    steps=50,
    strategy="topk",    # "topk" or "greedy"
    k=10,
    T=1.0,
    pad_id=0,
    stop_on_special=True
):
    ctx_ids = ids_from_words(seed_words, word2id, unk_id=UNK_ID)
    out_words = seed_words[:]
    for _ in range(steps):
        x = make_ctx_ids(ctx_ids, n=n, pad_id=pad_id)
        p = model.predict(x, verbose=0)[0]  # shape (V,)
        wid = top_k_sample(p, k=k, temperature=T) if strategy == "topk" else greedy_sample(p)
        w = id2word[wid]
        out_words.append(w)
        ctx_ids.append(wid)
        if stop_on_special and w in STOP_TOKENS:
            break
    return out_words

def generate_text(
    model,
    seed_words,
    steps=50,
    strategy="topk",
    k=10,
    T=1.0,
    stop_on_special=True
):
    words = generate_tokens(
        model, seed_words, n=n, word2id=word2id, id2word=id2word,
        steps=steps, strategy=strategy, k=k, T=T,
        pad_id=PAD_ID, stop_on_special=stop_on_special
    )
    return detokenize(words)

# ---- Batch testing helpers ----
def try_many(
    prompts,
    runs=3,
    steps=30,
    strategy="topk",
    k=10,
    T=1.0
):
    for seed in prompts:
        print(f"\nSeed: {seed}")
        for r in range(1, runs+1):
            txt = generate_text(mlp, seed, steps=steps, strategy=strategy, k=k, T=T)
            print(f"  {r:>2}: {txt}")

# ---- Quick sanity checks ----
print(generate_text(mlp, ["could", "it", "be"], steps=30, strategy="topk", k=10, T=1.0))
print(generate_text(mlp, ["in", "the", "garden"], steps=30, strategy="greedy"))

# Try many variants fast:
try_many(
    prompts=[
        ["could", "it", "be"],
        ["once", "upon", "a", "time"],
        ["the", "king"],
    ],
    runs=3,
    steps=25,
    strategy="topk",
    k=10,
    T=0.9
)


Could it be a little and in this time they had been in the wood and the old man had been born and the old man who had been to have the man.
In the garden the the old man and the old woman was the most beautiful and the wind had been a great deal of the old man and the old woman was a.

Seed: ['could', 'it', 'be']
   1: Could it be a good little girl the old woman and then there are so beautiful that i have taken away.
   2: Could it be the most beautiful.
   3: Could it be the little in the air where his wife.

Seed: ['once', 'upon', 'a', 'time']
   1: Once upon a time the old man had been seen by a year and the little man had fallen into the world but she could not bear him.
   2: Once upon a time the light and the old woman had come and the sun had taken up and he looked at the duckling so.
   3: Once upon a time and the old man was the most beautiful.

Seed: ['the', 'king']
   1: The king the only a great deal of all and the wind might go about it.
   2: The king the mother the the 

In [34]:
print(generate_text(mlp, ["the", "od", "man"], steps=30))


The od man the old old man and the little boy that it was the most lovely old bird.


In [23]:
import numpy as np

def top_k_sample(probs, k=10, temperature=1.0):
    probs = np.asarray(probs, dtype=np.float64)
    if temperature != 1.0:
        # temperature scaling
        logits = np.log(probs + 1e-9) / temperature
        probs = np.exp(logits)
        probs = probs / probs.sum()
    idxs = np.argpartition(probs, -k)[-k:]
    sub = probs[idxs]
    sub = sub / sub.sum()
    return int(np.random.choice(idxs, p=sub))

def generate(model, seed_words, steps=50, k=10, T=1.0):
    ctx = [word2id[w] for w in seed_words]
    out = seed_words[:]
    for _ in range(steps):
        x = np.array(ctx[-n:], dtype=np.int32)[None, :]
        p = model.predict(x, verbose=0)[0]
        wid = top_k_sample(p, k=k, temperature=T)
        out.append(id2word[wid])
        ctx.append(wid)
    return out

#Example:
print(" ".join(generate(mlp, ["in", "the", "garden"], steps=30)))


ValueError: Input 0 of layer "functional" is incompatible with the layer: expected shape=(None, 10), found shape=(1, 3)

In [17]:
def build_lstm_nextword(V, n, EMB_D, hidden=256):
    inputs = keras.Input(shape=(n,), dtype="int32")
    x = L.Embedding(V, EMB_D, input_length=n)(inputs)
    x = L.LSTM(hidden)(x)
    x = L.Dropout(0.2)(x)
    outputs = L.Dense(V, activation="softmax")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["sparse_categorical_accuracy"])
    return model

lstm = build_lstm_nextword(V, n, EMB_D)
lstm.summary()
# lstm.fit(ds, epochs=5)


In [18]:
import tensorflow as tf
from tensorflow.keras import layers as L, Model

class EmbeddingRegressor(Model):
    def __init__(self, V, D, n, hidden=256):
        super().__init__()
        self.emb = L.Embedding(V, D, input_length=n, name="word_emb")
        self.encoder = L.LSTM(hidden)
        self.head = L.Dense(D, name="pred_vec")  # predict embedding
        # We’ll compute loss manually vs. true embedding vectors

    def call(self, x):
        x = self.emb(x)           # (batch, n, D)
        x = self.encoder(x)       # (batch, H)
        y = self.head(x)          # (batch, D)
        return y

reg = EmbeddingRegressor(V, EMB_D, n)

# Build a tf.data pipeline that also yields the *embedding* target
def ds_for_embedding(ids, n, batch):
    X, Y = [], []
    for i in range(len(ids)-n):
        X.append(ids[i:i+n])
        Y.append(ids[i+n])
    X = tf.constant(X, dtype=tf.int32)
    Y = tf.constant(Y, dtype=tf.int32)
    ds = tf.data.Dataset.from_tensor_slices((X, Y)).shuffle(BUFFER).batch(batch)

    # map Y -> embedding vector
    def to_vec(batch_x, batch_y):
        # gather rows from the embedding table
        emb_table = reg.emb.weights[0]  # (V, D) after first build/trace
        y_vec = tf.gather(emb_table, batch_y)
        return batch_x, y_vec
    # force build to create weights
    _ = reg(tf.zeros((1, n), dtype=tf.int32))
    return ds.map(to_vec).prefetch(tf.data.AUTOTUNE)

ds_vec = ds_for_embedding(ids, n, BATCH)

reg.compile(optimizer="adam", loss="mse")
# reg.fit(ds_vec, epochs=5)


In [19]:
import numpy as np

# Precompute embedding table once
E = reg.emb.get_weights()[0]    # (V, D)

def nearest_ids(vec, topk=10):
    # cosine similarity (fast and scale-invariant)
    v = vec / (np.linalg.norm(vec, axis=-1, keepdims=True) + 1e-9)
    E_norm = E / (np.linalg.norm(E, axis=-1, keepdims=True) + 1e-9)
    sims = E_norm @ v.T   # (V, batch) or (V,) if v is (D,)
    idxs = np.argpartition(-sims[:,0], topk)[:topk]
    # sort topk
    idxs = idxs[np.argsort(-sims[idxs, 0])]
    return idxs

def generate_by_vec(model, seed_words, steps=50, topk=10):
    ctx = [word2id[w] for w in seed_words]
    out = seed_words[:]
    for _ in range(steps):
        x = np.array(ctx[-n:], dtype=np.int32)[None, :]
        y_vec = model.predict(x, verbose=0)          # (1, D)
        cands = nearest_ids(y_vec.T, topk=topk)      # (topk,)
        wid = int(np.random.choice(cands))
        out.append(id2word[wid])
        ctx.append(wid)
    return out
