In [1]:
# Toy training corpus: 15 short, lowercase, punctuation-free sentences.
# Later cells tokenise these sentences to build the vocabulary and the
# windowed co-occurrence counts, so every distinct word here becomes one
# row/column of the resulting matrix.
corpus = [
    "fido loves the red ball",
    "timmy and fido go to the park",
    "the quick brown fox jumps over the lazy dog",
    "homework can be frustrating",
    "bayesian training learns from co occurrence",
    "streamlit deploys interactive apps",
    "docker containers run consistent environments",
    "github actions automate ci cd pipelines",
    "hugging face spaces hosts ai demos",
    "embeddings capture semantic relationships",
    "transformers use self attention and feedforward blocks",
    "tokens are the smallest units of text",
    "positional encoding adds sequence information",
    "students use reposage for study assistance",
    "ai chatbots provide twenty four seven support",
]

In [3]:
# bayes_train.ipynb cell 2
import torch
from collections import defaultdict
from nltk.tokenize import word_tokenize

# Tokenise each sentence exactly once. Both the vocabulary (step 1) and the
# co-occurrence counts (step 3) are derived from this single list, so they can
# never disagree about how a sentence was split or normalised.
tokenized = [word_tokenize(sent.lower()) for sent in corpus]

# 1) Build vocab: sorted unique tokens -> stable, reproducible integer ids
tokens = {w for words in tokenized for w in words}
token2idx = {w: i for i, w in enumerate(sorted(tokens))}
V = len(token2idx)

# 2) Initialize co-occurrence counts
window = 2   # number of context positions considered on each side of the centre word
counts = torch.zeros((V, V), dtype=torch.float32)

# 3) Count: for every position i, add 1 for each other position j that lies
#    within `window` steps of i (the window is clipped at sentence boundaries).
for words in tokenized:
    ids = [token2idx[w] for w in words]   # look each word up once per sentence
    for i, center in enumerate(ids):
        lo = max(0, i - window)
        hi = min(len(ids), i + window + 1)
        for j in range(lo, hi):
            if i != j:   # skip the centre position itself
                counts[center, ids[j]] += 1

# 4) Laplace smoothing & conditional probabilities
alpha = 1.0   # added to every cell, so no entry of `probs` is exactly zero
counts_sm = counts + alpha
row_sums = counts_sm.sum(dim=1, keepdim=True)   # shape V×1, broadcasts over columns
probs = counts_sm / row_sums    # shape V×V; row r is the smoothed context distribution of token r


In [5]:
# bayes_train.ipynb cell 3
import json, torch.nn as nn

# Wrap the conditional-probability table in an embedding layer: row r of the
# weight matrix becomes the vector for token id r (V tokens, V dimensions each).
emb = nn.Embedding(num_embeddings=V, embedding_dim=V)
with torch.no_grad():
    # In-place overwrite of the layer's initial weights, not tracked by autograd.
    emb.weight.copy_(probs)

# Persist the embedding matrix together with the token -> row-index mapping
# needed to interpret it.
# NOTE(review): emb.weight is written as-is (an nn.Parameter) — confirm that is
# what the code loading "tensor_bayes.pt" expects.
torch.save(emb.weight, "tensor_bayes.pt")
with open("vocab_bayes.json", "w") as f:
    f.write(json.dumps(token2idx))
