## Diffusion

In [1]:
# -------------------------------
# The following code first adds noise to the original embeddings, then uses a model to predict that noise and
# subtracts the prediction to estimate the original embeddings (the model is untrained; no training loop is included)
# -------------------------------

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

# -------------------------------
# Parameters
# -------------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
T = 100  # number of diffusion steps
embedding_dim = 768  # size of BERT hidden state

# Seed PyTorch's random number generator so that the Gaussian noise drawn with
# torch.randn_like and the randomly initialised nn.Linear weights are the same
# on every "Restart & Run All"; without this the printed loss changes per run.
SEED = 42
torch.manual_seed(SEED)

# -------------------------------
# Step 1: Prepare BERT Embeddings
# -------------------------------
# Tokenizer and encoder are loaded from the same checkpoint name.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
bert = BertModel.from_pretrained(MODEL_NAME)
bert = bert.to(device)

# Single example sentence; the tokenizer returns PyTorch tensors ('pt'),
# which are then moved to the same device as the encoder.
text = "The velvet sofa is stylish and modern."
inputs = tokenizer(text, return_tensors='pt')
inputs = inputs.to(device)

# The encoder is only used to produce embeddings here, so the forward pass
# runs without gradient tracking.
with torch.no_grad():
    bert_outputs = bert(**inputs)
embeddings = bert_outputs.last_hidden_state  # [batch, seq_len, hidden_dim]

# -------------------------------
# Step 2: Define noise schedule
# -------------------------------
# beta_t: T evenly spaced values from 1e-4 to 0.02, one per diffusion step.
betas = torch.linspace(start=1e-4, end=0.02, steps=T).to(device)
# alpha_t = 1 - beta_t
alphas = 1.0 - betas
# alpha_bar_t: running product of alphas[0..t], indexed by timestep t.
alpha_bars = alphas.cumprod(dim=0)

# -------------------------------
# Step 3: Forward Diffusion
# Adds noise to embeddings
# -------------------------------
def q_sample(x_0, t, noise=None, schedule=None):
    """Forward diffusion step: add noise to x_0 at timestep t.

    Computes ``sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise``.

    Args:
        x_0: Tensor to be noised.
        t: Index into the schedule. Either a single index (Python int or
            0-dim tensor), applied to all of ``x_0``, or an integer tensor
            whose shape matches the leading dimensions of ``x_0`` (e.g. one
            timestep per batch element).
        noise: Optional noise tensor to mix in. When omitted, standard
            Gaussian noise with the shape/dtype/device of ``x_0`` is drawn.
        schedule: Optional 1-D tensor of cumulative alpha products indexed by
            timestep. When omitted, the module-level ``alpha_bars`` is used.

    Returns:
        A tuple ``(x_t, noise)``: the noised tensor and the noise that was
        mixed in.
    """
    if schedule is None:
        schedule = alpha_bars
    if noise is None:
        noise = torch.randn_like(x_0)

    alpha_bar_t = schedule[t]
    # A 0-dim coefficient already broadcasts against x_0 and is left untouched.
    # A per-sample coefficient (e.g. shape [batch]) is right-padded with
    # singleton axes so it lines up with x_0's leading dimensions instead of
    # being matched against its trailing ones.
    if alpha_bar_t.dim() > 0:
        trailing = (1,) * (x_0.dim() - alpha_bar_t.dim())
        alpha_bar_t = alpha_bar_t.reshape(*alpha_bar_t.shape, *trailing)

    sqrt_alpha_bar = alpha_bar_t ** 0.5
    sqrt_one_minus_alpha_bar = (1 - alpha_bar_t) ** 0.5
    return sqrt_alpha_bar * x_0 + sqrt_one_minus_alpha_bar * noise, noise

# Noise the clean embeddings at one example timestep (t = 50); q_sample
# returns both the noised embeddings and the noise it mixed in.
t = 50
x_noisy, noise = q_sample(x_0=embeddings, t=t)

# -------------------------------
# Step 4: Define Denoising Model
# -------------------------------
class SimpleDenoiser(nn.Module):
    """Small MLP: Linear -> ReLU -> Linear, keeping the feature size at `dim`.

    The network acts on the last dimension of its input. Its forward pass
    receives only the input tensor; no timestep is passed in.
    """

    def __init__(self, dim):
        """Build the two linear layers, each mapping `dim` features to `dim`."""
        super().__init__()
        layers = [
            nn.Linear(in_features=dim, out_features=dim),
            nn.ReLU(),
            nn.Linear(in_features=dim, out_features=dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the MLP and return a tensor of the same shape."""
        out = self.net(x)
        return out

denoiser = SimpleDenoiser(dim=embedding_dim).to(device)

# -------------------------------
# Step 5: Predict the Noise
# -------------------------------
# The denoiser has not been trained at this point. In practice, you'd train
# this model to predict `noise` given the noisy input and timestep.
# Batch and sequence axes are merged so each row is one embedding vector.
x_noisy_flat = x_noisy.view(-1, embedding_dim)
predicted_noise = denoiser(x_noisy_flat)

# -------------------------------
# Step 6: Noise-Prediction Loss
# -------------------------------
# A training loop would minimize:
# loss = MSE(predicted_noise, true_noise)
# The true noise is flattened the same way as the model input so the two
# tensors line up row by row.
noise_flat = noise.view(-1, embedding_dim)
loss = F.mse_loss(predicted_noise, noise_flat)
print(f"Loss: {loss.item():.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loss: 1.0377


In [2]:
# Give the predicted noise the same shape as the noisy embeddings again
predicted_noise = predicted_noise.view_as(x_noisy)

# Invert the forward formula x_t = sqrt(a_bar) * x_0 + sqrt(1 - a_bar) * noise
# for x_0, substituting the model's predicted noise for the true noise.
alpha_bar_t = alpha_bars[t]
sqrt_alpha_bar = alpha_bar_t ** 0.5
sqrt_one_minus_alpha_bar = (1 - alpha_bar_t) ** 0.5
noise_component = sqrt_one_minus_alpha_bar * predicted_noise
x_denoised = (x_noisy - noise_component) / sqrt_alpha_bar
print(f'The denoised embedding is {x_denoised}')

The denoised embedding is tensor([[[ 6.3832e-01, -6.4926e-01, -1.6240e-02,  ...,  8.6704e-04,
           4.8120e-01,  7.4153e-01],
         [-7.4995e-01, -8.7267e-02, -1.2583e-02,  ...,  2.0927e-01,
           1.6111e+00, -4.0826e-01],
         [ 8.7092e-01, -4.9937e-01,  1.3803e+00,  ..., -1.2767e+00,
           2.1291e-01, -1.9375e-01],
         ...,
         [-7.4830e-01,  5.3956e-01, -3.8354e-02,  ..., -1.0858e+00,
          -8.1842e-01,  3.9139e-01],
         [ 1.0630e+00,  6.7822e-01, -1.1979e+00,  ...,  7.9792e-01,
          -5.9637e-01, -5.8612e-01],
         [ 2.8078e-01, -1.0632e+00, -1.1362e-01,  ...,  4.8461e-01,
          -1.5157e+00, -1.6262e+00]]], grad_fn=<DivBackward0>)
