In [1]:
!pip install -U git+https://github.com/Sakib323/AI-Game-Engine.git
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers
!pip install triton==3.2.0
!pip install datasets
!pip install wandb
!pip install -U datasets
!pip install triton==2.3.1

Collecting git+https://github.com/Sakib323/AI-Game-Engine.git
  Cloning https://github.com/Sakib323/AI-Game-Engine.git to /tmp/pip-req-build-f_lfe97n
  Running command git clone --filter=blob:none --quiet https://github.com/Sakib323/AI-Game-Engine.git /tmp/pip-req-build-f_lfe97n
  Resolved https://github.com/Sakib323/AI-Game-Engine.git to commit c594553f4bd5300122bc54168c61bfb1a6527ea3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ninja (from ai_game_engine==0.1)
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9.0->ai_game_engine==0.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9.0->ai_game_engine==0.1)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.1

In [None]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
from diffusers import AutoencoderKL
from diffusion_model import GaussianDiffusion, ModelMeanType, ModelVarType, LossType, get_named_beta_schedule
from timm.models.vision_transformer import PatchEmbed
from torchvision import transforms
import torch.nn as nn
# --- Config ---
# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Geometry: images are resized to image_size, and latent_size is derived
# from it by integer division (512 // 8 = 64).
image_size = 512
latent_downscale = 8
latent_size = image_size // latent_downscale
embed_dim = 1024
patch_size = 4
batch_size = 8

# VAE taken from the "vae" subfolder of the checkpoint, moved to `device`
# and switched to eval mode.
vae = AutoencoderKL.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    subfolder="vae",
)
vae = vae.to(device).eval()

# Forward-diffusion helper built on a 1000-entry "linear" beta schedule.
betas = get_named_beta_schedule("linear", 1000)
diffusion = GaussianDiffusion(
    betas=betas,
    model_mean_type=ModelMeanType.EPSILON,
    model_var_type=ModelVarType.FIXED_SMALL,
    loss_type=LossType.MSE,
)

# Patch embedding over 4-channel inputs of side latent_size. It is created
# here with its default initialisation and only ever used in eval mode.
patch_embed = PatchEmbed(
    img_size=latent_size,
    patch_size=patch_size,
    in_chans=4,
    embed_dim=embed_dim,
)
patch_embed = patch_embed.to(device).eval()

# Tokenizer; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
tokenizer.pad_token = tokenizer.eos_token

# Image pipeline: resize -> tensor -> per-channel (x - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
])

def preprocess(example):
    """Convert one dataset record into a training sample.

    Relies on the module-level ``transform``, ``vae``, ``diffusion``,
    ``patch_embed``, ``tokenizer``, ``device`` and ``latent_size``.

    Args:
        example: Record with an ``'image'`` entry (an object exposing
            ``convert("RGB")``) and a ``'text'`` entry passed to the tokenizer.

    Returns:
        dict with CPU tensors ``patch_embeddings`` and ``noise``, the sampled
        ``timestep`` as a Python int, and the tokenizer's ``input_ids`` and
        ``attention_mask`` with the leading batch axis squeezed away.
    """
    rgb_image = example['image'].convert("RGB")
    pixels = transform(rgb_image).unsqueeze(0).to(device)

    # Encode to latents, scale by the VAE's configured factor, then resize to
    # the (latent_size, latent_size) grid.
    with torch.no_grad():
        posterior = vae.encode(pixels).latent_dist
        latents = posterior.sample() * vae.config.scaling_factor
        latents = nn.functional.interpolate(
            latents,
            size=(latent_size, latent_size),
            mode='bilinear',
        )

    # A single timestep and noise draw per record; both are stored with the
    # sample, so they stay fixed once preprocessing has run.
    t = torch.randint(0, 1000, (1,), device=device).long()
    noise = torch.randn_like(latents)
    x_noisy = diffusion.q_sample(x_start=latents, t=t, noise=noise)

    with torch.no_grad():
        patches = patch_embed(x_noisy).squeeze(0)

    # Every caption is padded/truncated to exactly 2048 tokens.
    tokens = tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        max_length=2048,
        return_tensors='pt',
    )

    sample = {}
    sample['patch_embeddings'] = patches.detach().cpu()
    sample['noise'] = noise.squeeze(0).detach().cpu()
    sample['timestep'] = t.item()
    sample['input_ids'] = tokens['input_ids'].squeeze(0)
    sample['attention_mask'] = tokens['attention_mask'].squeeze(0)
    return sample

# Load the train split and eagerly run `preprocess` on every record; all
# resulting samples are held in a Python list.
dataset = load_dataset("iamkaikai/GAME-MAP-ART", split="train")
processed_data = list(map(preprocess, dataset))

class DiTDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves items of an indexable container as-is."""

    def __init__(self, data):
        """Keep a reference to ``data``; nothing is copied or transformed."""
        self.data = data

    def __len__(self):
        """Number of items in the wrapped container."""
        size = len(self.data)
        return size

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        item = self.data[idx]
        return item

def collate_fn(batch):
    """Merge a list of per-sample dicts into one dict of batched tensors.

    Tensor-valued fields are stacked along a new leading axis; the integer
    ``timestep`` values are gathered into a single ``torch.long`` tensor.

    Args:
        batch: List of dicts holding ``patch_embeddings``, ``noise``,
            ``timestep``, ``input_ids`` and ``attention_mask``.

    Returns:
        dict with the same five keys, each mapped to a batched tensor.
    """
    def stacked(field):
        return torch.stack([sample[field] for sample in batch])

    collated = {}
    collated['patch_embeddings'] = stacked('patch_embeddings')
    collated['noise'] = stacked('noise')
    collated['timestep'] = torch.tensor(
        [sample['timestep'] for sample in batch],
        dtype=torch.long,
    )
    collated['input_ids'] = stacked('input_ids')
    collated['attention_mask'] = stacked('attention_mask')
    return collated

# Shuffled mini-batches over the preprocessed samples, assembled by
# `collate_fn`, loaded by 4 worker processes into pinned memory.
dataloader = DataLoader(
    DiTDataset(processed_data),
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_fn,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.51M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/318 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/67.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1101 [00:00<?, ? examples/s]

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from mmfreelm.models import (HGRNBitModel, HGRNBitConfig, TerneryDit)
from mmfreelm.modules import RMSNorm
from mmfreelm.ops.fusedbitnet import FusedBitLinear as BitLinear


# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer (EOS doubles as the padding token) and restate the
# geometry constants used when the patch embeddings were produced.
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
tokenizer.pad_token = tokenizer.eos_token
embed_dim = 1024  # Must match the output dimension of your patch embedding
latent_size = 64   # For 512px image with 8x downscale (512/8=64)
patch_size = 4

# Text model config: 12 layers, hidden size 768, vocabulary sized from the tokenizer.
# NOTE(review): how HGRNBitConfig/TerneryDit interpret `condition_dim` is not
# visible in this notebook — confirm against the mmfreelm sources.
text_config = HGRNBitConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=768,
    num_hidden_layers=12,
    attn_mode="fused_recurrent",
    rotary_embeddings=True,
    condition_dim=256,
)

# Diffusion model config: 24 layers, hidden size embed_dim, one position per
# latent patch. condition_dim is intentionally left out of the constructor
# and assigned right after.
diffusion_config = HGRNBitConfig(
    hidden_size=embed_dim,
    num_hidden_layers=24,
    attn_mode="fused_recurrent",
    rotary_embeddings=True,
    max_position_embeddings=(latent_size // patch_size) ** 2,  # (64//4)**2 = 256
)

# condition_dim is set to the sum of the two hidden sizes (768 + 1024 = 1792).
diffusion_config.condition_dim = text_config.hidden_size + diffusion_config.hidden_size

# Build the model from both configs and move it to `device`.
model = TerneryDit(
    text_config=text_config,
    diffusion_config=diffusion_config,
    num_timesteps=1000,
    patch_dim=4 * patch_size * patch_size  # 4 channels * 4*4 patch
).to(device)

# Mean-squared-error loss and an AdamW optimizer (lr=1e-4) over all model parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Training hyperparameters
num_epochs = 10  # SET YOUR NUMBER OF EPOCHS HERE


def unpatchify(tokens, out_channels):
    """Fold per-patch predictions back into an image-shaped tensor.

    Args:
        tokens: Tensor of shape (B, N, D) where N is a perfect square (the
            patch grid is assumed square) and D == p * p * out_channels for
            some integer patch side p.
        out_channels: Channel count of the reconstructed tensor.

    Returns:
        Tensor of shape (B, out_channels, sqrt(N) * p, sqrt(N) * p). With
        p == 1 this equals ``tokens.view(B, H, W, C).permute(0, 3, 1, 2)``.

    Raises:
        ValueError: If N is not a perfect square or D cannot be factored as
            p * p * out_channels.
    """
    batch, num_tokens, token_dim = tokens.shape
    grid = int(round(num_tokens ** 0.5))
    if grid * grid != num_tokens:
        raise ValueError(f"token count {num_tokens} is not a perfect square")
    if out_channels <= 0 or token_dim % out_channels != 0:
        raise ValueError(
            f"token dim {token_dim} is not divisible by out_channels={out_channels}"
        )
    patch = int(round((token_dim // out_channels) ** 0.5))
    if patch * patch * out_channels != token_dim:
        raise ValueError(
            f"token dim {token_dim} is not patch*patch*{out_channels} for an integer patch"
        )
    # (B, h, w, p, q, c) -> (B, c, h, p, w, q) -> (B, c, h*p, w*q)
    folded = tokens.reshape(batch, grid, grid, patch, patch, out_channels)
    folded = torch.einsum("nhwpqc->nchpwq", folded)
    return folded.reshape(batch, out_channels, grid * patch, grid * patch)


# Training loop
model.train()  # sampling code switches the model to eval(); make sure we train in train mode
for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        # Move data to device
        patch_emb = batch['patch_embeddings'].to(device)
        timesteps = batch['timestep'].to(device)
        input_ids = batch['input_ids'].to(device)
        attn_mask = batch['attention_mask'].to(device)
        target_noise = batch['noise'].to(device)

        # Forward pass
        pred_noise = model(
            patch_embeddings=patch_emb,
            timesteps=timesteps,
            input_ids=input_ids,
            attention_mask=attn_mask
        )

        # The target is the latent-shaped noise stored by preprocessing, while
        # the model output is unpacked below as (B, N, C). Folding only the
        # token axis into a sqrt(N) x sqrt(N) grid cannot match the target
        # whenever each token carries a whole patch, so fold the patch pixels too.
        # NOTE(review): assumes each token is laid out as (p, p, channels) — the
        # model is trained against this layout, so it only has to be used
        # consistently at sampling time.
        pred_noise = unpatchify(pred_noise, out_channels=target_noise.shape[1])

        # Compute loss
        loss = criterion(pred_noise, target_noise)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than only the last batch
    # (also avoids a NameError on `loss` when the dataloader is empty).
    epoch_loss = running_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

In [None]:
import numpy as np
from torchvision.utils import make_grid

# Load a pre-trained VAE to decode latents to images.
# `AutoModel` was never imported in this notebook, and this checkpoint is a
# diffusers autoencoder, so it is loaded with diffusers' AutoencoderKL.
# NOTE(review): the preprocessing cells encode with the VAE from
# "stabilityai/stable-diffusion-2-1"; confirm that decoding with a different
# checkpoint is intended.
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema").to(device).eval()

def _ddpm_reverse_step(x_t, pred_noise, alpha_t, alpha_bar_t, beta_t, add_noise):
    """Take one ancestral DDPM step x_t -> x_{t-1} for a noise-predicting model.

    The mean is (x_t - beta_t / sqrt(1 - alpha_bar_t) * eps) / sqrt(alpha_t);
    when ``add_noise`` is true, Gaussian noise with std sqrt(beta_t) is added.
    """
    mean = (x_t - beta_t / torch.sqrt(1 - alpha_bar_t) * pred_noise) / torch.sqrt(alpha_t)
    if not add_noise:
        return mean
    return mean + torch.sqrt(beta_t) * torch.randn_like(x_t)


def sample_images(prompts, model, tokenizer, text_encoder, vae, num_steps=50):
    """Generate one image per prompt by reverse diffusion and return them as a grid.

    Args:
        prompts: List of text prompts; its length is the batch size.
        model: Noise predictor, called as
            ``model(latents, t, encoder_hidden_states=text_embeds)``.
        tokenizer: Callable turning ``prompts`` into padded "pt" tensors.
        text_encoder: Called with the tokenizer output; its
            ``last_hidden_state`` is used as conditioning.
        vae: Autoencoder whose ``decode(latents).sample`` yields images in [-1, 1].
        num_steps: Number of reverse-diffusion steps.

    Returns:
        A ``make_grid`` tensor with all images in a single row, values in [0, 1].

    NOTE(review): the call convention used for ``model`` here differs from the
    keyword arguments the training cell passes (patch_embeddings, timesteps,
    input_ids, attention_mask) — confirm which model this is meant for.
    NOTE(review): the schedule below is a ``num_steps``-long linear ramp and the
    raw step index is passed as the timestep; the training cells use a
    1000-step schedule, so a timestep mapping is probably needed.
    """
    model.eval()
    batch_size = len(prompts)

    # Nothing in sampling needs gradients, including the text encoder.
    with torch.no_grad():
        # Encode text prompts
        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)
        text_embeds = text_encoder(**inputs).last_hidden_state

        # Latent shape: 4 channels on a 64x64 grid (the preprocessing cells get
        # this size from 512px images with an 8x downscale).
        latent_dim = (4, 64, 64)
        # Sample initial Gaussian noise (shape: batch x channels x H x W)
        latents = torch.randn((batch_size, *latent_dim), device=device)

        # Simple linear DDPM schedule (for illustration; see NOTE above)
        betas = torch.linspace(0.0001, 0.02, num_steps, device=device)
        alphas = 1 - betas
        alphas_cum = torch.cumprod(alphas, dim=0)

        for i in reversed(range(num_steps)):
            # One timestep entry per sample, as in the training batches.
            t = torch.full((batch_size,), i, dtype=torch.long, device=device)
            pred_noise = model(latents, t, encoder_hidden_states=text_embeds)
            # Posterior-mean update; the previous code computed the x_0
            # estimate here, which is not the x_{t-1} sample.
            latents = _ddpm_reverse_step(
                latents,
                pred_noise,
                alpha_t=alphas[i],
                alpha_bar_t=alphas_cum[i],
                beta_t=betas[i],
                add_noise=i > 0,  # no noise on the final step
            )

        # Preprocessing multiplies encoded latents by vae.config.scaling_factor,
        # so undo that before decoding when the VAE exposes the factor.
        scaling_factor = getattr(getattr(vae, "config", None), "scaling_factor", None)
        if scaling_factor:
            latents = latents / scaling_factor

        # Decode latents to images via VAE
        images = vae.decode(latents).sample

    # Map [-1, 1] to [0, 1] and move to CPU for visualization
    images = (images.clamp(-1, 1) + 1) / 2
    images = images.cpu()
    # Lay all images out in one row
    grid = make_grid(images, nrow=len(prompts))
    return grid

# Example usage: build an image grid for two prompts with the trained model.
# NOTE(review): `text_encoder` is not defined in any cell visible in this
# notebook, so this call raises NameError on a fresh kernel — define the text
# encoder before running this cell.
prompts = ["A sunset over mountains", "A futuristic city skyline"]
grid = sample_images(prompts, model, tokenizer, text_encoder, vae)
# Save or display 'grid' tensor as an image


In [None]:
import torch
import torch.nn as nn
import numpy as np
from torchvision import transforms
from PIL import Image
from datasets import load_dataset
from transformers import AutoTokenizer
from diffusers import AutoencoderKL
from diffusion_model import GaussianDiffusion, ModelMeanType, ModelVarType, LossType, get_named_beta_schedule
from timm.models.vision_transformer import PatchEmbed

# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Geometry: latent_size is image_size // latent_downscale (512 // 8 = 64).
image_size = 512
latent_downscale = 8
latent_size = image_size // latent_downscale
embed_dim = 1024
patch_size = 4

# VAE from the "vae" subfolder of the checkpoint, on `device`, in eval mode.
vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="vae")
vae = vae.to(device).eval()

# Forward-diffusion helper on a "linear" schedule with num_timesteps entries.
num_timesteps = 1000
betas = get_named_beta_schedule("linear", num_timesteps)
diffusion = GaussianDiffusion(betas=betas, model_mean_type=ModelMeanType.EPSILON, model_var_type=ModelVarType.FIXED_SMALL, loss_type=LossType.MSE)

# Patch embedding over 4-channel inputs of side latent_size, in eval mode.
patch_embed = PatchEmbed(img_size=latent_size, patch_size=patch_size, in_chans=4, embed_dim=embed_dim)
patch_embed = patch_embed.to(device).eval()

# Tokenizer; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
tokenizer.pad_token = tokenizer.eos_token

# Train split of the image/caption dataset.
dataset = load_dataset("iamkaikai/GAME-MAP-ART")['train']

# Image pipeline: resize -> tensor -> per-channel (x - 0.5) / 0.5.
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
])

def preprocess(example):
    """Convert one dataset record into NumPy features for ``dataset.map``.

    Relies on the module-level ``transform``, ``vae``, ``diffusion``,
    ``patch_embed``, ``tokenizer``, ``device``, ``latent_size`` and
    ``num_timesteps``.

    Args:
        example: Record with an ``'image'`` entry (an object exposing
            ``convert("RGB")``) and a ``'text'`` entry passed to the tokenizer.

    Returns:
        dict with float32 arrays ``patch_embeddings`` and ``noise``, the
        sampled ``timestep`` as an int, and the first row of the tokenizer's
        ``input_ids`` and ``attention_mask``.
    """
    rgb_image = example['image'].convert("RGB")
    image_tensor = transform(rgb_image).unsqueeze(0).to(device)

    # Encode to latents, scale by the VAE's configured factor, then resize to
    # the (latent_size, latent_size) grid.
    with torch.no_grad():
        posterior = vae.encode(image_tensor).latent_dist
        latents = posterior.sample() * vae.config.scaling_factor
        latents = nn.functional.interpolate(
            latents,
            size=(latent_size, latent_size),
            mode='bilinear',
        )

    # One timestep and one noise draw per record.
    t = torch.randint(0, num_timesteps, (1,), device=device).long()
    noise = torch.randn_like(latents)
    x_noisy = diffusion.q_sample(x_start=latents, t=t, noise=noise)

    with torch.no_grad():
        embedded = patch_embed(x_noisy)
    patches = embedded.squeeze(0).cpu().numpy()

    # Every caption is padded/truncated to exactly 2048 tokens.
    tokens = tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        max_length=2048,
        return_tensors='np',
    )

    features = {}
    features['patch_embeddings'] = patches.astype(np.float32)
    features['timestep'] = int(t.item())
    features['noise'] = noise.squeeze(0).cpu().numpy().astype(np.float32)
    features['input_ids'] = tokens['input_ids'][0]
    features['attention_mask'] = tokens['attention_mask'][0]
    return features

# Replace the original columns with the features produced by `preprocess`,
# then have the dataset return NumPy arrays.
dataset = dataset.map(
    function=preprocess,
    remove_columns=dataset.column_names,
)
dataset.set_format('numpy')

In [None]:
import torch as th
import numpy as np
from diffusion_model import GaussianDiffusion, ModelMeanType, ModelVarType, LossType, get_named_beta_schedule
from diffusers import AutoencoderKL
import matplotlib.pyplot as plt

# Device is kept as a plain string here ("cuda" or "cpu").
device = "cuda" if th.cuda.is_available() else "cpu"

# VAE from the "vae" subfolder of the checkpoint, on `device`, in eval mode.
vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="vae")
vae = vae.to(device)
vae.eval()

# Forward-diffusion helper on a "linear" schedule with num_timesteps entries.
num_timesteps = 1000
betas = get_named_beta_schedule("linear", num_timesteps)
diffusion = GaussianDiffusion(betas=betas, model_mean_type=ModelMeanType.EPSILON, model_var_type=ModelVarType.FIXED_SMALL, loss_type=LossType.MSE)

# Synthetic stand-in images: Gaussian noise clamped to [-1, 1].
batch_size, channels, height, width = 8, 3, 256, 256
x_start = th.randn(batch_size, channels, height, width, device=device).clamp(-1, 1)

# Encode to latents and scale by the VAE's configured factor.
with th.no_grad():
    latent_dist = vae.encode(x_start).latent_dist
    latent = latent_dist.sample() * vae.config.scaling_factor

# Noise every latent in the batch at timestep 500.
t = th.full((batch_size,), 500, device=device, dtype=th.long)
noise = th.randn_like(latent)
x_noisy = diffusion.q_sample(x_start=latent, t=t, noise=noise)


In [None]:
import torch
import numpy as np
from diffusion_model import GaussianDiffusion, ModelMeanType, ModelVarType, LossType, get_named_beta_schedule
import patch_embedding
import torchvision.transforms as transforms
import matplotlib.pyplot as plt


# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Forward-diffusion helper on a "linear" schedule with num_timesteps entries.
num_timesteps = 1000
betas = get_named_beta_schedule("linear", num_timesteps)
diffusion = GaussianDiffusion(betas=betas, model_mean_type=ModelMeanType.EPSILON, model_var_type=ModelVarType.FIXED_SMALL, loss_type=LossType.MSE)

# Synthetic stand-in images: Gaussian noise clamped to [-1, 1].
batch_size, channels, height, width = 8, 3, 256, 256
x_start = torch.randn(batch_size, channels, height, width, device=device).clamp(-1, 1)

# Noise every image in the batch at timestep 500 (directly in pixel space).
t = torch.full((batch_size,), 500, device=device, dtype=torch.long)
noise = torch.randn_like(x_start)
x_noisy = diffusion.q_sample(x_start=x_start, t=t, noise=noise)

# Resize the noised batch to 224x224 before handing it to the patch embedder.
resize_transform = transforms.Resize((224, 224))
x_noisy_resized = resize_transform(x_noisy)

embeddings = patch_embedding.process_noised_image(
    x_noisy_resized,
    img_size=224,
    patch_size=14,
    embed_dim=768,
)

# Report shapes and a 2x2 corner of the first channel of the first image.
print(
    f"Original image shape: {x_start.shape}",
    f"Noisy image shape: {x_noisy.shape}",
    f"Resized noisy image shape: {x_noisy_resized.shape}",
    f"Sample original values (first image, first channel, top-left 2x2):\n{x_start[0, 0, :2, :2]}",
    f"Sample noisy values (first image, first channel, top-left 2x2):\n{x_noisy_resized[0, 0, :2, :2]}",
    f"Patch embeddings shape: {embeddings.shape}",
    sep="\n",
)

3D REPRESENTATION [Point Cloud, Voxel, Mesh, Implicit Fields, etc.]