In [1]:
!pip install -U git+https://github.com/Sakib323/AI-Game-Engine.git
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers
!pip install triton==3.2.0
!pip install datasets
!pip install wandb
!pip install -U datasets
!pip install triton==2.3.1

Collecting git+https://github.com/Sakib323/AI-Game-Engine.git
  Cloning https://github.com/Sakib323/AI-Game-Engine.git to /tmp/pip-req-build-m5qc3z_z
  Running command git clone --filter=blob:none --quiet https://github.com/Sakib323/AI-Game-Engine.git /tmp/pip-req-build-m5qc3z_z
  Resolved https://github.com/Sakib323/AI-Game-Engine.git to commit f84e797f9e7ed0e80cbc87f78c5edb9fdaa9ef84
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ninja (from ai_game_engine==0.1)
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9.0->ai_game_engine==0.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9.0->ai_game_engine==0.1)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.1

In [2]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
from diffusers import AutoencoderKL
from diffusion_model import GaussianDiffusion, ModelMeanType, ModelVarType, LossType, get_named_beta_schedule
from timm.models.vision_transformer import PatchEmbed
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F

# --- Config ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


image_size = 512
latent_downscale = 8
latent_size = image_size // latent_downscale  # 64
patch_size = 4
patch_dim = 4 * patch_size * patch_size      # 64
embed_dim = 1024
batch_size = 8

vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="vae").to(device).eval()
betas = get_named_beta_schedule("linear", 1000)
diffusion = GaussianDiffusion(betas=betas, model_mean_type=ModelMeanType.EPSILON, model_var_type=ModelVarType.FIXED_SMALL, loss_type=LossType.MSE)
patch_embed = PatchEmbed(img_size=latent_size, patch_size=patch_size, in_chans=4, embed_dim=embed_dim).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
tokenizer.pad_token = tokenizer.eos_token


transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])



def preprocess(example):
    # Image -> latent
    image = transform(example['image'].convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
        latents = F.interpolate(latents, size=(latent_size, latent_size), mode='bilinear')
    # Add noise
    t = torch.randint(0, 1000, (1,), device=device).long()
    noise = torch.randn_like(latents)
    x_noisy = diffusion.q_sample(x_start=latents, t=t, noise=noise)
    # Flatten patches via unfold: [1,4,64,64] -> [1,256,64]
    patches = F.unfold(x_noisy, kernel_size=patch_size, stride=patch_size)  # [1,64,256]
    patches = patches.permute(0, 2, 1).squeeze(0)  # [256,64]
    # Tokenize text
    tokens = tokenizer(example['text'], padding='max_length', truncation=True,
                       max_length=2048, return_tensors='pt')
    return {
        'patch_embeddings': patches.cpu(),
        'noise': noise.squeeze(0).cpu(),
        'timestep': t.item(),
        'input_ids': tokens['input_ids'].squeeze(0),
        'attention_mask': tokens['attention_mask'].squeeze(0)
    }




dataset = load_dataset("iamkaikai/GAME-MAP-ART", split="train").select(range(10))
processed = [preprocess(ex) for ex in dataset]



# Dataset and DataLoader
class DiTDataset(torch.utils.data.Dataset):
    def __init__(self, data): self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx): return self.data[idx]

def collate_fn(batch):
    return {
        'patch_embeddings': torch.stack([x['patch_embeddings'] for x in batch]),
        'noise': torch.stack([x['noise'] for x in batch]),
        'timestep': torch.tensor([x['timestep'] for x in batch], dtype=torch.long),
        'input_ids': torch.stack([x['input_ids'] for x in batch]),
        'attention_mask': torch.stack([x['attention_mask'] for x in batch]),
    }

dataloader = DataLoader(DiTDataset(processed), batch_size=batch_size,
                        shuffle=True, num_workers=4, collate_fn=collate_fn)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.51M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/318 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/67.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1101 [00:00<?, ? examples/s]



In [3]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from mmfreelm.models import (HGRNBitModel, HGRNBitConfig, TerneryDit)
from mmfreelm.modules import RMSNorm
from mmfreelm.ops.fusedbitnet import FusedBitLinear as BitLinear

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and set example values
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
tokenizer.pad_token = tokenizer.eos_token

# Dimensions must match between patch embedding and diffusion model
embed_dim = 1024  # Must match the patch_embed output dimension
latent_size = 64   # For 512px image with 8x downscale (512/8=64)
patch_size = 4

text_config = HGRNBitConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=512,
    num_hidden_layers=8,
    attn_mode="fused_recurrent",
    rotary_embeddings=True,
    condition_dim=128
)
diffusion_config = HGRNBitConfig(
    hidden_size=embed_dim,
    num_hidden_layers=8,
    attn_mode="fused_recurrent",
    rotary_embeddings=True,
    max_position_embeddings=(latent_size // patch_size) ** 2,
    condition_dim=text_config.hidden_size + embed_dim
)
model = TerneryDit(
    text_config=text_config,
    diffusion_config=diffusion_config,
    num_timesteps=1000,
    patch_dim=patch_dim
).to(device)
model.text_model.gradient_checkpointing_enable()
model.diffusion_model.gradient_checkpointing_enable()


criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
scaler = torch.cuda.amp.GradScaler()
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for i, batch in enumerate(dataloader):
        patch_emb = batch['patch_embeddings'].to(device)
        timesteps = batch['timestep'].to(device)
        input_ids = batch['input_ids'].to(device)
        attn_mask = batch['attention_mask'].to(device)
        target_noise = batch['noise'].to(device)

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast():
            pred_noise = model(
                patch_embeddings=patch_emb,
                timesteps=timesteps,
                input_ids=input_ids,
                attention_mask=attn_mask
            )
            # [B,256,64] -> reconstruct [B,4,64,64]
            B, N, C = pred_noise.shape
            pred_noise = pred_noise.permute(0, 2, 1).reshape(B, C, -1)
            pred_noise = F.fold(pred_noise, output_size=(latent_size, latent_size),
                                kernel_size=patch_size, stride=patch_size)
            loss = criterion(pred_noise, target_noise)
        scaler.scale(loss).backward()
        # Gradient accumulation / step
        if (i + 1) % 4 == 0 or i == len(dataloader) - 1:
            scaler.step(optimizer)
            scaler.update()
            torch.cuda.empty_cache()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Initializing RotaryEmbedding with theta=10000.0 and ternary=True

[RotaryEmbedding] Initialized with: dim=512, max_pos=2048, base=10000.0, ternary=True

Initializing RotaryEmbedding with theta=10000.0 and ternary=True

[RotaryEmbedding] Initialized with: dim=512, max_pos=2048, base=10000.0, ternary=True

Initializing RotaryEmbedding with theta=10000.0 and ternary=True

[RotaryEmbedding] Initialized with: dim=512, max_pos=2048, base=10000.0, ternary=True

Initializing RotaryEmbedding with theta=10000.0 and ternary=True

[RotaryEmbedding] Initialized with: dim=512, max_pos=2048, base=10000.0, ternary=True

Initializing RotaryEmbedding with theta=10000.0 and ternary=True

[RotaryEmbedding] Initialized with: dim=512, max_pos=2048, base=10000.0, ternary=True

Initializing RotaryEmbedding with theta=10000.0 and ternary=True

[RotaryEmbedding] Initialized with: dim=512, max_pos=2048, base=10000.0, ternary=True

Initializing RotaryEmbedding with theta=10000.0 and ternary=True

[RotaryEmbedding

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


AttributeError: 'tuple' object has no attribute 'shape'

Additional Snippet

In [None]:
import torch as th
import numpy as np
from diffusion_model import GaussianDiffusion, ModelMeanType, ModelVarType, LossType, get_named_beta_schedule
from diffusers import AutoencoderKL
import matplotlib.pyplot as plt

device = "cuda" if th.cuda.is_available() else "cpu"
vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="vae").to(device)
vae.eval()

num_timesteps = 1000
betas = get_named_beta_schedule("linear", num_timesteps)
diffusion = GaussianDiffusion(
    betas=betas,
    model_mean_type=ModelMeanType.EPSILON,
    model_var_type=ModelVarType.FIXED_SMALL,
    loss_type=LossType.MSE
)

batch_size, channels, height, width = 8, 3, 256, 256
x_start = th.randn((batch_size, channels, height, width), device=device)
x_start = th.clamp(x_start, -1, 1)
with th.no_grad():
    latent_dist = vae.encode(x_start).latent_dist
    latent = latent_dist.sample() * vae.config.scaling_factor
t = th.tensor([500] * batch_size, device=device, dtype=th.long)
noise = th.randn_like(latent)
x_noisy = diffusion.q_sample(x_start=latent, t=t, noise=noise)


In [None]:
import torch
import numpy as np
from diffusion_model import GaussianDiffusion, ModelMeanType, ModelVarType, LossType, get_named_beta_schedule
import patch_embedding
import torchvision.transforms as transforms
import matplotlib.pyplot as plt


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_timesteps = 1000
betas = get_named_beta_schedule("linear", num_timesteps)
diffusion = GaussianDiffusion(
    betas=betas,
    model_mean_type=ModelMeanType.EPSILON,
    model_var_type=ModelVarType.FIXED_SMALL,
    loss_type=LossType.MSE
)

batch_size, channels, height, width = 8, 3, 256, 256
x_start = torch.randn((batch_size, channels, height, width), device=device)

x_start = torch.clamp(x_start, -1, 1)

t = torch.tensor([500] * batch_size, device=device, dtype=torch.long)
noise = torch.randn_like(x_start)
x_noisy = diffusion.q_sample(x_start=x_start, t=t, noise=noise)

resize_transform = transforms.Resize((224, 224))
x_noisy_resized = resize_transform(x_noisy)

embeddings = patch_embedding.process_noised_image(x_noisy_resized, img_size=224, patch_size=14, embed_dim=768)

print(f"Original image shape: {x_start.shape}")
print(f"Noisy image shape: {x_noisy.shape}")
print(f"Resized noisy image shape: {x_noisy_resized.shape}")
print(f"Sample original values (first image, first channel, top-left 2x2):\n{x_start[0, 0, :2, :2]}")
print(f"Sample noisy values (first image, first channel, top-left 2x2):\n{x_noisy_resized[0, 0, :2, :2]}")
print(f"Patch embeddings shape: {embeddings.shape}")

3D REPRESENTATION [Point Cloud, Voxel, Mesh, Implicit Fields,ETC]