## import

In [30]:
import torch
import torch.nn as nn
from torch.nn import functional as f
torch.manual_seed(42)

<torch._C.Generator at 0x7f8364a20f50>

## Load The Data

In [31]:
# we will download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/AviSoori1x/makeMoE/main/input.txt


--2025-10-20 04:37:55--  https://raw.githubusercontent.com/AviSoori1x/makeMoE/main/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2025-10-20 04:37:55 (31.6 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



## Define Experts

In [32]:
class Expert(nn.Module):
  def __init__(self,n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.GELU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout),
    )


  def forward(self,x):
    return self.net(x)

## Impliment The Router

In [33]:
num_expert=4
top_k=3
n_embed = 32


# Example
mh_output = torch.rand(2,4,n_embed)
topkgate_linear = nn.Linear(n_embed, num_expert)  # 32 x 4
logist = topkgate_linear(mh_output)

print(logist)

tensor([[[-0.2417, -0.1422, -0.8395,  0.0696],
         [-0.0817, -0.0317, -0.8352, -0.0116],
         [-0.0519,  0.2970, -0.9370, -0.1549],
         [-0.0545, -0.0121, -0.6304, -0.1077]],

        [[-0.0450,  0.3167, -0.3820,  0.0872],
         [-0.0316,  0.1769, -1.0049,  0.2885],
         [ 0.0063,  0.3378, -0.6606,  0.1216],
         [-0.0663, -0.0898, -0.5614, -0.1854]]], grad_fn=<ViewBackward0>)


## Load Balancing

In [34]:
topk_logist, topk_indices = logist.topk(top_k,dim=1)
topk_logist, topk_indices

(tensor([[[-0.0519,  0.2970, -0.6304,  0.0696],
          [-0.0545, -0.0121, -0.8352, -0.0116],
          [-0.0817, -0.0317, -0.8395, -0.1077]],
 
         [[ 0.0063,  0.3378, -0.3820,  0.2885],
          [-0.0316,  0.3167, -0.5614,  0.1216],
          [-0.0450,  0.1769, -0.6606,  0.0872]]], grad_fn=<TopkBackward0>),
 tensor([[[2, 2, 3, 0],
          [3, 3, 1, 1],
          [1, 1, 0, 3]],
 
         [[2, 2, 0, 1],
          [1, 0, 3, 2],
          [0, 1, 2, 0]]]))

## -infinity And Apply Softmax

In [35]:
from math import inf
zeros = torch.full_like(logist,float('-inf'))
sparse_logist = zeros.scatter(-1, topk_indices, topk_logist)
sparse_logist

tensor([[[ 0.0696,    -inf,  0.2970, -0.6304],
         [   -inf, -0.0116,    -inf, -0.0121],
         [-0.8395, -0.0317,    -inf, -0.1077],
         [   -inf,    -inf,    -inf,    -inf]],

        [[-0.3820,  0.2885,  0.3378,    -inf],
         [ 0.3167, -0.0316,  0.1216, -0.5614],
         [ 0.0872,  0.1769, -0.6606,    -inf],
         [   -inf,    -inf,    -inf,    -inf]]], grad_fn=<ScatterBackward0>)

In [36]:
# inplace of inf we are putting zeros
getting_output = f.softmax(sparse_logist, dim=1)
getting_output

tensor([[[0.7128, 0.0000, 1.0000, 0.2201],
         [0.0000, 0.5050, 0.0000, 0.4086],
         [0.2872, 0.4950, 0.0000, 0.3713],
         [0.0000, 0.0000, 0.0000, 0.0000]],

        [[0.2169, 0.3816, 0.4600, 0.0000],
         [0.4363, 0.2771, 0.3706, 1.0000],
         [0.3468, 0.3413, 0.1695, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000]]], grad_fn=<SoftmaxBackward0>)

## Class for Topk Routing

In [37]:
# First define the top k router module
class TopkRouter(nn.Module):
    def __init__(self, n_embed, num_experts, top_k):
        super(TopkRouter, self).__init__()
        self.top_k = top_k
        self.linear = nn.Linear(n_embed, num_experts)

    def forward(self, mh_output):
        # mh_output is the output tensor from multihead self-attention block
        logits = self.linear(mh_output)
        top_k_logits, indices = logits.topk(self.top_k, dim=-1)
        zeros = torch.full_like(logits, float('-inf'))
        sparse_logits = zeros.scatter(-1, indices, top_k_logits)
        router_output = f.softmax(sparse_logits, dim=-1)
        return router_output, indices


In [38]:
num_experts=3
top_k=2
n_embd=8

# Example
mh_output = torch.rand(1,4,n_embed)
top_k_gate =TopkRouter(n_embed, num_expert, top_k)
getting_output, indices = top_k_gate(mh_output)

getting_output.shape, getting_output, indices

(torch.Size([1, 4, 4]),
 tensor([[[0.0000, 0.5049, 0.4951, 0.0000],
          [0.4959, 0.0000, 0.5041, 0.0000],
          [0.6062, 0.0000, 0.3938, 0.0000],
          [0.5327, 0.0000, 0.4673, 0.0000]]], grad_fn=<SoftmaxBackward0>),
 tensor([[[1, 2],
          [2, 0],
          [0, 2],
          [0, 2]]]))

## Noisy Top K

In [39]:
class NoisyTopkRouter(nn.Module):
    def __init__(self, n_embed, num_experts, top_k):
        super(NoisyTopkRouter, self).__init__()
        self.top_k = top_k

        # layer for router logits
        self.topkroute_linear = nn.Linear(n_embed, num_experts)
        self.noise_linear = nn.Linear(n_embed, num_experts)

    def forward(self, mh_output):
        # mh_output is the output tensor from multihead self attention block
        logits = self.topkroute_linear(mh_output)

        # Noise logits
        noise_logits = self.noise_linear(mh_output)

        # Adding scaled unit Gaussian noise to the logits
        noise = torch.randn_like(logits) * f.softplus(noise_logits)
        noisy_logits = logits + noise

        top_k_logits, indices = noisy_logits.topk(self.top_k, dim=-1)
        zeros = torch.full_like(noisy_logits, float('-inf'))
        sparse_logits = zeros.scatter(-1, indices, top_k_logits)
        router_output = f.softmax(sparse_logits, dim=-1)
        return router_output, indices


In [40]:
# Testing this out, again:
num_experts = 3
top_k = 2
n_embd = 8

mh_output = torch.randn(1, 4, n_embd)  # Example input
noisy_top_k_gate = NoisyTopkRouter(n_embd, num_experts, top_k)
gating_output, indices = noisy_top_k_gate(mh_output)

gating_output.shape, gating_output, indices
# ✅ It works!!


(torch.Size([1, 4, 3]),
 tensor([[[0.0000, 0.6529, 0.3471],
          [0.1302, 0.0000, 0.8698],
          [0.4382, 0.0000, 0.5618],
          [0.5730, 0.4270, 0.0000]]], grad_fn=<SoftmaxBackward0>),
 tensor([[[1, 2],
          [2, 0],
          [2, 0],
          [0, 1]]]))

In [41]:
class SparseMoE(nn.Module):
    def __init__(self, n_embed, num_experts, top_k):
        super(SparseMoE, self).__init__()
        self.router = NoisyTopkRouter(n_embed, num_experts, top_k)
        self.experts = nn.ModuleList([Expert(n_embed) for _ in range(num_experts)])
        self.top_k = top_k

    def forward(self, x):
        gating_output, indices = self.router(x)
        final_output = torch.zeros_like(x)

        # Reshape inputs for batch processing
        flat_x = x.view(-1, x.size(-1))
        flat_gating_output = gating_output.view(-1, gating_output.size(-1))

        # Process each expert in parallel
        for i, expert in enumerate(self.experts):
            # Create a mask for the inputs where the current expert is in top-k
            expert_mask = (indices == i).any(dim=-1)
            flat_mask = expert_mask.view(-1)

            if flat_mask.any():
                expert_input = flat_x[flat_mask]
                expert_output = expert(expert_input)

                # Extract and apply gating scores
                gating_scores = flat_gating_output[flat_mask, i].unsqueeze(1)
                weighted_output = expert_output * gating_scores

                # Update final output additively by indexing and adding
                final_output[expert_mask] += weighted_output.squeeze(1)

        return final_output


In [42]:
# Let's test this out
num_experts = 3
top_k = 2
n_embd = 8
dropout = 0.1

mh_output = torch.randn(1, 4, n_embd)  # Example multi-head attention output
sparse_moe = SparseMoE(n_embd, num_experts, top_k)
final_output = sparse_moe(mh_output)

print("Shape of the final output:", final_output.shape)
print(final_output)


Shape of the final output: torch.Size([1, 4, 8])
tensor([[[ 0.0376, -0.0033, -0.0288,  0.0254,  0.3026,  0.3510, -0.1571,
          -0.2406],
         [ 0.1961, -0.0018, -0.1411, -0.0173,  0.2917, -0.0550, -0.0507,
          -0.0425],
         [-0.0573, -0.0112,  0.0427, -0.0455,  0.0413,  0.1064, -0.1094,
          -0.0904],
         [-0.0945, -0.1488, -0.0067, -0.1033, -0.1375,  0.0000,  0.0921,
          -0.2591]]], grad_fn=<IndexPutBackward0>)


## Putting all together

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class NoisyTopkRouter(nn.Module):
    def __init__(self, n_embed, num_experts, top_k):
        super(NoisyTopkRouter, self).__init__()
        self.top_k = top_k
        # layer for router logits
        self.topkroute_linear = nn.Linear(n_embed, num_experts)
        self.noise_linear = nn.Linear(n_embed, num_experts)

    def forward(self, mh_output):
        # mh_output is the output tensor from multihead self attention block
        logits = self.topkroute_linear(mh_output)

        # Noise logits
        noise_logits = self.noise_linear(mh_output)

        # Adding scaled unit Gaussian noise to the logits
        noise = torch.randn_like(logits) * F.softplus(noise_logits)
        noisy_logits = logits + noise

        top_k_logits, indices = noisy_logits.topk(self.top_k, dim=-1)
        zeros = torch.full_like(noisy_logits, float('-inf'))
        sparse_logits = zeros.scatter(-1, indices, top_k_logits)
        router_output = F.softmax(sparse_logits, dim=-1)
        return router_output, indices



class Expert(nn.Module):
    def __init__(self, n_embed):
        super(Expert, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, n_embed),
            nn.ReLU(),
            nn.Linear(n_embed, n_embed)
        )

    def forward(self, x):
        return self.net(x)


class SparseMoE(nn.Module):
    def __init__(self, n_embed, num_experts, top_k):
        super(SparseMoE, self).__init__()
        self.router = NoisyTopkRouter(n_embed, num_experts, top_k)
        self.experts = nn.ModuleList([Expert(n_embed) for _ in range(num_experts)])
        self.top_k = top_k

    def forward(self, x):
        gating_output, indices = self.router(x)
        final_output = torch.zeros_like(x)

        # Reshape inputs for batch processing
        flat_x = x.view(-1, x.size(-1))
        flat_gating_output = gating_output.view(-1, gating_output.size(-1))

        # Process each expert in parallel
        for i, expert in enumerate(self.experts):
            # Create a mask for the inputs where the current expert is in top-k
            expert_mask = (indices == i).any(dim=-1)
            flat_mask = expert_mask.view(-1)

            if flat_mask.any():
                expert_input = flat_x[flat_mask]
                expert_output = expert(expert_input)

                # Extract and apply gating scores
                gating_scores = flat_gating_output[flat_mask, i].unsqueeze(1)
                weighted_output = expert_output * gating_scores

                # Update final output additively by indexing and adding
                final_output[expert_mask] += weighted_output.squeeze(1)

        return final_output

Shape of the final output: torch.Size([1, 4, 8])
tensor([[[-0.3353,  0.1319, -0.3851,  0.2569, -0.4109, -0.3995,  0.2535,
          -0.0176],
         [-0.2706, -0.0220,  0.0683,  0.1375, -0.1583, -0.3143, -0.0646,
          -0.0106],
         [-0.1619, -0.1402, -0.3502,  0.2790, -0.0967,  0.0784, -0.2770,
           0.2086],
         [-0.2057, -0.0115, -0.1763,  0.3149, -0.0429, -0.1158, -0.0741,
          -0.0524]]], grad_fn=<IndexPutBackward0>)


In [44]:
num_experts = 3
top_k = 2
n_embd = 8
dropout = 0.1

mh_output = torch.randn(1, 4, n_embd)  # Example multi-head attention output
sparse_moe = SparseMoE(n_embd, num_experts, top_k)
final_output = sparse_moe(mh_output)

print("Shape of the final output:", final_output.shape)
print(final_output)


Shape of the final output: torch.Size([1, 4, 8])
tensor([[[ 0.1626,  0.0525,  0.2134,  0.3373, -0.0790,  0.0268, -0.2781,
          -0.1298],
         [ 0.2976, -0.0684,  0.0317, -0.0590, -0.0747,  0.2387, -0.1330,
           0.0526],
         [ 0.0297,  0.0705,  0.0088,  0.1307, -0.0159,  0.2565, -0.1354,
          -0.1547],
         [ 0.2979, -0.0690, -0.0143, -0.3176, -0.4144,  0.5400,  0.0038,
          -0.1146]]], grad_fn=<IndexPutBackward0>)


In [45]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, n_embed, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)      # (B, T, C)
        q = self.query(x)    # (B, T, C)

        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C**-0.5      # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)                 # (B, T, T)
        wei = self.dropout(wei)

        # perform the weighted aggregation of the values
        v = self.value(x)                            # (B, T, C)
        out = wei @ v                                # (B, T, C)
        return out


In [46]:
# Multi-Headed Self Attention
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, n_embed, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(n_embed, head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


## Transformer Block

In [47]:
class Block(nn.Module):
    """ Mixture of Experts Transformer block: communication followed by computation (multi-head self attention) """

    def __init__(self, n_embed, n_head, num_experts, top_k):
        # n_embed: embedding dimension, n_head: number of heads we'd like
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_embed, n_head, head_size)
        self.smoe = SparseMoE(n_embed, num_experts, top_k)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.smoe(self.ln2(x))
        return x


## Model Arch

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# assume these globals are defined somewhere in your script
# vocab_size, n_embed, block_size, n_head, n_layer, num_experts, top_k, device = ...
# and classes: Block (uses MultiHeadAttention + SparseMoE)

class SparseMoELanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table   = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(
            *[Block(n_embed, n_head, num_experts=num_experts, top_k=top_k)
              for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embed)            # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)                         # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb                                             # (B,T,C)
        x = self.blocks(x)                                                # (B,T,C)
        x = self.ln_f(x)                                                  # (B,T,C)
        logits = self.lm_head(x)                                          # (B,T,V)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """
        Autoregressively sample next tokens.
        idx: LongTensor of shape (B, T)
        """
        self.eval()
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]                 # (B, T_ctx)
            logits, _ = self(idx_cond)                      # (B, T_ctx, V)
            logits = logits[:, -1, :]                       # (B, V)
            probs = F.softmax(logits, dim=-1)               # (B, V)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat([idx, idx_next], dim=1)         # (B, T+1)
        return idx


## Trainig and testing data

In [49]:
import torch

torch.manual_seed(1337)

# Read the dataset
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# --- Create character-level vocabulary ---
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Vocab size:", vocab_size)

# Mappings (char ↔ index)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]                # string → list of ints
decode = lambda l: ''.join([itos[i] for i in l])       # list of ints → string

# --- Train / Validation Split ---
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # first 90% train, rest validation
train_data = data[:n]
val_data = data[n:]

# --- Dataloader Function ---
def get_batch(split):
    """Generate a small batch of data for inputs (x) and targets (y)."""
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


Vocab size: 65


## Define LLM loss

In [50]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


## Training loop pararms and hyper params

In [51]:
# Step 14: Define training loop parameters and other hyperparameters
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

# ----------------
# Hyperparameters
# ----------------
batch_size = 16          # how many independent sequences will we process in parallel?
block_size = 32          # what is the maximum context length for predictions?
max_iters = 200          # total training iterations  increase this if you want accurate result 60k 100k
eval_interval = 100      # evaluate the model every N steps
learning_rate = 1e-3     # optimizer learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 400         # number of iterations for evaluation
head_size = 16
n_embed = 128
n_head = 8
n_layer = 8
dropout = 0.1
num_experts = 8
top_k = 2


## Init Model

In [52]:
def kaiming_init_weights(m):
    if isinstance(m, nn.Linear):
        init.kaiming_normal_(m.weight)

model = SparseMoELanguageModel()
model.apply(kaiming_init_weights)


SparseMoELanguageModel(
  (token_embedding_table): Embedding(65, 128)
  (position_embedding_table): Embedding(32, 128)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (key): Linear(in_features=128, out_features=16, bias=False)
            (query): Linear(in_features=128, out_features=16, bias=False)
            (value): Linear(in_features=128, out_features=16, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (smoe): SparseMoE(
        (router): NoisyTopkRouter(
          (topkroute_linear): Linear(in_features=128, out_features=8, bias=True)
          (noise_linear): Linear(in_features=128, out_features=8, bias=True)
        )
        (experts): ModuleList(
          (0-7): 8 x Expert(
            (net): Sequential(
             

## Run Loop

In [53]:
# Step 16: Run the pre-training loop

# move model to device
m = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters()) / 1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


2.680513 M parameters
step 0: train loss 5.2074, val loss 5.2143
step 100: train loss 2.7442, val loss 2.7565
step 199: train loss 2.5168, val loss 2.5133


## Infrance


In [54]:
# Step 17: Inference

# generate from the model. Not great. Not too bad either
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))



 tanth moreve thst uredazint I heer or amanee sts the
Or shtilord t.
ano hee:
Se herxher porste thenk shore pan ar wegerece.

Ther hamy tis,
CAGENAwdov:
DETUNERE:
WMIOur  s ham:
IGpUC:
DN:
Fot slan nkio y sthereixe t durst-parcattluut,
Anes mevil musers torsheromanor mes ghy t;:
Areotow gores ye fovisu r, hae ssirghof.

Sowes st thesthafipree w :
We fhe thure myoofond y fuavat y:
A:
fout y, theyt haaC, un theutiny thereinof uren ge t thanoour pu oumee.
Fevey re:
Wanoy wzuavere hiceng ard prseroof te Eru lmt pu mupe men lanerenous fopr hane wsterser h menof're zhan
IN sino mer walsthas t Githe str;

ITiUForur ms t BowofEseep wink avesow atr f te, lle inin wnger,
Pmincos thavanksu tbeant totCred,

By as A:
OAn wureearyode allancoyo bory th,
Aser torene'n, tupusrson manven n oraknd wis m'earar chaus y'd-han t wown be, here sourd,
Whocororfaw my phee, erese prire,
Therandinch'le bente ay ow thin isf h manan kst setof it m:
Clisowecir ther ntheeut tstherouff t t kind wad
Bu bur s ay wo Hur

---

## 🧪 Notes & Recommendations

This project is built for **experimentation and research demonstration**.  
If you want **higher accuracy or more stable results**, try the following:

- 🔁 **Increase training iterations** — Run `max_iters` between **60K–100K** if you have a **high-end GPU (RTX 4090 / A100 / H100)**.  
- ⚙️ **Tune hyperparameters** — Experiment with `learning_rate`, `num_experts`, `top_k`, and `n_embed` for your dataset scale.
- 💾 **Save and share your trained weights** — Push the best-performing checkpoints to **[Hugging Face Hub](https://huggingface.co)** for community use.
- 💻 **Deploy interactively** — Create a **Gradio** or **Streamlit** web interface to chat or generate text directly from the model.

---

✅ *This notebook is a complete end-to-end implementation — from dataset to inference.*  
💡 *If you enjoyed this project or found it useful, please consider giving it an ⭐ on GitHub or an upvote on Kaggle!*

**Thank you!**
