## **Small Dataset with sakib323/matmulfreellm (with rotary embedding + MoE)**

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install -U git+https://github.com/Sakib323/AI-Game-Engine.git
!pip install transformers
!pip install triton==3.2.0
!pip install datasets
!pip install wandb

In [None]:
import os
import torch
import wandb
from datasets import load_dataset
from transformers import (
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    AutoTokenizer
)
from mmfreelm.models import ( HGRNBitForCausalLM,HGRNBitModel, HGRNBitConfig)
import triton


# Read the Weights & Biases API key from the environment instead of embedding it in
# the notebook. When WANDB_API_KEY is unset this is None, and wandb.login() is left
# to resolve credentials on its own.
# NOTE(review): the key that was previously hard-coded on this line has been exposed
# and should be revoked in the W&B account settings.
WANDB_TOKEN = os.environ.get("WANDB_API_KEY")
wandb.login(key=WANDB_TOKEN)
# NOTE(review): this project name does not obviously match the language-model run
# configured below — confirm it is the intended W&B project.
os.environ["WANDB_PROJECT"] = "mesh-dit-3d-generation"


# Load the tokenizer published under the Sakib323/MMfreeLM-370M Hub repository.
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
# Reuse EOS as the padding token (presumably no dedicated pad token ships with this tokenizer — confirm).
tokenizer.pad_token = tokenizer.eos_token


# Load MetaMathQA without selecting a split; the "train" split is indexed further down.
demo_data = load_dataset("meta-math/MetaMathQA")

def tokenize_function(examples):
    """Turn a batch of MetaMathQA rows into fixed-length causal-LM features.

    Args:
        examples: Batched mapping with parallel "query", "original_question"
            and "response" sequences.

    Returns:
        The object returned by the module-level ``tokenizer`` for the rendered
        prompts, with an extra "labels" entry holding a shallow copy of
        "input_ids".
    """
    eos = tokenizer.eos_token

    # Render one training string per row and terminate it with the EOS marker.
    combined = []
    for query, original, response in zip(
        examples["query"], examples["original_question"], examples["response"]
    ):
        combined.append(
            f"Query: {query}\nOriginal Question: {original}\nResponse: {response}{eos}"
        )

    # Every sequence is truncated / padded to exactly 1024 tokens.
    encoding = tokenizer(combined, truncation=True, max_length=1024, padding="max_length")

    # Labels start out as the input ids themselves.
    # NOTE(review): padded positions are not masked here — presumably the data
    # collator used downstream takes care of that; confirm.
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

# Work on at most the first 300000 training rows; the cap is clamped to the split
# size so a shorter "train" split cannot make select() fail on a missing index.
demo_data_small = demo_data["train"].select(
    range(min(300000, len(demo_data["train"])))
)

# Tokenize in batches and drop the raw text columns that were folded into the prompt.
tokenized_dataset = demo_data_small.map(
    tokenize_function,
    batched=True,
    remove_columns=["query", "original_question", "response"]
)

# Hold out 10% for evaluation. The seed is fixed so the train/test partition is the
# same on every Restart & Run All instead of being reshuffled each run.
split_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42)


# Architecture hyper-parameters for the HGRNBit language model.
# NOTE(review): the notebook title and the W&B run name both mention rotary
# embeddings, yet rotary_embeddings is False here — confirm which is intended.
config = HGRNBitConfig(
    # vocabulary and padding id come from the tokenizer
    vocab_size=tokenizer.vocab_size,
    pad_token_id=tokenizer.pad_token_id,
    # backbone dimensions
    hidden_size=1024,
    num_hidden_layers=24,
    max_position_embeddings=2048,
    rms_norm_eps=1e-6,
    # attention-layer options
    attn_mode="fused_recurrent",
    use_short_conv=False,
    conv_size=4,
    # rotary position settings
    rotary_embeddings=False,
    use_ternary_rope=False,
    rope_theta=10000.0,
    # mixture-of-experts settings
    moe=True,
    num_experts=2,
    num_experts_per_tok=2,
    moe_intermediate_size=1024,
)

# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build a freshly initialised model from the config and move it to the chosen device.
model = HGRNBitForCausalLM(config)
model = model.to(device)
print(model)

# Trainer hyper-parameters; metrics are reported to Weights & Biases.
training_args = TrainingArguments(
    # output / logging / checkpointing
    output_dir="./output",
    overwrite_output_dir=True,
    run_name="HGRNBit-MMfreeLM-370M-with-rotary-embedding",
    report_to="wandb",
    logging_steps=100,
    save_steps=1000,
    # optimisation schedule: 3 sequences x 4 accumulation steps per device per update
    num_train_epochs=5,
    per_device_train_batch_size=3,
    gradient_accumulation_steps=4,
    learning_rate=4e-3,
    weight_decay=0.01,
    fp16=False,
)

# mlm=False: collate for language modeling without masked-LM token masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Wire the model, the two dataset splits and the collator into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=split_datasets["test"],
    train_dataset=split_datasets["train"],
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msakibahmed2018go[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

HGRNBitForCausalLM(
  (model): HGRNBitModel(
    (embeddings): Embedding(32000, 1024, padding_idx=2)
    (layers): ModuleList(
      (0-23): 24 x HGRNBitBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): HGRNBitAttention(
          (i_proj): FusedBitLinear(
            in_features=1024, out_features=1024, bias=False
            (norm): RMSNorm(1024, eps=1e-08)
          )
          (f_proj): FusedBitLinear(
            in_features=1024, out_features=1024, bias=False
            (norm): RMSNorm(1024, eps=1e-08)
          )
          (g_proj): FusedBitLinear(
            in_features=1024, out_features=1024, bias=False
            (norm): RMSNorm(1024, eps=1e-08)
          )
          (g_norm): FusedRMSNormSwishGate()
          (o_proj): FusedBitLinear(
            in_features=1024, out_features=1024, bias=False
            (norm): RMSNorm(1024, eps=1e-08)
          )
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): HGRNBitMoE(
          (experts): M

Step,Training Loss


In [None]:
import os

from huggingface_hub import login
from huggingface_hub import HfApi, HfFolder
from huggingface_hub import create_repo
from transformers import AutoModelForCausalLM, AutoTokenizer

# Authenticate with the Hugging Face Hub using a token taken from the HF_TOKEN
# environment variable rather than a literal in the notebook. When the variable is
# unset this passes token=None and login() is left to obtain credentials itself.
# NOTE(review): the token that was previously hard-coded on this line has been
# exposed and should be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))


# Keep a local copy of the trained weights and tokenizer files.
model.save_pretrained("MMfreeLM-370M")
tokenizer.save_pretrained("MMfreeLM-370M")


# exist_ok=True makes this cell safe to re-run once the repository already exists.
create_repo("MMfreeLM-370M", private=False, exist_ok=True)

# Upload the model and tokenizer to the public repository.
model.push_to_hub("MMfreeLM-370M")
tokenizer.push_to_hub("MMfreeLM-370M")

In [None]:
from mmfreelm.models import HGRNBitForCausalLM
import torch

# Reload the checkpoint from the Sakib323/MMfreeLM-370M Hub repository, replacing any
# `model` object left over from the training cell.
model = HGRNBitForCausalLM.from_pretrained("Sakib323/MMfreeLM-370M")
# Move to the GPU when CUDA is available; as the cell's last expression, the return
# value of .to() is also what the notebook displays.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the same Hub repository as the reloaded model.
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
# Reuse EOS as the padding token, matching the setup used for training above.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def generate_text(prompt, max_new_tokens=2000, temperature=0.7, top_p=0.9, do_sample=True):
    """Generate text for ``prompt`` with the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text to tokenize and feed to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        do_sample: Whether ``model.generate`` should sample.

    Returns:
        The first returned sequence decoded to a string with special tokens
        removed (presumably this includes the prompt itself — confirm).
    """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        # Take the stop id from the tokenizer instead of looking up the literal
        # "</s>", so generation still stops correctly if the EOS token is spelled
        # differently; this also keeps it consistent with pad_token_id below.
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=do_sample,
        top_p=top_p,
        temperature=temperature,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example prompt: run one generation with the default settings and print the decoded text.
prompt = "write code to create an addition function in python"
generated_text = generate_text(prompt)
print(generated_text)
