## **Small Dataset with sakib323/matmulfreellm (with rotary embedding + MoE)**

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install -U git+https://github.com/Sakib323/AI-Game-Engine.git
!pip install transformers
!pip install triton==3.2.0
!pip install datasets
!pip install wandb

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m109.6 MB/s[0m eta [36

In [None]:
import os
import torch
import wandb
from datasets import load_dataset, Dataset
from transformers import (Trainer, TrainingArguments, DataCollatorForLanguageModeling,AutoTokenizer)
from mmfreelm.models import ( HGRNBitForCausalLM,HGRNBitModel, HGRNBitConfig)
import triton


# SECURITY: API keys must not be committed to the notebook. The key is read from
# the WANDB_API_KEY environment variable (e.g. a Kaggle/Colab secret). The key
# that used to be hardcoded on this line has been exposed and should be revoked.
WANDB_TOKEN = os.environ.get("WANDB_API_KEY", "")
# Pass None rather than an empty string so wandb can fall back to its own
# credential lookup when the variable is not set.
wandb.login(key=WANDB_TOKEN or None)
os.environ["WANDB_PROJECT"] = "mesh-dit-3d-generation"


# Load the tokenizer published in the MMfreeLM-370M repo and alias its pad
# token to the EOS token.
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
tokenizer.pad_token = tokenizer.eos_token

# Open the FineWeb train split in streaming mode (no full download up front).
streamed_dataset = load_dataset("HuggingFaceFW/fineweb", name="default", split="train", streaming=True)

# Materialise only the first 10,000 streamed records as a regular Dataset.
demo_data = Dataset.from_list(list(streamed_dataset.take(10000)))

def tokenize_function(examples):
    """Tokenize a batch of raw documents for causal language modelling.

    The tokenizer's EOS token is appended to every string in
    ``examples["text"]``; the batch is then tokenized with truncation and
    ``max_length`` padding at 1024 tokens. A ``labels`` entry holding a copy
    of the ``input_ids`` list is added to the result.

    Args:
        examples: Batch mapping with a ``"text"`` list of strings.

    Returns:
        The tokenizer output with the additional ``labels`` entry.
    """
    eos = tokenizer.eos_token
    documents = []
    for text in examples["text"]:
        documents.append(text + eos)

    encoded = tokenizer(
        documents,
        padding="max_length",
        max_length=1024,
        truncation=True,
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded



# Tokenize the sampled documents in batches, dropping the raw "text" column.
tokenized_dataset = demo_data.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Hold out 10% of the examples for evaluation. The seed is fixed so that the
# train/eval split (and therefore the eval loss curve) is reproducible when the
# notebook is re-run.
split_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42)


# Model configuration for the small-dataset run.
# NOTE(review): the notebook title and the W&B run name both say "with rotary
# embedding", but `rotary_embeddings=False` is set below — confirm which one is
# intended.
config = HGRNBitConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=1024,
    num_hidden_layers=24,
    max_position_embeddings=2048,
    attn_mode="fused_recurrent",
    use_short_conv=False,
    conv_size=4,
    rms_norm_eps=1e-6,
    pad_token_id=tokenizer.pad_token_id,
    rope_theta=10000.0,
    use_ternary_rope=False,
    rotary_embeddings=False,
    moe=True,
    # NOTE(review): num_experts_per_tok equals num_experts, so presumably every
    # token is routed to both experts — confirm this is intended.
    num_experts=2,
    num_experts_per_tok=2,
    moe_intermediate_size=1024,
)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HGRNBitForCausalLM(config).to(device)


# Disabled: load weights from a previously saved safetensors checkpoint.
# NOTE: `safetensors_load` is not imported in this cell; add the import before
# re-enabling these two lines.
#state_dict_p1 = safetensors_load("/kaggle/input/meshdit-trained-model/model.safetensors", device="cpu")
#model.load_state_dict(state_dict_p1)
print(model)

# Trainer hyper-parameters for this run; metrics are reported to Weights & Biases.
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    # 3 sequences per device x 4 accumulation steps = 12 sequences per
    # optimizer step on each device.
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    # Evaluate on the held-out split every 100 steps.
    eval_strategy="steps",          
    eval_steps=100,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=4e-3,
    weight_decay=0.00,
    logging_steps=100,
    save_steps=1000,
    fp16=False,
    # NOTE(review): the run name advertises rotary embeddings while the config
    # in this cell sets rotary_embeddings=False — confirm.
    run_name="HGRNBit-MMfreeLM-370M-with-rotary-embedding",
    report_to="wandb",
)

# Collator for causal (non-masked) language modelling.
# NOTE(review): presumably this collator rebuilds `labels` from `input_ids`
# itself, which would make the `labels` column from tokenize_function
# redundant — verify against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Train on the 90% split and evaluate on the 10% hold-out.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    data_collator=data_collator,
)

trainer.train()


## **VRAM 48GB GPU WITH 3B PARAM**

In [None]:
from huggingface_hub import create_repo
import os

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of pasting it into the notebook. When the variable is unset, None is
# passed so huggingface_hub falls back to the credentials saved by an earlier
# login rather than sending an empty token.
token = os.environ.get("HF_TOKEN") or None
create_repo("Sakib323/TerLM3B", token=token, private=True, exist_ok=True)
print("Repository confirmed!")

Repository confirmed!


In [None]:
import os
import json
import torch
import wandb
import gc
import multiprocessing
import shutil
from pathlib import Path
from datasets import load_dataset
from transformers import (
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling,
    AutoTokenizer
)
from huggingface_hub import login, HfApi, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from mmfreelm.models import HGRNBitForCausalLM, HGRNBitConfig
import psutil

# SECURITY: credentials are read from the environment (e.g. Kaggle/Colab
# secrets) instead of being written into the notebook. The W&B key that used to
# be hardcoded here has been exposed and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
WANDB_TOKEN = os.environ.get("WANDB_API_KEY", "")
REPO_ID = "Sakib323/TerLM3B"
LOCAL_MODEL_DIR = "./local_model_storage"

# Login
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    # Do not call login() with an empty token; rely on previously saved
    # credentials instead.
    print(">>> HF_TOKEN is not set; using any cached Hugging Face credentials.")
# Pass None rather than "" so wandb can fall back to its own credential lookup.
wandb.login(key=WANDB_TOKEN or None)

os.environ["WANDB_PROJECT"] = "HGRNBit-3B-Pretrain"
os.environ["WANDB_WATCH"] = "gradients"


# Memory diagnostics helper.
def print_memory_stats(step_name):
    """Print a labelled snapshot of GPU and system memory usage.

    Args:
        step_name: Label placed in the ``[MEMORY - ...]`` header line.

    The two VRAM lines (allocated and reserved, as reported by ``torch.cuda``)
    are printed only when CUDA is available. The system RAM line comes from
    ``psutil.virtual_memory()`` and is always printed. All figures are in GiB.
    """
    gib = 1024 ** 3

    # The header is printed unconditionally so that the RAM line is still
    # labelled with the step name on hosts without CUDA.
    print(f"\n[MEMORY - {step_name}]")

    if torch.cuda.is_available():
        # VRAM
        allocated = torch.cuda.memory_allocated() / gib
        reserved = torch.cuda.memory_reserved() / gib
        print(f"  > VRAM (Tensor Data): {allocated:.2f} GB")
        print(f"  > VRAM (Reserved/Cache): {reserved:.2f} GB")

    # SYSTEM RAM
    ram = psutil.virtual_memory()
    print(f"  > Sys RAM Used: {ram.used / gib:.2f} / {ram.total / gib:.2f} GB")
    print("-" * 30 + "\n")



class DataFileManager:
    """Tracks progress through the FineWeb parquet shards across sessions.

    Progress is a JSON state document (subset queue, current subset, file
    index, step counter, W&B run id) stored as ``local_state_file`` in the
    model repo ``repo_id`` on the Hugging Face Hub.
    """

    def __init__(self, repo_id, local_state_file="training_state.json"):
        self.repo_id = repo_id
        self.local_state_file = local_state_file
        self.api = HfApi()
        self.fineweb_repo = "HuggingFaceFW/fineweb"
        # Filled on first use by _list_fineweb_files().
        self._fineweb_files = None

    def _list_fineweb_files(self):
        """Return the FineWeb repo file listing, fetching it at most once per instance."""
        cached = getattr(self, "_fineweb_files", None)
        if cached is None:
            cached = list(self.api.list_repo_files(self.fineweb_repo, repo_type="dataset"))
            self._fineweb_files = cached
        return cached

    def _fresh_state(self):
        """Build the initial state for a run that has no saved progress.

        Raises:
            RuntimeError: If the FineWeb listing contains no CC-MAIN subset.
        """
        subsets = self.get_all_subsets()
        if not subsets:
            raise RuntimeError(
                f"No CC-MAIN subsets found in dataset repo {self.fineweb_repo!r}; "
                "cannot initialise training state."
            )
        return {
            "subsets_queue": subsets,
            "current_subset_index": 0,
            "current_subset": subsets[0],
            "file_index": 0,
            "total_steps_trained": 0,
            "wandb_run_id": None  # To resume wandb charts
        }

    def get_all_subsets(self):
        """Lists all subset folders in FineWeb (excluding default)."""
        print(">>> Fetching global subset list from Hugging Face...")
        subsets = set()
        for path in self._list_fineweb_files():
            parts = path.split("/")
            # Only paths shaped like data/<subset>/<file...> name a subset.
            # Filter for CC-MAIN to avoid aggregate files.
            if len(parts) >= 3 and parts[0] == "data" and "CC-MAIN" in parts[1]:
                subsets.add(parts[1])
        return sorted(subsets)

    def get_parquet_files(self, subset):
        """Lists all .parquet files for a specific subset."""
        prefix = f"data/{subset}/"
        return sorted(
            f for f in self._list_fineweb_files()
            if f.startswith(prefix) and f.endswith(".parquet")
        )

    def load_state(self):
        """Downloads state from Hub or initializes fresh.

        A fresh state is created only when the Hub reports that the state file
        does not exist. Any other failure (network outage, unreadable or
        corrupt JSON, ...) is propagated: treating it as "no state" would
        restart from the first file and the next save would overwrite the real
        progress stored on the Hub.
        """
        try:
            print(">>> Attempting to download training state from Hub...")
            path = hf_hub_download(repo_id=self.repo_id, filename=self.local_state_file)
        except EntryNotFoundError as err:
            # NOTE(review): huggingface_hub's LocalEntryNotFoundError (Hub
            # unreachable and nothing cached) is assumed to derive from both
            # EntryNotFoundError and FileNotFoundError. That case does not
            # prove the state file is absent, so it is re-raised — confirm
            # against the installed huggingface_hub version.
            if isinstance(err, FileNotFoundError):
                raise
            print("\n[STATE] No remote state found. Initializing FRESH run.")
            return self._fresh_state()

        with open(path, 'r') as f:
            state = json.load(f)
        print(f"\n[STATE] FOUND! Resuming: {state['current_subset']} | File Index: {state['file_index']}")
        return state

    def save_state(self, state):
        """Saves state locally and pushes to Hub immediately.

        The upload is best-effort: a failure is printed and swallowed so the
        local copy is still written and the caller keeps running.
        """
        with open(self.local_state_file, 'w') as f:
            json.dump(state, f, indent=2)
        try:
            self.api.upload_file(
                path_or_fileobj=self.local_state_file,
                path_in_repo=self.local_state_file,
                repo_id=self.repo_id,
                repo_type="model",
                commit_message=f"Update state: {state['current_subset']} file {state['file_index']}"
            )
            print("[STATE] Synced to Hub.")
        except Exception as e:
            print(f"[STATE] Sync failed (Network issue?): {e}")

# Load Tokenizer once (module level) and alias its pad token to EOS.
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
tokenizer.pad_token = tokenizer.eos_token

# Architecture used when no checkpoint can be loaded (see get_model()).
config = HGRNBitConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=2560,        # model width
    num_hidden_layers=32,    
    num_heads=20,            # 2560 / 20 = 128 dims per head
    attn_mode="fused_recurrent",
    use_short_conv=False,
    rms_norm_eps=1e-5,
    max_position_embeddings=4096, 
    # Mixture-of-experts settings: 8 experts, 2 selected per token.
    moe=True,
    num_experts=8,            
    num_experts_per_tok=2,    
    moe_intermediate_size=8960, 
    pad_token_id=tokenizer.pad_token_id,
    initializer_range=0.02,
)

def get_model():
    """Return the model to train: the Hub checkpoint if loadable, else a new one.

    Tries ``HGRNBitForCausalLM.from_pretrained(REPO_ID)``. If that raises any
    ``Exception``, a warning is printed and a randomly initialised model is
    built from the module-level ``config`` instead.

    NOTE(review): every failure, including a transient download error, leads
    to random initialisation — confirm that this fallback is intended.
    """
    print(">>> Attempting to load latest model checkpoint from Hub...")
    try:
        loaded = HGRNBitForCausalLM.from_pretrained(REPO_ID, trust_remote_code=True)
    except Exception as load_error:
        print(f">>> WARNING: Could not load weights ({load_error}). Initializing RANDOM weights.")
        return HGRNBitForCausalLM(config)

    print(">>> SUCCESS: Loaded pre-trained weights.")
    return loaded

def train_one_file():
    """Advance the persisted training state by one step.

    Loads the state through ``DataFileManager`` and checks whether the current
    subset still has parquet files left. When the subset is exhausted the
    state is rolled over to the next subset and saved.

    Returns:
        False when every subset in the queue has been consumed (either already
        on entry, or because the last subset was just completed); True after
        rolling over to the next subset. When the current subset still has
        files left, the code visible here falls through without an explicit
        return value.
    """
    # 1. State Management
    manager = DataFileManager(REPO_ID)
    state = manager.load_state()

    if state["current_subset_index"] >= len(state["subsets_queue"]):
        print("Training Complete! All subsets finished.")
        return False

    current_subset = state["current_subset"]
    file_index = state["file_index"]

    # 2. Get File List
    print(f"\n>>> Fetching file list for {current_subset}...")
    parquet_files = manager.get_parquet_files(current_subset)

    # Check if subset is done
    if file_index >= len(parquet_files):
        next_index = state["current_subset_index"] + 1
        state["current_subset_index"] = next_index
        state["file_index"] = 0
        if next_index >= len(state["subsets_queue"]):
            # The last subset just finished. Indexing the queue here would
            # raise IndexError before the state is saved, so the run would
            # never be recorded as complete. Persist the final index instead
            # ("current_subset" keeps the name of the last subset) and stop.
            print(f">>> Subset {current_subset} finished! No subsets left.")
            manager.save_state(state)
            print("Training Complete! All subsets finished.")
            return False
        print(f">>> Subset {current_subset} finished! Moving to next...")
        state["current_subset"] = state["subsets_queue"][next_index]
        manager.save_state(state)
        return True
    

if __name__ == "__main__":
    import time

    # Keep calling train_one_file() until it returns a falsy value. Any
    # exception is reported and the loop resumes after a 10 second pause.
    print(">>> Starting Continuous Pre-training Manager...")
    should_continue = True
    while should_continue:
        try:
            should_continue = train_one_file()
        except Exception as e:
            print(f"CRITICAL LOOP ERROR: {e}")
            print(">>> Restarting loop in 10 seconds...")
            time.sleep(10)

2025-12-04 19:37:23.281186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764877043.474719      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764877043.531212      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msakibahmed2018go[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

>>> Starting Continuous Pre-training Manager...
>>> Attempting to download training state from Hub...

[STATE] No remote state found. Initializing FRESH run.
>>> Fetching global subset list from Hugging Face...

>>> Fetching file list for CC-MAIN-2013-20...

 TRAINING ON: CC-MAIN-2013-20 | File 1/205
 data/CC-MAIN-2013-20/000_00000.parquet



data/CC-MAIN-2013-20/000_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Tokenizing:   0%|          | 0/1091396 [00:00<?, ? examples/s]

>>> Attempting to load latest model checkpoint from Hub...


In [None]:
from mmfreelm.models import HGRNBitForCausalLM
import torch
from transformers import AutoTokenizer

# Load the published checkpoint and move it to the GPU when one is available,
# otherwise keep it on the CPU.
model = HGRNBitForCausalLM.from_pretrained("Sakib323/ternary-language-model")
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer from the MMfreeLM-370M repo; its pad token is aliased to EOS.
tokenizer = AutoTokenizer.from_pretrained("Sakib323/MMfreeLM-370M")
tokenizer.pad_token = tokenizer.eos_token


In [None]:

def generate_text(prompt, max_new_tokens=2000):
    """Sample a continuation of ``prompt`` from the module-level ``model``.

    Args:
        prompt: Text to condition the generation on.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded sequence (prompt plus continuation) with special tokens
        stripped.

    Uses the module-level ``tokenizer`` and ``model``. Sampling is nucleus
    sampling with ``top_p=0.9`` and ``temperature=0.7``.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        encoded["input_ids"],
        # Pass the tokenizer's attention mask explicitly. In this notebook the
        # pad token is aliased to EOS, so padding cannot be told apart from a
        # real EOS by token id alone.
        attention_mask=encoded.get("attention_mask"),
        max_new_tokens=max_new_tokens,
        # NOTE(review): assumes "</s>" is this tokenizer's EOS token —
        # tokenizer.eos_token_id would avoid the hardcoded string; confirm.
        eos_token_id=tokenizer.convert_tokens_to_ids("</s>"),
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Run one sampled generation for a fixed prompt and print the decoded text.
prompt = "How AP reported in all formats from tornado-stricken regionsMarch 8, 2012"
generated_text = generate_text(prompt)
print(generated_text)
