In [1]:
# Progress tracking setup
import time
import json
import torch
from tqdm.notebook import tqdm

def track_time(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print(f"Operation completed in {end - start:.2f} seconds")
        return result
    return wrapper

In [2]:
%%time
# Install Git LFS
!apt-get install git-lfs
!git lfs install

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
Git LFS initialized.
CPU times: user 47.7 ms, sys: 6.52 ms, total: 54.2 ms
Wall time: 3.53 s


In [3]:
%%time
# Verify GPU availability and requirements
!nvidia-smi

import torch
import gc

# Memory optimization for T4
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()

gpu_name = torch.cuda.get_device_name(0)
print(f"Using GPU: {gpu_name}")

# Set memory optimization flags for T4
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

Fri Feb 14 13:23:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   53C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
%%time
# Package installation with T4 optimized versions
%pip install torch==2.1.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 \
    transformers==4.34.0 datasets accelerate huggingface_hub wandb bitsandbytes -q
%pip install deepspeed

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m501.2 kB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement transformers==4.34.0 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for transformers==4.34.0[0m[31m
[0mCollecting deepspeed
  Downloading deepspeed-0.16.3.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hjson (from deepspeed)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting ninja (from deepspeed)
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Collecting nvidia-ml-py (from deepspeed)
  Downloading nvidia_ml_py-12.570.86-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->deepspeed)
  Download

In [5]:
%%time
from huggingface_hub import login, create_repo
from getpass import getpass
import wandb
import os

# Get token securely
hf_token = getpass("Enter yoReading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
Git LFS initialized.
CPU times: user 47.7 ms, sys: 6.52 ms, total: 54.2 ms
Wall time: 3.53 sur Hugging Face access token: ")
login(token=hf_token)
print("Successfully logged in to Hugging Face!")

# Initialize W&B for experiment tracking
wandb.login()
print("Successfully logged in to Weights & Biases!")

Enter your Hugging Face access token: ··········
Successfully logged in to Hugging Face!


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maivishwam[0m ([33maivishwam-vishwamai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Successfully logged in to Weights & Biases!
CPU times: user 2.99 s, sys: 434 ms, total: 3.42 s
Wall time: 30.7 s


In [29]:
# Install Git LFS
!apt-get install git-lfs -y
!git lfs install

# Clone the repository
!git clone https://github.com/VishwamAI/VishwamAI.git
%cd VishwamAI

# Install the package
!pip install -e . -q


# Configure Git LFS
!git config lfs.url https://huggingface.co/kasinadhsarma/vishwamai-model.git/info/lfs
!git config lfs.pushurl https://huggingface.co/kasinadhsarma/vishwamai-model.git/info/lfs

# Set up Git LFS tracking
!git lfs track "*.bin"
!git lfs track "*.pt"
!git lfs track "*.pth"
!git lfs track "*.ckpt"
!git lfs track "*.safetensors"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.

  lfs.transfer.maxretries
  lfs.transfer.maxverifies
  lfs.transfer.maxconcurrenttransfers
  filter.lfs.clean
  filter.lfs.smudge
  filter.lfs.process
  filter.lfs.required
Updated git hooks.
Git LFS initialized.
Cloning into 'VishwamAI'...
remote: Enumerating objects: 1008, done.[K
remote: Counting objects: 100% (413/413), done.[K
remote: Compressing objects: 100% (320/320), done.[K
remote: Total 1008 (delta 162), reused 318 (delta 88), pack-reused 595 (from 2)[K
Receiving objects: 100% (1008/1008), 28.48 MiB | 16.02 MiB/s, done.
Resolving deltas: 100% (444/444), done.

  lfs.transfer.maxretries
  lfs.transfer.maxverifies
  lfs.transfer.maxconcurrenttransfers
  filter.lfs.clean
  filter.lfs.smudge
  filter.lfs.process
  filter.lfs.required

  lfs.transfer.maxretrie

In [20]:
pip install datasets bitsandbytes

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━

In [21]:
%%time
import torch
import json
from datasets import load_dataset, concatenate_datasets
from vishwamai.model_utils import load_model, get_gpu_memory
from vishwamai.model import Transformer, ModelArgs
from vishwamai.cache_augmentation import CacheConfig, DifferentiableCacheAugmentation
from vishwamai.neural_memory import ReasoningMemoryTransformer
from vishwamai.tree_of_thoughts import TreeOfThoughts
from vishwamai.reward_function import RewardConfig
from vishwamai.trainer import VishwamAIPretrainer

# T4-specific performance optimizations
import bitsandbytes as bnb
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision('high')

CPU times: user 10.2 s, sys: 1.66 s, total: 11.9 s
Wall time: 17.6 s


In [22]:
@track_time
def setup_hardware():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = get_gpu_memory()
    print(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")

    # T4-optimized configuration
    if 't4' in gpu_name.lower():
        variant = "7B"  # T4-optimized model
        print("Using T4-optimized configuration with 8-bit quantization")
    else:
        variant = "167B"  # Fallback configuration
        print("Using fallback configuration")

    clear_gpu_memory()
    return variant

model_variant = setup_hardware()

Using GPU: Tesla T4 (15.8 GB)
Using T4-optimized configuration with 8-bit quantization
Operation completed in 0.36 seconds


In [23]:
@track_time
def load_config():
    config_path = "./vishwamai/configs/config_optimized.json"
    with open(config_path) as f:
        config = json.load(f)

    if model_variant not in config["model_variants"]:
        print(f"Warning: Model variant '{model_variant}' not found in config, creating T4 optimized config")
        # T4-optimized configuration
        t4_config = {
            "max_batch_size": 4,
            "max_seq_len": 2048,
            "dtype": "fp8",
            "vocab_size": 32000,
            "dim": 1024,
            "inter_dim": 2816,
            "moe_inter_dim": 512,
            "n_layers": 12,
            "n_dense_layers": 1,
            "n_heads": 16,
            "n_routed_experts": 8,
            "n_shared_experts": 1,
            "n_activated_experts": 2,
            "n_expert_groups": 1,
            "n_limited_groups": 1,
            "score_func": "softmax",
            "route_scale": 1.0,
            "q_lora_rank": 0,
            "kv_lora_rank": 64,
            "qk_nope_head_dim": 64,
            "qk_rope_head_dim": 32,
            "v_head_dim": 64,
            "original_seq_len": 2048,
            "rope_theta": 10000.0,
            "rope_factor": 20,
            "beta_fast": 16,
            "beta_slow": 1,
            "mscale": 0.5,
            "use_alibi": False,  # Disable ALiBi for T4
            "use_rope_scaling": True,
            "gradient_checkpointing": True,
            "parallel_attn": True,
            "rope_condense_ratio": 1.0
        }
        return t4_config

    return config["model_variants"][model_variant]["model_config"]

# Load configuration
model_config = load_config()
print("Configuration loaded successfully.")

Operation completed in 0.00 seconds
Configuration loaded successfully.


In [24]:
# Create DeepSpeed config for T4 optimization
ds_config = {
    "fp16": {
        "enabled": True,
        "loss_scale": 0,
        "loss_scale_window": 100,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_bucket_size": 5e8,
        "reduce_bucket_size": 5e8,
        "overlap_comm": True,
        "contiguous_gradients": True,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        }
    },
    "train_batch_size": 32,
    "gradient_accumulation_steps": 16,
    "train_micro_batch_size_per_gpu": 2,
    "gradient_clipping": 0.5,
    "steps_per_print": 10,
    "wall_clock_breakdown": False
}

with open('ds_config.json', 'w') as f:
    json.dump(ds_config, f)

In [31]:
@track_time
def initialize_components():
    print("Initializing model and components...")
    clear_gpu_memory()

    # Initialize main model with 8-bit quantization for T4
    model_args = ModelArgs(**model_config)
    model = Transformer(model_args)

    # Replace LinearWrapper with current bitsandbytes 8-bit quantization
    import bitsandbytes as bnb
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            model._modules[name] = bnb.nn.Linear8bitLt(
                module.in_features,
                module.out_features,
                module.bias is not None,
                has_fp16_weights=False,
                threshold=6.0
            )
    model = model.cuda()

    # Initialize smaller cache augmentation for T4
    cache_config = CacheConfig(
        hidden_size=model_config["dim"],
        num_heads=model_config["n_heads"],
        max_cache_length=8192,  # Reduced cache size for T4
        dropout=0.1
    )
    cache_module = DifferentiableCacheAugmentation(cache_config).cuda()

    # Initialize memory transformer with reduced size
    memory_module = ReasoningMemoryTransformer(
        hidden_size=model_config["dim"],
        num_heads=model_config["n_heads"]
    ).cuda()

    # Initialize tree of thoughts with reduced beam size
    tree_module = TreeOfThoughts(
        hidden_size=model_config["dim"],
        num_heads=model_config["n_heads"]
    ).cuda()

    # Initialize reward config
    reward_config = RewardConfig(
        hidden_size=model_config["dim"],
        num_heads=model_config["n_heads"]
    )

    clear_gpu_memory()
    return model, cache_module, memory_module, tree_module, reward_config


In [26]:
from transformers import TrainingArguments

# Initialize output directory
output_dir = "./pretrain_output"
!mkdir -p $output_dir

# Configure training with T4 optimizations
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Reduced batch size for T4
    gradient_accumulation_steps=16,  # Increased for T4 memory constraints
    learning_rate=5e-5,  # Reduced learning rate
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=5,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # Mixed precision training
    fp16=True,  # Use FP16 instead of BF16 for T4
    bf16=False,
    # Performance optimizations
    gradient_checkpointing=True,
    dataloader_num_workers=2,  # Reduced workers for T4
    dataloader_pin_memory=True,
    group_by_length=True,
    # Memory optimizations
    max_grad_norm=0.5,  # Reduced for stability
    # Monitoring
    report_to=["tensorboard", "wandb"],
    # Hub integration
    push_to_hub=True,
    hub_model_id="kasinadhsarma/vishwamai-model",
    hub_strategy="end",  # Only save at the end to save memory
    # Optimizer settings
    lr_scheduler_type="cosine",
    optim="adamw_8bit",  # Use 8-bit Adam
    # Other settings
    remove_unused_columns=False,
    seed=42,
    ddp_find_unused_parameters=False,
    # Memory optimization
    deepspeed="ds_config.json"  # Using the config we created
)



In [27]:
from datasets import concatenate_datasets

# Load and combine training datasets with memory optimization
def load_dataset_with_memory_optimization(ds_name, split):
    clear_gpu_memory()
    try:
        dataset = load_dataset(ds_name, split=split, streaming=True)  # Use streaming for memory efficiency
        return dataset
    except Exception as e:
        print(f"Failed to load {ds_name}: {e}")
        return None

train_datasets = []
for ds_name in ["gsm8k", "cais/mmlu"]:
    dataset = load_dataset_with_memory_optimization(ds_name, "train")
    if dataset is not None:
        train_datasets.append(dataset)

if not train_datasets:
    raise ValueError("No training datasets could be loaded")

combined_train_dataset = concatenate_datasets(train_datasets)

# Load validation dataset
eval_dataset = load_dataset_with_memory_optimization("cais/mmlu", "validation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/138k [00:00<?, ?B/s]

Failed to load cais/mmlu: Config name is missing.
Please pick one among the available configs: ['abstract_algebra', 'all', 'anatomy', 'astronomy', 'auxiliary_train', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 

In [33]:
@track_time
def train_model():
    trainer = VishwamAIPretrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        memory_module=memory_module,
        tree_module=tree_module,
        cache_module=cache_module,
        reward_config=reward_config
    )

    print("Starting training...")
    try:
        trainer.train()

        # Save model and components
        trainer.save_model("./final_model")
        print("Model saved successfully")

        # Push to hub with LFS
        trainer.push_to_hub(
            commit_message=f"Training completed - {time.strftime('%Y-%m-%d %H:%M:%S')}"
        )
        print("Model pushed to HuggingFace Hub")

    except Exception as e:
        print(f"Training interrupted: {e}")
        clear_gpu_memory()
        raise e

train_model()

NameError: name 'model' is not defined

In [34]:
@track_time
def save_model():
    clear_gpu_memory()
    model_save_path = "final_model"
    trainer.save_model(model_save_path)

    # Initialize Git LFS tracking for the saved model files
    !git lfs track "final_model/*.bin"
    !git lfs track "final_model/*.pt"
    !git lfs track "final_model/*.pth"

    print("Model and components saved successfully")
    return model_save_path

model_save_path = save_model()
print(f"Model available at: https://huggingface.co/kasinadhsarma/vishwamai-model")

NameError: name 'trainer' is not defined

In [None]:
@track_time
def validate_model():
    clear_gpu_memory()
    # Load all components for validation with 8-bit quantization
    test_model = Transformer(ModelArgs(**model_config))
    test_model = bnb.nn.LinearWrapper.wrap_model(test_model, device='cuda', quantize=True)
    test_model.load_state_dict(torch.load(f"{model_save_path}/pytorch_model.bin"))

    # Load auxiliary components
    test_cache = DifferentiableCacheAugmentation.from_pretrained(model_save_path)
    test_memory = ReasoningMemoryTransformer.from_pretrained(model_save_path)
    test_tree = TreeOfThoughts.from_pretrained(model_save_path)

    test_model.eval()
    test_cache.eval()
    test_memory.eval()
    test_tree.eval()

    test_cases = [
        "What is 7 * 12?",
        "Explain quantum computing in simple terms.",
        "Write a Python function to find prime numbers."
    ]

    print("Running validation tests...")
    for test_input in test_cases:
        print(f"\nTest: {test_input}")
        clear_gpu_memory()
        # Note: You'll need to implement tokenization for the actual input
        tokens = torch.randint(0, model_config['vocab_size'], (1, 32)).cuda()

        with torch.inference_mode():
            start = time.time()
            output = test_model(tokens)
            end = time.time()

            # Apply enhancements with memory management
            enhanced_states = test_cache(output)
            memory_enhanced = test_memory(enhanced_states)
            final_output = test_tree(memory_enhanced)

        print(f"Generated response in {end-start:.2f}s")
        # Note: You'll need to implement detokenization for the actual output

validate_model()
print("\nPretraining and validation completed!")