In [None]:
# Install Git LFS
!apt-get install git-lfs -y
!git lfs install

# Clone the repository
!git clone https://github.com/VishwamAI/VishwamAI.git
%cd VishwamAI

# Install the package
!pip install -e . -q


# Configure Git LFS
!git config lfs.url https://huggingface.co/kasinadhsarma/vishwamai-model.git/info/lfs
!git config lfs.pushurl https://huggingface.co/kasinadhsarma/vishwamai-model.git/info/lfs

# Set up Git LFS tracking
!git lfs track "*.bin"
!git lfs track "*.pt"
!git lfs track "*.pth"
!git lfs track "*.ckpt"
!git lfs track "*.safetensors"

In [None]:
# First cell - Add all required imports
import gc
import time
import json
import torch
import torch.nn as nn 
import torch.distributed as dist
from tqdm.notebook import tqdm
import bitsandbytes as bnb
from datasets import load_dataset, concatenate_datasets
from transformers import TrainingArguments

# Import VishwamAI components
from vishwamai.model import Transformer, ModelArgs
from vishwamai.model_utils import get_gpu_memory, load_model
from vishwamai.cache_augmentation import CacheConfig, DifferentiableCacheAugmentation  
from vishwamai.neural_memory import ReasoningMemoryTransformer
from vishwamai.tree_of_thoughts import TreeOfThoughts
from vishwamai.reward_function import RewardConfig
from vishwamai.trainer import VishwamAIPretrainer

# Define GPU memory utility function
def clear_gpu_memory():
    """Run the garbage collector, then release PyTorch's cached CUDA memory."""
    gc.collect()
    torch.cuda.empty_cache()


# Timing decorator. Later cells apply it as ``@track_time``; without a definition
# every decorated function fails with NameError on a fresh kernel.
def track_time(func):
    """Wrap ``func`` so that each call prints its wall-clock duration.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same name/docstring as ``func`` that forwards all
        arguments, returns ``func``'s result unchanged, and lets any exception
        propagate. The elapsed time is printed whether the call succeeds or fails.
    """
    import functools  # local import: the notebook's import cell does not provide it

    @functools.wraps(func)
    def timed(*args, **kwargs):
        started = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            elapsed = time.perf_counter() - started
            print(f"{func.__name__} finished in {elapsed:.2f}s")

    return timed

# Placeholders for objects that later cells create; every name starts out as None.
model = cache_module = memory_module = tree_module = None
reward_config = None
train_dataset = eval_dataset = None

In [None]:
%%time
# Install Git LFS
!apt-get install git-lfs
!git lfs install

In [None]:
%%time
# Verify GPU availability and requirements
!nvidia-smi

import torch
import gc

# Memory optimization for T4
def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()

gpu_name = torch.cuda.get_device_name(0)
print(f"Using GPU: {gpu_name}")

# Set memory optimization flags for T4
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [None]:
%%time
# Package installation with T4 optimized versions
%pip install torch==2.1.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 \
    transformers==4.34.0 datasets accelerate huggingface_hub wandb bitsandbytes -q
%pip install deepspeed

In [None]:
%%time
from huggingface_hub import login, create_repo
from getpass import getpass
import wandb
import os

# Get token securely
hf_token = getpass("Enter your Hugging Face access token: ")
login(token=hf_token)
print("Successfully logged in to Hugging Face!")

# Initialize W&B for experiment tracking
wandb.login()
print("Successfully logged in to Weights & Biases!")

In [None]:
pip install datasets bitsandbytes

In [None]:
%%time
import torch
import json
from datasets import load_dataset, concatenate_datasets
from vishwamai.model_utils import load_model, get_gpu_memory
from vishwamai.model import Transformer, ModelArgs
from vishwamai.cache_augmentation import CacheConfig, DifferentiableCacheAugmentation
from vishwamai.neural_memory import ReasoningMemoryTransformer
from vishwamai.tree_of_thoughts import TreeOfThoughts
from vishwamai.reward_function import RewardConfig
from vishwamai.trainer import VishwamAIPretrainer

# T4-specific performance optimizations
import bitsandbytes as bnb
# Select the 'high' float32 matmul precision setting and enable cuDNN benchmark mode.
torch.set_float32_matmul_precision('high')
torch.backends.cudnn.benchmark = True

In [None]:
@track_time
def setup_hardware():
    """Print the active GPU and pick the model variant to use.

    Returns:
        "7B" when the name of CUDA device 0 contains "t4" (case-insensitive),
        otherwise "167B".
    """
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = get_gpu_memory()
    print(f"Using GPU: {gpu_name} ({gpu_memory:.1f} GB)")

    is_t4 = 't4' in gpu_name.lower()
    # T4 gets the T4-optimized variant; any other device gets the fallback
    variant = "7B" if is_t4 else "167B"
    print(
        "Using T4-optimized configuration with 8-bit quantization"
        if is_t4
        else "Using fallback configuration"
    )

    clear_gpu_memory()
    return variant

model_variant = setup_hardware()

In [None]:
@track_time
def load_config():
    """Return the model configuration for the global ``model_variant``.

    Reads ./vishwamai/configs/config_optimized.json. If the variant has an entry
    under "model_variants", that entry's "model_config" is returned. Otherwise a
    warning is printed and the built-in T4 configuration below is returned.
    """
    with open("./vishwamai/configs/config_optimized.json") as config_file:
        variants = json.load(config_file)["model_variants"]

    # Known variant: use what the config file says
    if model_variant in variants:
        return variants[model_variant]["model_config"]

    print(f"Warning: Model variant '{model_variant}' not found in config, creating T4 optimized config")
    # Built-in T4 configuration used when the file has no entry for the variant
    return {
        "max_batch_size": 4,
        "max_seq_len": 2048,
        "dtype": "fp8",
        "vocab_size": 32000,
        "dim": 1024,
        "inter_dim": 2816,
        "moe_inter_dim": 512,
        "n_layers": 12,
        "n_dense_layers": 1,
        "n_heads": 16,
        "n_routed_experts": 8,
        "n_shared_experts": 1,
        "n_activated_experts": 2,
        "n_expert_groups": 1,
        "n_limited_groups": 1,
        "score_func": "softmax",
        "route_scale": 1.0,
        "q_lora_rank": 0,
        "kv_lora_rank": 64,
        "qk_nope_head_dim": 64,
        "qk_rope_head_dim": 32,
        "v_head_dim": 64,
        "original_seq_len": 2048,
        "rope_theta": 10000.0,
        "rope_factor": 20,
        "beta_fast": 16,
        "beta_slow": 1,
        "mscale": 0.5,
        "use_alibi": False,  # ALiBi is switched off in this configuration
        "use_rope_scaling": True,
        "gradient_checkpointing": True,
        "parallel_attn": True,
        "rope_condense_ratio": 1.0,
    }

# Resolve the configuration once for the cells that follow
model_config = load_config()
print("Configuration loaded successfully.")

In [None]:
# DeepSpeed settings, assembled section by section and then written to ds_config.json

# Mixed-precision (fp16) section
_fp16_section = {
    "enabled": True,
    "loss_scale": 0,
    "loss_scale_window": 100,
    "hysteresis": 2,
    "min_loss_scale": 1,
}

# ZeRO stage 2 section, with optimizer state offloaded to pinned CPU memory
_zero_section = {
    "stage": 2,
    "allgather_bucket_size": 5e8,
    "reduce_bucket_size": 5e8,
    "overlap_comm": True,
    "contiguous_gradients": True,
    "offload_optimizer": {"device": "cpu", "pin_memory": True},
}

ds_config = {
    "fp16": _fp16_section,
    "zero_optimization": _zero_section,
    "train_batch_size": 32,
    "gradient_accumulation_steps": 16,
    "train_micro_batch_size_per_gpu": 2,
    "gradient_clipping": 0.5,
    "steps_per_print": 10,
    "wall_clock_breakdown": False,
}

# Persist the settings so they can be referenced by file name
with open('ds_config.json', 'w') as config_file:
    json.dump(ds_config, config_file)

In [None]:
def _swap_linear_for_int8(model, threshold=6.0):
    """Replace every ``torch.nn.Linear`` inside ``model`` with ``bnb.nn.Linear8bitLt``, in place.

    Each layer is re-attached to its actual parent module. Writing the dotted
    name from ``named_modules()`` into the root's ``_modules`` would only add a
    stray top-level entry and leave the nested layer untouched.

    Args:
        model: Module whose linear layers are swapped.
        threshold: Outlier threshold forwarded to ``Linear8bitLt``.

    Returns:
        The same ``model`` object.
    """
    import bitsandbytes as bnb

    # Snapshot first: replacing modules while named_modules() is still iterating
    # would mutate the dictionaries being walked.
    linear_layers = [
        (name, module)
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    ]
    for name, module in linear_layers:
        if not name:
            continue  # the root itself is a Linear; there is no parent to attach to
        parent_name, _, child_name = name.rpartition(".")
        parent = model.get_submodule(parent_name)  # "" resolves to the root
        int8_layer = bnb.nn.Linear8bitLt(
            module.in_features,
            module.out_features,
            bias=module.bias is not None,
            has_fp16_weights=False,
            threshold=threshold,
        )
        # Keep the layer's initialised parameters. Per the bitsandbytes docs the
        # int8 quantization itself happens when the module is moved to the GPU.
        int8_layer.load_state_dict(module.state_dict())
        setattr(parent, child_name, int8_layer)
    return model


@track_time
def initialize_components():
    """Build the model and its auxiliary modules on the GPU.

    Reads the global ``model_config``.

    Returns:
        Tuple ``(model, cache_module, memory_module, tree_module, reward_config)``.
    """
    print("Initializing model and components...")
    clear_gpu_memory()

    # Initialize main model with 8-bit quantization for T4
    model_args = ModelArgs(**model_config)
    model = Transformer(model_args)
    model = _swap_linear_for_int8(model, threshold=6.0)
    model = model.cuda()

    hidden_size = model_config["dim"]
    num_heads = model_config["n_heads"]

    # Initialize cache augmentation
    cache_config = CacheConfig(
        hidden_size=hidden_size,
        num_heads=num_heads,
        max_cache_length=8192,
        dropout=0.1
    )
    cache_module = DifferentiableCacheAugmentation(cache_config).cuda()

    # Initialize advanced memory transformer
    # NOTE(review): AdvancedMemoryConfig and AdvancedReasoningMemoryTransformer are not
    # imported anywhere in the visible notebook (only ReasoningMemoryTransformer is).
    # Confirm where they live and add the import, or this raises NameError.
    memory_config = AdvancedMemoryConfig(
        hidden_size=hidden_size,
        num_attention_heads=num_heads,
        memory_size=8192,  # Adjust based on available GPU memory
        use_hierarchical=True,
        use_compressed=True
    )
    memory_module = AdvancedReasoningMemoryTransformer(memory_config).cuda()

    # Initialize tree of thoughts
    tree_module = TreeOfThoughts(
        hidden_size=hidden_size,
        num_heads=num_heads
    ).cuda()

    # Initialize reward config
    reward_config = RewardConfig(
        hidden_size=hidden_size,
        num_heads=num_heads
    )

    clear_gpu_memory()
    return model, cache_module, memory_module, tree_module, reward_config


In [None]:
from transformers import TrainingArguments

# Initialize output directory
output_dir = "./pretrain_output"
!mkdir -p $output_dir

# Configure training with T4 optimizations
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Reduced batch size for T4
    gradient_accumulation_steps=16,  # Increased for T4 memory constraints
    learning_rate=5e-5,  # Reduced learning rate
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=5,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # Mixed precision training
    fp16=True,  # Use FP16 instead of BF16 for T4
    bf16=False,
    # Performance optimizations
    gradient_checkpointing=True,
    dataloader_num_workers=2,  # Reduced workers for T4
    dataloader_pin_memory=True,
    group_by_length=True,
    # Memory optimizations
    max_grad_norm=0.5,  # Reduced for stability
    # Monitoring
    report_to=["tensorboard", "wandb"],
    # Hub integration
    push_to_hub=True,
    hub_model_id="kasinadhsarma/vishwamai-model",
    hub_strategy="end",  # Only save at the end to save memory
    # Optimizer settings
    lr_scheduler_type="cosine",
    optim="adamw_8bit",  # Use 8-bit Adam
    # Other settings
    remove_unused_columns=False,
    seed=42,
    ddp_find_unused_parameters=False,
    # Memory optimization
    deepspeed="ds_config.json"  # Using the config we created
)

In [None]:
from datasets import concatenate_datasets

# Load and combine training datasets with memory optimization
def load_dataset_with_memory_optimization(ds_name, split, config_name=None):
    """Load one dataset split in streaming mode, returning ``None`` on failure.

    Args:
        ds_name: Hub dataset identifier.
        split: Split name to load.
        config_name: Optional dataset configuration. Datasets that define several
            configurations cannot be loaded without one.

    Returns:
        The streaming dataset, or ``None`` if loading raised (the error is printed).
    """
    clear_gpu_memory()
    try:
        # Use streaming for memory efficiency
        return load_dataset(ds_name, name=config_name, split=split, streaming=True)
    except Exception as e:
        # Best effort: report and let the caller decide what to do with the gap
        print(f"Failed to load {ds_name}: {e}")
        return None

# (dataset, configuration, split). Both datasets define multiple configurations,
# and cais/mmlu exposes its training data as "auxiliary_train" rather than "train".
TRAIN_SOURCES = [
    ("gsm8k", "main", "train"),
    ("cais/mmlu", "all", "auxiliary_train"),
]

train_datasets = []
for ds_name, config_name, split in TRAIN_SOURCES:
    dataset = load_dataset_with_memory_optimization(ds_name, split, config_name=config_name)
    if dataset is not None:
        train_datasets.append(dataset)

if not train_datasets:
    raise ValueError("No training datasets could be loaded")

# NOTE(review): the two sources do not share a column schema; they may need to be
# mapped to common columns before concatenation succeeds - confirm against the
# format VishwamAIPretrainer expects.
combined_train_dataset = concatenate_datasets(train_datasets)
# The training cell reads `train_dataset`, so publish the combined data under that name too.
train_dataset = combined_train_dataset

# Load validation dataset
eval_dataset = load_dataset_with_memory_optimization("cais/mmlu", "validation", config_name="all")

In [None]:
# Initialize model and components before training
print("Initializing components...")
model, cache_module, memory_module, tree_module, reward_config = initialize_components()
print("Components initialized successfully")

# Make sure datasets are defined. The first cell pre-declares both names as None,
# so the check has to test for None; a bare existence check would never fire.
if globals().get('train_dataset') is None or globals().get('eval_dataset') is None:
    print("Loading datasets...")
    if globals().get('train_dataset') is None:
        # gsm8k defines several configurations; "main" must be named explicitly
        train_dataset = load_dataset("gsm8k", "main", split="train", streaming=True)
    if globals().get('eval_dataset') is None:
        # cais/mmlu likewise needs a configuration; "all" covers every subject
        eval_dataset = load_dataset("cais/mmlu", "all", split="validation", streaming=True)
    print("Datasets loaded successfully")

print("\nStarting model training...")
@track_time
def train_model():
    """Train the model, save it to ./final_model and push it to the Hub.

    Returns:
        The ``VishwamAIPretrainer`` instance. It is also published as the global
        ``trainer`` before training starts, so it stays inspectable if training fails.

    Raises:
        Exception: Whatever training, saving or pushing raised, re-raised after
            the GPU cache has been cleared.
    """
    global trainer  # Make trainer accessible globally
    trainer = VishwamAIPretrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        memory_module=memory_module,
        tree_module=tree_module,
        cache_module=cache_module,
        reward_config=reward_config
    )

    try:
        trainer.train()
        trainer.save_model("./final_model")
        trainer.push_to_hub(
            commit_message=f"Training completed - {time.strftime('%Y-%m-%d %H:%M:%S')}"
        )
        print("Model training and saving completed successfully")
        return trainer
    except Exception as e:
        print(f"Training interrupted: {e}")
        clear_gpu_memory()
        raise  # bare raise keeps the original traceback

trainer = train_model()  # Store trainer instance for later use

In [None]:
@track_time
def train_model():
    """Train the model, save it to ./final_model and push it to the Hub.

    Returns:
        The ``VishwamAIPretrainer`` instance, so later cells (for example the
        save step) can use it.

    Raises:
        Exception: Whatever training, saving or pushing raised, re-raised after
            the GPU cache has been cleared.
    """
    trainer = VishwamAIPretrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        memory_module=memory_module,
        tree_module=tree_module,
        cache_module=cache_module,
        reward_config=reward_config
    )

    print("Starting training...")
    try:
        trainer.train()

        # Save model and components
        trainer.save_model("./final_model")
        print("Model saved successfully")

        # Push to hub with LFS
        trainer.push_to_hub(
            commit_message=f"Training completed - {time.strftime('%Y-%m-%d %H:%M:%S')}"
        )
        print("Model pushed to HuggingFace Hub")

    except Exception as e:
        print(f"Training interrupted: {e}")
        clear_gpu_memory()
        raise  # bare raise keeps the original traceback

    return trainer

# This cell duplicates the training step of the cell before it. Only train when no
# trainer exists yet, so a top-to-bottom run does not repeat the whole training run
# and hub push; keep the result so the save step can reach it.
if globals().get("trainer") is None:
    trainer = train_model()
else:
    print("Trainer already available from an earlier cell; skipping duplicate training run")

In [None]:
@track_time
def save_model():
    clear_gpu_memory()
    model_save_path = "final_model"
    trainer.save_model(model_save_path)

    # Initialize Git LFS tracking for the saved model files
    !git lfs track "final_model/*.bin"
    !git lfs track "final_model/*.pt"
    !git lfs track "final_model/*.pth"

    print("Model and components saved successfully")
    return model_save_path

model_save_path = save_model()
print(f"Model available at: https://huggingface.co/kasinadhsarma/vishwamai-model")

In [None]:
@track_time
def validate_model():
    """Reload the saved model and auxiliary modules and run a timed smoke test.

    Reads the globals ``model_config`` and ``model_save_path``. The prompts in
    ``test_cases`` are only printed: no tokenizer is wired in, so the model is
    fed random token ids and its output is not decoded.
    """
    clear_gpu_memory()
    # Load all components for validation with 8-bit quantization
    test_model = Transformer(ModelArgs(**model_config))

    # bitsandbytes provides no `LinearWrapper`; swap each nn.Linear for
    # Linear8bitLt on its real parent module instead. The list is built first so
    # that modules are not replaced while named_modules() is still iterating.
    linear_layers = [
        (name, module)
        for name, module in test_model.named_modules()
        if name and isinstance(module, torch.nn.Linear)
    ]
    for name, module in linear_layers:
        parent_name, _, child_name = name.rpartition(".")
        setattr(
            test_model.get_submodule(parent_name),
            child_name,
            bnb.nn.Linear8bitLt(
                module.in_features,
                module.out_features,
                bias=module.bias is not None,
                has_fp16_weights=False,
                threshold=6.0,
            ),
        )

    # Load the checkpoint on the CPU, then move to the GPU; per the bitsandbytes
    # docs the int8 quantization happens on that move.
    # NOTE(review): assumes the checkpoint's keys match this module layout - confirm.
    state_dict = torch.load(f"{model_save_path}/pytorch_model.bin", map_location="cpu")
    test_model.load_state_dict(state_dict)
    test_model = test_model.cuda()

    # Load auxiliary components
    # NOTE(review): these are not moved to the GPU here; presumably from_pretrained
    # handles device placement - verify.
    test_cache = DifferentiableCacheAugmentation.from_pretrained(model_save_path)
    test_memory = ReasoningMemoryTransformer.from_pretrained(model_save_path)
    test_tree = TreeOfThoughts.from_pretrained(model_save_path)

    for component in (test_model, test_cache, test_memory, test_tree):
        component.eval()

    test_cases = [
        "What is 7 * 12?",
        "Explain quantum computing in simple terms.",
        "Write a Python function to find prime numbers."
    ]

    print("Running validation tests...")
    for test_input in test_cases:
        print(f"\nTest: {test_input}")
        clear_gpu_memory()
        # Note: You'll need to implement tokenization for the actual input
        tokens = torch.randint(0, model_config['vocab_size'], (1, 32)).cuda()

        with torch.inference_mode():
            # CUDA kernels run asynchronously; synchronize on both sides so the
            # measured interval covers the forward pass itself.
            torch.cuda.synchronize()
            start = time.time()
            output = test_model(tokens)
            torch.cuda.synchronize()
            end = time.time()

            # Apply enhancements with memory management (result is not decoded yet)
            enhanced_states = test_cache(output)
            memory_enhanced = test_memory(enhanced_states)
            test_tree(memory_enhanced)

        print(f"Generated response in {end-start:.2f}s")
        # Note: You'll need to implement detokenization for the actual output

validate_model()
print("\nPretraining and validation completed!")