# Model Preparation Notebook

This notebook downloads, saves, and loads pre-trained models for use with the pruninghealing library.

In [1]:
import os

# Device selection
# These environment variables are set BEFORE torch is imported so that they are
# already in place when CUDA gets initialised; keep this ordering.
# PCI_BUS_ID ordering makes the index below follow the physical bus order.
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
i = 3  # device number to use (change this to select GPU: 0, 1, 2, etc.)
os.environ["CUDA_VISIBLE_DEVICES"] = f'{i}'

from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Check device
# Only one GPU is exposed to this process, so it is addressed as index 0 here
# even though the message reports the physical index `i`.
if torch.cuda.is_available():
    device = 'cuda'
    print(f'Using GPU device {i}: {torch.cuda.get_device_name(0)}')
else:
    device = 'cpu'
    print('Using CPU')

# Set up paths
# The path is relative to the kernel's working directory. mkdir() is called
# without parents=True, so only the final path component is created.
CHECKPOINTS_DIR = Path("../checkpoints")
CHECKPOINTS_DIR.mkdir(exist_ok=True)

print(f"Checkpoints directory: {CHECKPOINTS_DIR.absolute()}")

Using GPU device 3: NVIDIA A100-PCIE-40GB
Checkpoints directory: /home/ThunderstormXX/Ridiculous-LLM-Compression/src/notebooks/../checkpoints


## Available Models

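Select from the following pre-trained models. Each short key is also used as the checkpoint sub-directory name; note that the `tinyllama` key currently points at `unsloth/Llama-3.2-1B-Instruct`.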

In [2]:
# huggingface-cli download unsloth/Llama-3.1-8B-Instruct --local-dir ./Llama-3.1-8B-Instruct --local-dir-use-symlinks False

# Registry of supported models: short key -> Hugging Face Hub repository id.
# The short key is what the rest of the notebook passes around as `model_key`.
# NOTE: the "tinyllama" key currently resolves to Llama-3.2-1B-Instruct.
AVAILABLE_MODELS = dict([
    ("llama3.1-8b", "unsloth/Llama-3.1-8B-Instruct"),
    ("llama2-13b", "meta-llama/Llama-2-13b-hf"),
    ("mistral-7b", "mistralai/Mistral-7B-v0.1"),
    ("phi2", "microsoft/phi-2"),
    ("qwen-7b", "Qwen/Qwen-7B"),
    ("tinyllama", "unsloth/Llama-3.2-1B-Instruct"),
])

# Show the registry, one "key: repo" pair per line.
print("Available models:")
for short_name, repo_id in AVAILABLE_MODELS.items():
    print("  {}: {}".format(short_name, repo_id))

Available models:
  llama3.1-8b: unsloth/Llama-3.1-8B-Instruct
  llama2-13b: meta-llama/Llama-2-13b-hf
  mistral-7b: mistralai/Mistral-7B-v0.1
  phi2: microsoft/phi-2
  qwen-7b: Qwen/Qwen-7B
  tinyllama: unsloth/Llama-3.2-1B-Instruct


## Download and Save Model

Choose a model to download and save:

In [4]:
def _is_complete_checkpoint(path):
    """Return True if `path` is a directory that already contains a config.json."""
    return path.is_dir() and (path / "config.json").is_file()


def download_and_save_model(model_key, force_download=False):
    """Download a model and its tokenizer and save both under CHECKPOINTS_DIR.

    Args:
        model_key: Key into AVAILABLE_MODELS; also used as the name of the
            sub-directory the checkpoint is written to.
        force_download: When True, download and overwrite even if a saved
            checkpoint is already present.

    Returns:
        A ``(model_path, tokenizer_path)`` tuple of strings (both point at the
        same directory), or ``(None, None)`` if the key is unknown or the
        download/save raised.
    """
    if model_key not in AVAILABLE_MODELS:
        print(f"Error: {model_key} not in available models")
        return None, None

    model_name = AVAILABLE_MODELS[model_key]
    save_path = CHECKPOINTS_DIR / model_key

    if not force_download:
        # A directory alone is not proof of a finished download: an interrupted
        # run can leave it empty. Require config.json before skipping.
        if _is_complete_checkpoint(save_path):
            print(f"Model {model_key} already exists at {save_path}")
            print("Set force_download=True to re-download")
            return str(save_path), str(save_path)
        if save_path.exists():
            print(f"Incomplete checkpoint at {save_path} (no config.json); downloading again")

    print(f"Downloading {model_name}...")

    try:
        # NOTE(review): trust_remote_code=True lets the model repository run its
        # own Python code locally; only use it with repositories you trust.
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        if tokenizer.pad_token is None:
            # Fall back to EOS so the saved tokenizer always has a pad token.
            tokenizer.pad_token = tokenizer.eos_token

        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device if device != 'cpu' else None,
            trust_remote_code=True
        )

        # parents=True so this also works when CHECKPOINTS_DIR was removed or
        # never created in this session.
        save_path.mkdir(parents=True, exist_ok=True)
        print(f"Saving to {save_path}...")

        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)

        print(f"✓ Successfully saved {model_key} to {save_path}")
        return str(save_path), str(save_path)

    except Exception as e:
        # Deliberate best-effort: report and signal failure with (None, None)
        # so batch callers can continue with the next model.
        print(f"Error downloading {model_key}: {e}")
        return None, None

# Example usage: set MODEL_TO_DOWNLOAD to any key of AVAILABLE_MODELS.
MODEL_TO_DOWNLOAD = "tinyllama"  # alternative: "llama3.1-8b"

model_path, tokenizer_path = download_and_save_model(MODEL_TO_DOWNLOAD)
# download_and_save_model returns (None, None) on failure; only report a path
# when there actually is one.
if model_path is not None:
    print(f"Model saved to: {model_path}")
else:
    print(f"Model {MODEL_TO_DOWNLOAD} was not saved; see the error above")

Downloading unsloth/Llama-3.2-1B-Instruct...
Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Loading model...


config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Saving to ../checkpoints/tinyllama...
✓ Successfully saved tinyllama to ../checkpoints/tinyllama
Model saved to: ../checkpoints/tinyllama


## Load Saved Model

Load a previously saved model from checkpoints:

In [3]:
def _count_decoder_layers(model):
    """Return the number of decoder layers, or "unknown" if it cannot be determined."""
    try:
        return len(model.model.layers)
    except (AttributeError, TypeError):
        # Not every architecture exposes `.model.layers`; fall back to the config.
        return getattr(model.config, "num_hidden_layers", "unknown")


def load_saved_model(model_key):
    """Load a model and its tokenizer from ``CHECKPOINTS_DIR / model_key``.

    Args:
        model_key: Name of the checkpoint sub-directory to load.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` when the
        checkpoint directory does not exist or loading raised.
    """
    model_path = CHECKPOINTS_DIR / model_key

    if not model_path.exists():
        print(f"Error: Model {model_key} not found in {CHECKPOINTS_DIR}")
        print("Available models:")
        # CHECKPOINTS_DIR itself may be missing; iterdir() would raise in that case.
        if CHECKPOINTS_DIR.is_dir():
            for entry in sorted(CHECKPOINTS_DIR.iterdir()):
                if entry.is_dir():
                    print(f"  - {entry.name}")
        return None, None

    try:
        print(f"Loading {model_key} from {model_path}...")

        # NOTE(review): trust_remote_code=True lets the checkpoint run its own
        # Python code; only load checkpoints you trust.
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map=device if device != 'cpu' else None,
            trust_remote_code=True
        )

        print(f"✓ Successfully loaded {model_key}")
        print(f"Model type: {model.config.model_type}")
        # The layer count is informational only, so an unusual module layout
        # must not turn a successful load into a failure.
        print(f"Number of layers: {_count_decoder_layers(model)}")
        print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

        return model, tokenizer

    except Exception as e:
        # Deliberate best-effort: report and signal failure with (None, None).
        print(f"Error loading {model_key}: {e}")
        return None, None

# Example usage: choose which checkpoint sub-directory to load.
# `model` and `tokenizer` are both None if loading failed; the inference
# test cell checks for that before running.
MODEL_TO_LOAD = "tinyllama"  # alternative: "llama3.1-8b"

model, tokenizer = load_saved_model(MODEL_TO_LOAD)

Loading tinyllama from ../checkpoints/tinyllama...
✓ Successfully loaded tinyllama
Model type: llama
Number of layers: 16
Parameters: 1235.8M


In [4]:
# Show the loaded model's module structure (displayed as the cell's last expression).
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-0

## Test Model Inference

Quick test to verify the loaded model works:

In [6]:
def test_model_inference(model, tokenizer, prompt="Hello, how are you?"):
    """Test model with simple inference"""
    
    if model is None or tokenizer is None:
        print("Model or tokenizer not loaded")
        return
    
    print(f"Testing with prompt: '{prompt}'")
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Response: {response}")

# Test the loaded model.
# Skipped when the load cell failed and left `model` as None.
if model is not None:
    test_model_inference(model, tokenizer)

Testing with prompt: 'Hello, how are you?'
Response: Hello, how are you? I'm feeling a bit stuck and need some guidance. I've been trying to learn a new skill, but I just can't seem to get past the initial excitement and get any progress. It feels like I'm just going through the motions.

I


## List Saved Models

Check what models are currently saved in checkpoints:

In [7]:
def list_saved_models():
    """Print and return the models saved under CHECKPOINTS_DIR.

    A sub-directory counts as a saved model when it contains a ``config.json``.
    For each one, the total size of all files below it is printed in MB.

    Returns:
        The model directory names, sorted alphabetically so that callers
        indexing into the list get the same result on every run. Empty if
        CHECKPOINTS_DIR does not exist or holds no models.
    """
    print(f"Models in {CHECKPOINTS_DIR}:")

    saved_models = []
    # A missing directory simply means nothing has been saved yet.
    if CHECKPOINTS_DIR.is_dir():
        # iterdir() yields entries in arbitrary order; sort for determinism.
        for path in sorted(CHECKPOINTS_DIR.iterdir()):
            if path.is_dir() and (path / "config.json").is_file():
                size_mb = sum(f.stat().st_size for f in path.rglob('*') if f.is_file()) / (1024**2)
                saved_models.append((path.name, size_mb))

    if saved_models:
        for name, size in saved_models:
            print(f"  - {name} ({size:.1f} MB)")
    else:
        print("  No models found")

    return [name for name, _ in saved_models]

# Keep the returned names; the pruning example cell reads `saved_models`.
saved_models = list_saved_models()

Models in ../checkpoints:
  - tinyllama (2373.6 MB)
  - llama3.1-8b (15333.0 MB)


## Usage with PruningHealing Library

Example of how to use saved models with the pruninghealing library:

In [7]:
# Example integration with pruninghealing library
import sys
sys.path.append('../..')

from src.pruninghealing import Trainer, DatasetLoader, IterativePruner
from src.pruninghealing.utils import calculate_perplexity

def prepare_model_for_pruning(model_key):
    """Load a saved checkpoint and build an IterativePruner for it.

    Prints the baseline perplexity of the loaded model, loads WikiText through
    a DatasetLoader, and creates a pruner whose workspace is
    ``../../workspace/<model_key>``.

    Returns:
        ``(model, tokenizer, pruner)``, or ``(None, None, None)`` when the
        checkpoint could not be loaded.
    """
    model, tokenizer = load_saved_model(model_key)
    if model is None:
        return None, None, None

    # Reference perplexity before any pruning; it is printed, not returned.
    print("Calculating baseline perplexity...")
    ppl_before_pruning = calculate_perplexity(model, tokenizer)
    print(f"Baseline perplexity: {ppl_before_pruning:.3f}")

    # NOTE(review): the loader is neither returned nor handed to the pruner;
    # presumably load_wikitext() is called for a caching side effect — confirm.
    wikitext_loader = DatasetLoader(tokenizer)
    wikitext_loader.load_wikitext()

    workspace = f"../../workspace/{model_key}"
    pruner = IterativePruner(model, tokenizer, workspace_dir=workspace)

    print(f"✓ Model {model_key} ready for pruning experiments")
    prepared = (model, tokenizer, pruner)
    return prepared

# Example usage: prepare the first entry returned by list_saved_models().
# This rebinds the notebook-level `model` and `tokenizer` names that the
# earlier load cell assigned.
if saved_models:
    example_model = saved_models[0]
    print(f"Preparing {example_model} for pruning...")
    model, tokenizer, pruner = prepare_model_for_pruning(example_model)
else:
    print("No saved models available. Download a model first.")

Preparing llama3.1-8b for pruning...
Loading llama3.1-8b from ../checkpoints/llama3.1-8b...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Successfully loaded llama3.1-8b
Model type: llama
Number of layers: 32
Parameters: 8030.3M
Calculating baseline perplexity...


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Baseline perplexity: 11.105


Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

✓ Model llama3.1-8b ready for pruning experiments


## Batch Download Multiple Models

Download multiple models at once:

In [8]:
def batch_download_models(model_keys, force_download=False):
    """Download several models in sequence and print a summary.

    Args:
        model_keys: Iterable of keys to pass to download_and_save_model.
        force_download: Forwarded unchanged to every download.

    Returns:
        Dict mapping each key to ``{'success': bool, 'path': str or None}``.
        A failed download is recorded, not raised, so later keys still run.
    """
    banner = '=' * 50
    results = {}

    for key in model_keys:
        print(f"\n{banner}")
        print(f"Processing {key}...")
        print(banner)

        # The tokenizer path is the same directory, so only the first is kept.
        saved_path, _ = download_and_save_model(key, force_download)
        results[key] = {
            'success': saved_path is not None,
            'path': saved_path
        }

    print(f"\n{banner}")
    print("BATCH DOWNLOAD SUMMARY")
    print(banner)

    for key, outcome in results.items():
        if outcome['success']:
            mark, detail = "✓", outcome['path']
        else:
            mark, detail = "✗", 'Failed'
        print(f"{mark} {key}: {detail}")

    return results

# Example: download several small models in one go.
# Left commented out so running the notebook does not trigger downloads;
# uncomment the two lines below to use it.
# models_to_download = ["tinyllama", "phi2"]
# batch_results = batch_download_models(models_to_download)

print("Batch download function ready. Uncomment above lines to use.")

Batch download function ready. Uncomment above lines to use.
