# Model Preparation Notebook

This notebook downloads, saves, and loads pre-trained models for use with the pruninghealing library.

In [1]:
import os

# Device selection
# NOTE: these environment variables are set *before* torch is imported below.
# Presumably this is so the restriction is already in place when CUDA is
# initialised - keep this ordering when editing the cell.
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
i = 3  # device number to use (change this to select GPU: 0, 1, 2, etc.)
os.environ["CUDA_VISIBLE_DEVICES"] = f'{i}'

from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Check device
# `device` is the string later passed as `device_map` when loading models.
if torch.cuda.is_available():
    device = 'cuda'
    # Only one GPU is exposed via CUDA_VISIBLE_DEVICES, so it is queried as
    # index 0 here while `i` is the physical device number shown to the user.
    print(f'Using GPU device {i}: {torch.cuda.get_device_name(0)}')
else:
    device = 'cpu'
    print('Using CPU')

# Set up paths
# The path is relative to the notebook's working directory; exist_ok=True
# makes re-running this cell safe.
CHECKPOINTS_DIR = Path("../checkpoints")
CHECKPOINTS_DIR.mkdir(exist_ok=True)

print(f"Checkpoints directory: {CHECKPOINTS_DIR.absolute()}")

Using GPU device 3: NVIDIA A100-PCIE-40GB
Checkpoints directory: /home/ThunderstormXX/Ridiculous-LLM-Compression/src/notebooks/../checkpoints


## Available Models

Select from the following pre-trained models:

In [2]:
# huggingface-cli download unsloth/Llama-3.1-8B-Instruct --local-dir ./Llama-3.1-8B-Instruct --local-dir-use-symlinks False

# Registry of supported checkpoints: short alias -> Hugging Face Hub repository id.
# The alias is also what the rest of the notebook uses to refer to a model.
AVAILABLE_MODELS = {
    "llama3.1-8b": "unsloth/Llama-3.1-8B-Instruct",
    "llama2-13b": "meta-llama/Llama-2-13b-hf",
    "mistral-7b": "mistralai/Mistral-7B-v0.1",
    "phi2": "microsoft/phi-2",
    "qwen-7b": "Qwen/Qwen-7B",
    # NOTE: despite its alias, this entry points at Llama-3.2-1B-Instruct.
    "tinyllama": "unsloth/Llama-3.2-1B-Instruct",
}

# Show the registry, one "alias: repository" pair per line.
print("Available models:")
print("\n".join(f"  {alias}: {repo_id}" for alias, repo_id in AVAILABLE_MODELS.items()))

Available models:
  llama3.1-8b: unsloth/Llama-3.1-8B-Instruct
  llama2-13b: meta-llama/Llama-2-13b-hf
  mistral-7b: mistralai/Mistral-7B-v0.1
  phi2: microsoft/phi-2
  qwen-7b: Qwen/Qwen-7B
  tinyllama: unsloth/Llama-3.2-1B-Instruct


## Download and Save Model

Choose a model to download and save:

In [3]:
def download_and_save_model(model_key, force_download=False):
    """Download a model and its tokenizer and save both under CHECKPOINTS_DIR.

    Args:
        model_key: Alias of the model; must be a key of ``AVAILABLE_MODELS``.
            It is also used as the sub-directory name under ``CHECKPOINTS_DIR``.
        force_download: When True, load and save again even if a complete
            checkpoint is already present. The flag is not forwarded to
            ``from_pretrained``, so files already in the local Hugging Face
            cache may be reused rather than fetched again.

    Returns:
        ``(model_path, tokenizer_path)`` as strings (both are the same
        directory), or ``(None, None)`` when the key is unknown or the
        download/save failed. Errors are printed, not raised.
    """
    if model_key not in AVAILABLE_MODELS:
        print(f"Error: {model_key} not in available models")
        return None, None

    model_name = AVAILABLE_MODELS[model_key]
    save_path = CHECKPOINTS_DIR / model_key

    # A bare directory is not proof of a usable checkpoint: it is created
    # before anything is written, so an interrupted save leaves it behind.
    # The tokenizer is saved last, so its config file marks a finished save.
    checkpoint_complete = (
        (save_path / "config.json").is_file()
        and (save_path / "tokenizer_config.json").is_file()
    )

    if checkpoint_complete and not force_download:
        print(f"Model {model_key} already exists at {save_path}")
        print("Set force_download=True to re-download")
        return str(save_path), str(save_path)

    if save_path.exists() and not checkpoint_complete:
        print(f"Incomplete checkpoint found at {save_path}, downloading again")

    print(f"Downloading {model_name}...")

    try:
        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        # Fall back to EOS for padding when the tokenizer defines no pad token.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device if device != 'cpu' else None,
            trust_remote_code=True
        )

        # Save to checkpoints (model first, tokenizer last - see the
        # completeness check above).
        save_path.mkdir(parents=True, exist_ok=True)
        print(f"Saving to {save_path}...")

        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)

        print(f"✓ Successfully saved {model_key} to {save_path}")
        return str(save_path), str(save_path)

    except Exception as e:
        # Deliberate best-effort: report the problem and let the notebook continue.
        print(f"Error downloading {model_key}: {e}")
        return None, None
    
def load_saved_model(model_key):
    """Load a model and tokenizer previously saved under CHECKPOINTS_DIR.

    Args:
        model_key: Name of the checkpoint sub-directory inside
            ``CHECKPOINTS_DIR``.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` when the
        checkpoint directory is missing or loading failed. Errors are printed,
        not raised.
    """
    model_path = CHECKPOINTS_DIR / model_key

    if not model_path.exists():
        print(f"Error: Model {model_key} not found in {CHECKPOINTS_DIR}")
        print("Available models:")
        # The checkpoints directory itself may be missing; iterating it would
        # then raise instead of producing the intended error report.
        if CHECKPOINTS_DIR.is_dir():
            for p in CHECKPOINTS_DIR.iterdir():
                if p.is_dir():
                    print(f"  - {p.name}")
        return None, None

    try:
        print(f"Loading {model_key} from {model_path}...")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        # Load model
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map=device if device != 'cpu' else None,
            trust_remote_code=True
        )

        # Not every causal-LM class keeps its blocks under `.model.layers`.
        # Fall back to the config so that a purely informational print cannot
        # raise and throw away a model that loaded successfully.
        decoder_layers = getattr(getattr(model, "model", None), "layers", None)
        if decoder_layers is not None:
            num_layers = len(decoder_layers)
        else:
            num_layers = getattr(model.config, "num_hidden_layers", "unknown")

        print(f"✓ Successfully loaded {model_key}")
        print(f"Model type: {model.config.model_type}")
        print(f"Number of layers: {num_layers}")
        print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

        return model, tokenizer

    except Exception as e:
        # Deliberate best-effort: report the problem and let the notebook continue.
        print(f"Error loading {model_key}: {e}")
        return None, None

In [4]:
# Example usage - set MODEL_TO_DOWNLOAD to any key of AVAILABLE_MODELS
MODEL_TO_DOWNLOAD = "llama3.1-8b"  # e.g. "tinyllama"

model_path, tokenizer_path = download_and_save_model(MODEL_TO_DOWNLOAD)

# download_and_save_model returns (None, None) on failure; do not report a
# save location in that case.
if model_path is None:
    print(f"Model {MODEL_TO_DOWNLOAD} was not saved (see the error above)")
else:
    print(f"Model saved to: {model_path}")

Model llama3.1-8b already exists at ../checkpoints/llama3.1-8b
Set force_download=True to re-download
Model saved to: ../checkpoints/llama3.1-8b


## Load Saved Model

Load a previously saved model from checkpoints:

In [5]:

# Example usage: checkpoint names to load - change this list to the models you want
MODELS_TO_LOAD = ["llama3.1-8b", "tinyllama"]

# One load_saved_model() result per entry, in the same order as MODELS_TO_LOAD
loaded_models = list(map(load_saved_model, MODELS_TO_LOAD))

Loading llama3.1-8b from ../checkpoints/llama3.1-8b...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Successfully loaded llama3.1-8b
Model type: llama
Number of layers: 32
Parameters: 8030.3M
Loading tinyllama from ../checkpoints/tinyllama...
✓ Successfully loaded tinyllama
Model type: llama
Number of layers: 16
Parameters: 1235.8M


In [7]:
# Print the module tree of every loaded model.
# The second element is bound to a real name rather than `_`: assigning `_`
# in a notebook rebinds IPython's "last output" variable.
for model, tokenizer in loaded_models:
    print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps

## Test Model Inference

Quick test to verify the loaded model works:

transformers.models.llama.modeling_llama.LlamaForCausalLM

In [15]:
def test_model_inference(model, tokenizer, prompt="Hello, how are you?"):
    """Test model with simple inference"""
    
    if model is None or tokenizer is None:
        print("Model or tokenizer not loaded")
        return
    
    print(f"Testing with prompt: '{prompt}'")
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Response: {response}")

# Smoke-test every loaded model; entries whose load failed are skipped
for name, model_tokenizer in zip(MODELS_TO_LOAD, loaded_models):
    print('--------------------------------', name)
    model, tokenizer = model_tokenizer
    if model is None:
        continue
    test_model_inference(model, tokenizer)

-------------------------------- llama3.1-8b
Testing with prompt: 'Hello, how are you?'
Response: Hello, how are you? It seems like a simple greeting, but the meanings and interpretations of these two questions are complex and can influence our daily lives. In this article, we’ll explore the nuances of these simple questions and how they can affect our well-being, relationships, and
-------------------------------- tinyllama
Testing with prompt: 'Hello, how are you?'
Response: Hello, how are you? I'm just a simple AI assistant, I don't have feelings or emotions like humans do, but I'm here to help you with any questions or tasks you may have.

You could also like to start by telling me a bit about yourself. What


## List Available Models

Check what models are currently saved in checkpoints:

In [16]:
def list_saved_models():
    """Print and return the names of the checkpoints stored in CHECKPOINTS_DIR.

    A sub-directory counts as a model when it contains a ``config.json``.
    Each one is printed with its total on-disk size in MB (all files,
    recursively).

    Returns:
        List of the model directory names, in directory-iteration order.
    """
    print(f"Models in {CHECKPOINTS_DIR}:")

    found = []
    for candidate in CHECKPOINTS_DIR.iterdir():
        # Ignore plain files and directories without a model config
        if not candidate.is_dir() or not (candidate / "config.json").exists():
            continue

        total_bytes = 0
        for item in candidate.rglob('*'):
            if item.is_file():
                total_bytes += item.stat().st_size
        found.append((candidate.name, total_bytes / (1024**2)))

    if not found:
        print("  No models found")
    for model_name, megabytes in found:
        print(f"  - {model_name} ({megabytes:.1f} MB)")

    return [model_name for model_name, _ in found]

saved_models = list_saved_models()

Models in ../checkpoints:
  - tinyllama (2373.6 MB)
  - llama3.1-8b (15333.0 MB)
  - tinyllama_p_window (2048.1 MB)


## Calculate perplexity

In [17]:
# Enable IPython's autoreload extension (mode 2) for this kernel
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append(os.path.abspath(".."))  # add the parent folder to the import path

from pruninghealing.utils import calculate_perplexity
from datasets import load_from_disk

# Load the cached dataset (path is relative to the notebook's working directory)
dataset_path = "../../cached_dataset"
dataset = load_from_disk(dataset_path)


In [18]:
# Show the top-level keys of the loaded dataset object
dataset.keys()

dict_keys(['train', 'validation'])

In [22]:
# Evaluate every loaded model on the validation part of the cached dataset
for model_tokenizer, name in zip(loaded_models, MODELS_TO_LOAD):
    model, tokenizer = model_tokenizer
    print('--------------------------------', name)

    # load_saved_model returns (None, None) on failure; skip such entries
    # instead of crashing inside calculate_perplexity.
    if model is None or tokenizer is None:
        print("Model or tokenizer not loaded")
        continue

    # Plain (non-normalized) perplexity
    ppl = calculate_perplexity(model, tokenizer, dataset['validation'], normalized=False)
    print(f"Perplexity: {ppl:.3f}")

    # Normalized perplexity - relies on the helper's default for `normalized`
    # (presumably True; confirm in pruninghealing.utils)
    norm_ppl = calculate_perplexity(model, tokenizer, dataset['validation'])
    print(f"Normalized Perplexity: {norm_ppl:.3f}")


-------------------------------- llama3.1-8b


Perplexity:   0%|          | 0/1000 [00:00<?, ?sample/s, avg_loss=2.0562, ppl=7.82]

Perplexity: 100%|██████████| 1000/1000 [01:02<00:00, 16.13sample/s, avg_loss=2.5120, ppl=12.33]


Perplexity: 12.329


Perplexity: 100%|██████████| 1000/1000 [00:59<00:00, 16.67sample/s, avg_loss=2.5120, norm_loss=0.2136]


Normalized Perplexity: 0.214
-------------------------------- tinyllama


Perplexity: 100%|██████████| 1000/1000 [00:25<00:00, 38.48sample/s, avg_loss=3.1310, ppl=22.90]


Perplexity: 22.897


Perplexity: 100%|██████████| 1000/1000 [00:26<00:00, 37.08sample/s, avg_loss=3.1310, norm_loss=0.2662]


Normalized Perplexity: 0.266
