# Model Preparation Notebook

This notebook downloads, saves, and loads pre-trained models for use with the pruninghealing library.

In [1]:
import os

# Device selection: restrict this process to a single GPU, addressed by PCI bus order.
# Both variables are set before `torch` is imported further down in this cell.
i = 3  # device number to use (change this to select GPU: 0, 1, 2, etc.)
os.environ.update(
    CUDA_DEVICE_ORDER='PCI_BUS_ID',
    CUDA_VISIBLE_DEVICES=str(i),
)

import inspect
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Check device: fall back to the CPU when torch reports no usable CUDA device.
# The banner shows the physical GPU number `i`, while the name is queried for
# torch index 0 (only one device is exposed by the environment set up above).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
_device_banner = (
    f'Using GPU device {i}: {torch.cuda.get_device_name(0)}'
    if device == 'cuda'
    else 'Using CPU'
)
print(_device_banner)

# Set up paths: checkpoints are stored in ../checkpoints, resolved against the
# current working directory; the directory is created on first use.
CHECKPOINTS_DIR = Path("../checkpoints")
CHECKPOINTS_DIR.mkdir(exist_ok=True)

print(f"Checkpoints directory: {CHECKPOINTS_DIR.absolute()}")

Using GPU device 3: NVIDIA A100-PCIE-40GB
Checkpoints directory: /home/ThunderstormXX/Ridiculous-LLM-Compression/polina_experiments/results/../checkpoints


## Available Models

Select from the following pre-trained models:

In [2]:
# Manual alternative from a shell:
# huggingface-cli download unsloth/Llama-3.1-8B-Instruct --local-dir ./Llama-3.1-8B-Instruct --local-dir-use-symlinks False

# Available models configuration: short alias -> Hugging Face Hub repository id.
# NOTE: the "tinyllama" alias points at unsloth/Llama-3.2-1B-Instruct, not at a
# TinyLlama repository.
AVAILABLE_MODELS = dict([
    ("llama3.1-8b", "unsloth/Llama-3.1-8B-Instruct"),
    ("llama2-13b", "meta-llama/Llama-2-13b-hf"),
    ("mistral-7b", "mistralai/Mistral-7B-v0.1"),
    ("phi2", "microsoft/phi-2"),
    ("qwen-7b", "Qwen/Qwen-7B"),
    ("tinyllama", "unsloth/Llama-3.2-1B-Instruct"),
])

# One line per entry, in insertion order, under a fixed header.
print(
    "Available models:",
    *(f"  {alias}: {repo_id}" for alias, repo_id in AVAILABLE_MODELS.items()),
    sep="\n",
)

Available models:
  llama3.1-8b: unsloth/Llama-3.1-8B-Instruct
  llama2-13b: meta-llama/Llama-2-13b-hf
  mistral-7b: mistralai/Mistral-7B-v0.1
  phi2: microsoft/phi-2
  qwen-7b: Qwen/Qwen-7B
  tinyllama: unsloth/Llama-3.2-1B-Instruct


## Download and Save Model

Choose a model to download and save:

In [3]:
def download_and_save_model(model_key, force_download=False):
    """Download a model and its tokenizer and save both under CHECKPOINTS_DIR.

    Args:
        model_key: Alias from AVAILABLE_MODELS. It is also used as the name of
            the sub-directory created inside CHECKPOINTS_DIR.
        force_download: When True, download and save again even if a complete
            checkpoint is already present.

    Returns:
        ``(model_path, tokenizer_path)`` as strings; both point at the same
        directory. ``(None, None)`` is returned when ``model_key`` is unknown
        or when downloading/saving fails (the error is printed, not raised).
    """
    if model_key not in AVAILABLE_MODELS:
        print(f"Error: {model_key} not in available models")
        return None, None

    model_name = AVAILABLE_MODELS[model_key]
    save_path = CHECKPOINTS_DIR / model_key

    # A directory alone does not prove a usable checkpoint: an earlier run may
    # have failed half-way. The tokenizer is saved after the model below, so
    # the presence of both marker files indicates that a save ran to the end.
    checkpoint_complete = all(
        (save_path / marker).is_file()
        for marker in ("config.json", "tokenizer_config.json")
    )

    if not force_download:
        if checkpoint_complete:
            print(f"Model {model_key} already exists at {save_path}")
            print("Set force_download=True to re-download")
            return str(save_path), str(save_path)
        if save_path.exists():
            print(f"Found incomplete checkpoint at {save_path}; downloading again")

    print(f"Downloading {model_name}...")

    try:
        # Load tokenizer
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        if tokenizer.pad_token is None:
            # Reuse EOS as padding so the saved tokenizer always has a pad token.
            tokenizer.pad_token = tokenizer.eos_token

        # Load model
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device if device != 'cpu' else None,
            trust_remote_code=True
        )

        # Save to checkpoints; parents=True also recreates CHECKPOINTS_DIR if
        # it was removed after the setup cell ran.
        save_path.mkdir(parents=True, exist_ok=True)
        print(f"Saving to {save_path}...")

        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)

        print(f"✓ Successfully saved {model_key} to {save_path}")
        return str(save_path), str(save_path)

    except Exception as e:
        # Best-effort by design: report the problem and let the caller check
        # for the (None, None) result instead of aborting the notebook here.
        print(f"Error downloading {model_key}: {e}")
        return None, None

# Example usage - set MODEL_TO_DOWNLOAD to any key of AVAILABLE_MODELS
MODEL_TO_DOWNLOAD = "tinyllama"  # e.g. "llama3.1-8b"

model_path, tokenizer_path = download_and_save_model(MODEL_TO_DOWNLOAD)
if model_path is None:
    # download_and_save_model signals failure with (None, None); do not claim
    # that something was saved in that case.
    print(f"Model {MODEL_TO_DOWNLOAD} was not saved (see the message above)")
else:
    print(f"Model saved to: {model_path}")

Downloading unsloth/Llama-3.2-1B-Instruct...
Loading tokenizer...
Loading model...
Saving to ../checkpoints/tinyllama...
✓ Successfully saved tinyllama to ../checkpoints/tinyllama
Model saved to: ../checkpoints/tinyllama


## Load Saved Model

Load a previously saved model from checkpoints:

In [4]:
def load_saved_model(model_key):
    """Load a model and its tokenizer from a sub-directory of CHECKPOINTS_DIR.

    Args:
        model_key: Name of the checkpoint directory inside CHECKPOINTS_DIR.

    Returns:
        ``(model, tokenizer)`` on success. ``(None, None)`` when the directory
        does not exist or loading fails (the error is printed, not raised).
    """
    model_path = CHECKPOINTS_DIR / model_key

    if not model_path.exists():
        print(f"Error: Model {model_key} not found in {CHECKPOINTS_DIR}")
        print("Available models:")
        # CHECKPOINTS_DIR itself may be missing, in which case iterdir() would
        # raise; sorting keeps the listing stable between runs.
        if CHECKPOINTS_DIR.is_dir():
            for p in sorted(CHECKPOINTS_DIR.iterdir()):
                if p.is_dir():
                    print(f"  - {p.name}")
        return None, None

    try:
        print(f"Loading {model_key} from {model_path}...")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        # Load model
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map=device if device != 'cpu' else None,
            trust_remote_code=True
        )

    except Exception as e:
        print(f"Error loading {model_key}: {e}")
        return None, None

    # The summary is kept outside the try block so that a reporting problem can
    # never turn a successful load into a (None, None) result.
    print(f"✓ Successfully loaded {model_key}")
    print(f"Model type: {model.config.model_type}")

    # Read the depth from the config rather than from `model.model.layers`:
    # that attribute path is architecture-specific (GPT-2-style models, for
    # example, keep their blocks under `transformer.h`).
    num_layers = getattr(model.config, "num_hidden_layers", None)
    if num_layers is None:
        num_layers = "unknown"
    print(f"Number of layers: {num_layers}")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

    return model, tokenizer

# Example usage - set MODEL_TO_LOAD to a directory name inside CHECKPOINTS_DIR
MODEL_TO_LOAD = "tinyllama"  # e.g. "llama3.1-8b"

model, tokenizer = load_saved_model(MODEL_TO_LOAD)

# load_saved_model reports failure by returning (None, None). The cells below
# dereference `model`, so stop here with a clear message instead of failing
# later with an unrelated AttributeError on None.
if model is None:
    raise RuntimeError(f"Could not load {MODEL_TO_LOAD!r}; see the message printed above")

Loading tinyllama from ../checkpoints/tinyllama...
✓ Successfully loaded tinyllama
Model type: llama
Number of layers: 16
Parameters: 1235.8M


In [5]:
# Display the module tree of the loaded model.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-0

In [10]:
# Take the decoder layer at index 1 (the second one) and display its structure.
layer = model.model.layers[1]
layer

LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
    (k_proj): Linear(in_features=2048, out_features=512, bias=False)
    (v_proj): Linear(in_features=2048, out_features=512, bias=False)
    (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
    (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
    (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
  (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
)

In [17]:
# Display the configuration object attached to the loaded model.
model.config

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float16",
  "transformers_version": "4.54.0",
  "unsloth_fixed": true,
  "use_cache": true,
  "vocab_size": 128256
}

In [15]:
# Print the source code of the decoder layer's class.
# `inspect` is imported with the other modules in the first code cell.
layer_class = type(layer)

print(inspect.getsource(layer_class))

class LlamaDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: LlamaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx)

        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torc

In [19]:
attn = layer.self_attn
config = attn.config

# Number of attention heads; fall back to the `n_heads` attribute name when the
# config does not define `num_attention_heads`.
num_heads = getattr(config, 'num_attention_heads', getattr(config, 'n_heads', None))
assert num_heads is not None, "Не найдено число голов в config"

# Prefer an explicit `head_dim` from the config and derive it from the hidden
# size only when it is absent or None: the two are not guaranteed to agree.
head_dim = getattr(config, 'head_dim', None) or config.hidden_size // num_heads

# 14th head (index 13)
idx = 13
# An out-of-range index would silently produce an empty slice below.
assert 0 <= idx < num_heads, f"head index {idx} out of range for {num_heads} heads"
q_proj_weight = attn.q_proj.weight  # [num_heads * head_dim, hidden_size]
# Rows idx*head_dim .. (idx+1)*head_dim of the projection belong to head `idx`.
q_weight_14 = q_proj_weight[idx*head_dim:(idx+1)*head_dim, :]

In [22]:
# Shape of the full query projection matrix.
q_proj_weight.shape

torch.Size([2048, 2048])

In [23]:
# Shape of the slice selected for the single head above.
q_weight_14.shape


torch.Size([64, 2048])