In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import torch.nn as nn
from collections import Counter
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.tensorboard import SummaryWriter
import time
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm
torch.cuda.is_available()
from copy import deepcopy
from transformers import pipeline
import torch.nn as nn
from transformers import LlamaForCausalLM
import torch
import torch.nn as nn
from collections import Counter
from copy import deepcopy

from perplexity_eval import evaluate_model_perplexity  # убедись, что он доступен

In [21]:
def prune_layers(model, layers_to_remove):
    """
    Build a new LlamaForCausalLM with the given decoder layers removed.

    The returned model holds deep copies of the kept decoder layers, so the
    source model is left untouched and later in-place changes to either model
    (for example wrapping sub-modules with adapters) do not leak into the
    other. The copies duplicate the kept layers' weights in memory.

    Args:
        model (LlamaForCausalLM): Source model; it is not modified.
        layers_to_remove (Iterable[int]): 0-based indices into
            ``model.model.layers`` of the decoder layers to drop.

    Returns:
        LlamaForCausalLM: New model with the remaining layers in their
        original order, moved to ``model.device``, whose
        ``config.num_hidden_layers`` equals the number of kept layers.

    Raises:
        ValueError: If any index is outside ``[0, len(model.model.layers))``.
    """
    total_layers = len(model.model.layers)
    drop = set(layers_to_remove)

    # Previously an out-of-range index was silently ignored and nothing was pruned.
    invalid = sorted(idx for idx in drop if not 0 <= idx < total_layers)
    if invalid:
        raise ValueError(
            f"layers_to_remove contains indices outside 0..{total_layers - 1}: {invalid}"
        )

    keep = [idx for idx in range(total_layers) if idx not in drop]

    # Size the skeleton for the pruned depth up front instead of allocating
    # (and then discarding) the full stack of freshly initialised layers.
    pruned_config = deepcopy(model.config)
    pruned_config.num_hidden_layers = len(keep)
    new_model = LlamaForCausalLM(config=pruned_config)

    with torch.no_grad():
        # Copy the token embeddings.
        new_model.model.embed_tokens.weight.data = model.model.embed_tokens.weight.data.clone()

        # Copy the kept decoder layers. Deep copies keep the two models
        # independent; the per-layer index is renumbered so it stays contiguous
        # (it is presumably what the attention module uses to address its
        # key/value cache slot — see the transformers cache documentation).
        kept_layers = []
        for new_idx, old_idx in enumerate(keep):
            layer = deepcopy(model.model.layers[old_idx])
            attn = getattr(layer, "self_attn", None)
            if hasattr(attn, "layer_idx"):
                attn.layer_idx = new_idx
            kept_layers.append(layer)
        new_model.model.layers = torch.nn.ModuleList(kept_layers)

        # Copy the final normalization weights.
        new_model.model.norm.weight.data = model.model.norm.weight.data.clone()

        # Keep the config in sync with the actual depth before any save.
        new_model.config.num_hidden_layers = len(new_model.model.layers)

        # Copy the output projection (lm_head).
        new_model.lm_head.weight.data = model.lm_head.weight.data.clone()

    return new_model.to(model.device)

## BASE MODEL

In [22]:
# Checkpoint location. Set the LLAMA_MODEL_PATH environment variable to point
# at a different copy; otherwise the original local directory is used.
local_model_path = os.environ.get(
    "LLAMA_MODEL_PATH", "/home/user31/polina/Llama-3.2-3B-Instruct"
)

# === Load the model and tokenizer ===
# device_map="cuda:0" requests placement on the first CUDA device, so this cell
# needs a GPU; torch_dtype="auto" defers the dtype choice to the checkpoint.
model = LlamaForCausalLM.from_pretrained(
    local_model_path,
    device_map="cuda:0",
    torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
# Inspect the sub-modules of a single decoder layer (index 15) of the base model.
model.model.layers[15]

LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
    (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
    (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
    (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
    (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
    (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
  (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
)

In [24]:
# evaluate_model_perplexity(model,tokenizer)

## PRUNING

In [25]:
# Drop decoder layer 17 (0-based) from the base model.
# NOTE(review): the choice of layer 17 is not explained here — TODO document why.
pruned_model = prune_layers(model , layers_to_remove=[17])

In [26]:
# Show the pruned model's module tree; it should list one decoder layer fewer
# than the base model.
pruned_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-26): 27 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e

In [27]:
# Show the base model's module tree for comparison with the pruned model.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e

In [28]:
from peft import get_peft_model, LoraConfig, TaskType

def apply_lora_to_layers(
    model,
    layer_indices,
    r=8,
    alpha=32,
    dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
):
    """
    Attach LoRA adapters to the projections of the selected decoder layers.

    For every requested layer the attention projections (q/k/v/o) and the MLP
    projections (gate/up/down) are targeted by their full module names,
    ``model.layers.<idx>.<submodule>``.

    :param model: model of type LlamaForCausalLM (must expose ``model.layers``)
    :param layer_indices: 0-based indices of the decoder layers that get LoRA;
        duplicates are ignored
    :param r: LoRA rank
    :param alpha: LoRA alpha parameter
    :param dropout: dropout used inside the LoRA adapters
    :param bias: LoRA bias mode ('none', 'all', 'lora_only')
    :param task_type: PEFT task type (usually TaskType.CAUSAL_LM)
    :return: the model returned by ``get_peft_model`` with the LoRA adapters
    :raises ValueError: if ``layer_indices`` is empty or contains an index
        outside ``[0, len(model.model.layers))``
    """
    # De-duplicate while keeping the caller's order.
    layer_indices = list(dict.fromkeys(layer_indices))
    if not layer_indices:
        raise ValueError("layer_indices must contain at least one layer index")

    # An out-of-range index would yield module names that match nothing, so the
    # request would be dropped without any indication; fail loudly instead.
    num_layers = len(model.model.layers)
    out_of_range = [idx for idx in layer_indices if not 0 <= idx < num_layers]
    if out_of_range:
        raise ValueError(
            f"layer_indices contains indices outside 0..{num_layers - 1}: {out_of_range}"
        )

    projections = (
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
    )
    target_modules = [
        f"model.layers.{layer_idx}.{projection}"
        for layer_idx in layer_indices
        for projection in projections
    ]

    peft_config = LoraConfig(
        r=r,
        lora_alpha=alpha,
        target_modules=target_modules,
        lora_dropout=dropout,
        bias=bias,
        task_type=task_type
    )

    return get_peft_model(model, peft_config)


In [None]:

# Decoder layers of the pruned model that receive LoRA adapters.
# NOTE(review): this cell originally left `layer_indices` blank, which is a
# syntax error. [16, 17] are the layers on either side of the removed layer 17
# in the pruned model's 0-based indexing — TODO confirm the intended set.
lora_layer_indices = [16, 17]

lora_pruned_model = apply_lora_to_layers(
    pruned_model,
    layer_indices=lora_layer_indices,
    r=8,
    alpha=32,
    dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [None]:
# Perplexity of the pruned model.
# NOTE(review): the saved output below reports "Using 50 samples", which does
# not match num_samples = 400 — the output looks stale; re-run to refresh it.
evaluate_model_perplexity(pruned_model,tokenizer, num_samples = 400)

--- Running Perplexity Evaluation on device: cuda ---
Loading dataset: 'wikitext' (config: 'wikitext-103-raw-v1', split: 'test')...
Using 50 samples from the dataset.
Tokenizing the text...
Calculating perplexity with max_length=1024 and stride=512...


Calculating Perplexity: 100%|█████████████████████████████████████████| 5/5 [00:02<00:00,  2.16it/s]


--------------------------------------------------
✅ Final Perplexity: 14.1014
--------------------------------------------------


14.101369857788086