In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm
torch.cuda.is_available()
from copy import deepcopy



# local_model_path = "/home/user31/polina/Llama-3.2-3B-Instruct"

# model = AutoModelForCausalLM.from_pretrained(
#     local_model_path,
#     device_map="cuda:3",
#     # load_in_4bit=True,            # активируем 4-bit квантование
#     torch_dtype="auto"            # автоматически выбираем тип данных
# )

# tokenizer = AutoTokenizer.from_pretrained(local_model_path)



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from transformers import EvalPrediction
from datasets import load_dataset
from transformers import pipeline
import evaluate

import warnings
import transformers
import os 
# 1. Подавление всех Python-предупреждений
warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"
transformers.logging.set_verbosity_error()

In [3]:
DEVICE = model.device
DEVICE

device(type='cuda', index=3)

In [4]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e

In [5]:
print(model.lm_head)

Linear(in_features=3072, out_features=128256, bias=False)


## Prune model

In [6]:
import torch.nn as nn
from transformers import LlamaForCausalLM
import torch
import torch.nn as nn
from collections import Counter

def prune_layers(model, layers_to_remove):
    """
    Удаляет указанные слои из модели LlamaForCausalLM.

    Args:
        model (LlamaForCausalLM): Исходная модель.
        layers_to_remove (list): Индексы слоёв, которые нужно удалить.

    Returns:
        LlamaForCausalLM: Новая модель с удалёнными слоями.
    """
    new_model = LlamaForCausalLM(config=model.config)
    
    with torch.no_grad():
        # Копируем эмбеддинги
        new_model.model.embed_tokens.weight.data = model.model.embed_tokens.weight.data.clone()
        
        # Копируем оставшиеся слои энкодера
        new_layers = [
            layer for idx, layer in enumerate(model.model.layers) if idx not in layers_to_remove
        ]
        new_model.model.layers = torch.nn.ModuleList(new_layers)
        
        # Копируем параметры нормализации
        new_model.model.norm.weight.data = model.model.norm.weight.data.clone()
        
        # Копируем lm_head
        new_model.lm_head.weight.data = model.lm_head.weight.data.clone()

    return new_model

def reuse_previous_layers(model, layers_to_replace):
    """
    Заменяет указанные слои ссылками на предыдущие слои (повторное использование) в модели LlamaForCausalLM.

    Args:
        model (LlamaForCausalLM): Исходная модель.
        layers_to_replace (list): Индексы слоёв, которые нужно заменить ссылкой на предыдущий слой.

    Returns:
        LlamaForCausalLM: Новая модель с переиспользуемыми слоями.
    """
    new_model = LlamaForCausalLM(config=model.config)

    with torch.no_grad():
        # Копируем эмбеддинги
        new_model.model.embed_tokens.weight = model.model.embed_tokens.weight

        # Подготавливаем новые слои
        new_layers = []
        for idx, layer in enumerate(model.model.layers):
            if idx in layers_to_replace:
                if idx == 0:
                    raise ValueError("Нельзя заменить нулевой слой на предыдущий — его не существует.")
                # Используем ссылку на предыдущий слой
                new_layers.append(model.model.layers[idx - 1])
            else:
                new_layers.append(layer)

        new_model.model.layers = torch.nn.ModuleList(new_layers)

        # Копируем параметры нормализации
        new_model.model.norm.weight = model.model.norm.weight

        # Копируем lm_head
        new_model.lm_head.weight = model.lm_head.weight

    return new_model



def print_param_count_table(module, simple_module_types=(nn.Linear, nn.Embedding)):
    """
    Подсчитывает и выводит в табличной форме количество подмодулей 
    с определённым числом параметров.
    
    Args:
        module (nn.Module): модель для анализа
        simple_module_types (tuple): типы подмодулей для подсчёта параметров
    """
    param_counts = []
    for submodule in module.modules():
        if type(submodule) in simple_module_types:
            num_params = sum(p.numel() for p in submodule.parameters() if p.requires_grad)
            param_counts.append(num_params)

    total_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
    
    counter = Counter(param_counts)
    
    print(f"Total trainable parameters in model: {total_params:,}\n")
    print(f"{'Parameters per submodule':>24} | {'Number of such submodules':>25}")
    print("-" * 54)
    for params_num, count in sorted(counter.items()):
        print(f"{params_num:24,} | {count:25,}")

In [7]:
print_param_count_table(model), type(model)

Total trainable parameters in model: 3,212,749,824

Parameters per submodule | Number of such submodules
------------------------------------------------------
               3,145,728 |                        56
               9,437,184 |                        56
              25,165,824 |                        84
             394,002,432 |                         2


(None, transformers.models.llama.modeling_llama.LlamaForCausalLM)

In [8]:
model_slimmed = prune_layers(model, layers_to_remove=[17])

In [9]:
print_param_count_table(model_slimmed), type(model_slimmed)

Total trainable parameters in model: 3,112,080,384

Parameters per submodule | Number of such submodules
------------------------------------------------------
               3,145,728 |                        54
               9,437,184 |                        54
              25,165,824 |                        81
             394,002,432 |                         2


(None, transformers.models.llama.modeling_llama.LlamaForCausalLM)

## Quality of answers

In [7]:
def get_pipeline(model, tokenizer = tokenizer):
    return pipeline("text-generation", model=model, tokenizer=tokenizer, device_map=model.device )

def test_quality_pipeline(pipeline, prompt = 'Hello, tell me about the situation in Russia this year'):
    print(pipeline(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"])

In [5]:
local_model_path = "/home/user31/polina/Llama-3.2-3B-Instruct"
pruned_model_path = "/home/user31/igor/Llama-3.2-3B-Instruct-pruned"

model = AutoModelForCausalLM.from_pretrained(
    pruned_model_path,
    device_map="cuda:3",
    # load_in_4bit=True,            # активируем 4-bit квантование
    torch_dtype="auto"            # автоматически выбираем тип данных
)

tokenizer = AutoTokenizer.from_pretrained(local_model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
pipe = get_pipeline(model, tokenizer = tokenizer)


In [9]:
test_quality_pipeline(pipe)

Hello, tell me about the situation in Russia this year so thought I must thank you for reporting it so I must thank you for reporting it so I must thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank t

In [10]:
test_quality_pipeline(pipe)

Hello, tell me about the situation in Russia this year so thought I must thank you for reporting it so I must thank you for reporting it so I must thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank thank t

## LORA config

## Dataset

In [17]:
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

# Токенизируем датасет
def tokenize_function(examples):
    # Устанавливаем максимальную длину и задаем параметры padding и truncation
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
def format_dataset(examples):
    # GPT-2 использует те же `input_ids` в качестве `labels`
    examples["labels"] = examples["input_ids"].copy()
    return examples

tokenized_datasets = tokenized_datasets.map(format_dataset, batched=True)

# Подготавливаем данные для тренировки
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(100))
full_eval_dataset = tokenized_datasets["validation"]

In [21]:
full_train_dataset.shape, small_eval_dataset.shape

((36718, 3), (100, 3))

## lora adpter

In [19]:
type(model_slimmed)

peft.peft_model.PeftModelForCausalLM

In [14]:
lora_config = LoraConfig(
    r=8,                      # размер low-rank
    lora_alpha=32,            # масштаб
    target_modules=["lm_head"],  # важно: имя слоя внутри mlp
    lora_dropout=0.05,        # dropout перед адаптером
    bias="none",              # bias не трогаем
    task_type=TaskType.CAUSAL_LM  # указываем тип задачи
)

# Применяем LoRA к модели
model_slimmed = get_peft_model(model_slimmed, lora_config)


In [15]:
type(model_slimmed)

peft.peft_model.PeftModelForCausalLM

In [None]:
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.tensorboard import SummaryWriter
import time

# Кастомный Trainer с расширенным логгированием
class IterationLimitedTrainer(Trainer):
    def __init__(self, *args, max_iterations=1000, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_iterations = max_iterations
        self.writer = SummaryWriter(log_dir=self.args.logging_dir)
        self.start_time = time.time()
        
    def training_step(self, model, inputs, num_steps=None):
        # Ограничение по числу итераций
        if self.state.global_step >= self.max_iterations:
            self.control.should_training_stop = True
            return torch.tensor(0.0).to(self.args.device)
        
        # Стандартный шаг обучения
        loss = super().training_step(model, inputs, num_steps)
        
        # Логгирование на каждом шаге
        if self.state.global_step % self.args.logging_steps == 0:
            perplexity = torch.exp(loss.detach())
            step_time = time.time() - self.start_time
            
            # Основные метрики
            self.log({
                "loss": loss.item(),
                "perplexity": perplexity.item(),
                "iterations": self.state.global_step,
                "step_time": step_time
            })
            
            # TensorBoard логгирование
            self.writer.add_scalar("train/loss", loss.item(), self.state.global_step)
            self.writer.add_scalar("train/perplexity", perplexity.item(), self.state.global_step)
            self.writer.add_scalar("train/learning_rate", self._get_learning_rate(), self.state.global_step)
            
            # Сброс таймера для следующего шага
            self.start_time = time.time()
        
        return loss

# Конфигурация обучения
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    do_eval=True,
    max_steps=1000,  # Ограничение по числу итераций
    logging_dir="./logs",
    save_steps=400,
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    logging_strategy="steps",
    logging_steps=50,  # Логгировать каждые 10 шагов
    eval_steps=500,     # Оценка каждые 50 шагов

    report_to="tensorboard",
    eval_strategy="steps",
    save_total_limit=2,
    gradient_accumulation_steps=2,
    warmup_steps=100,   # Прогрев для первых 100 шагов
)


# Инициализация тренера
trainer = IterationLimitedTrainer(
    model=model_slimmed,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    max_iterations=500  # Кастомное ограничение (приоритетнее max_steps)
)

# Запуск обучения
try:
    results = trainer.train()
except KeyboardInterrupt:
    print("Обучение прервано пользователем")
finally:
    # Сохранение финальной модели
    trainer.save_model("final_model")
    trainer.writer.close()  # Важно закрыть writer

{'loss': 6.950817584991455, 'perplexity': 1044.0029296875, 'iterations': 0, 'step_time': 3.06370210647583, 'epoch': 0}
{'loss': 6.7499260902404785, 'perplexity': 853.9956665039062, 'iterations': 0, 'step_time': 1.3029866218566895, 'epoch': 0}
{'loss': 6.3779, 'grad_norm': 0.5703125, 'learning_rate': 9.8e-05, 'epoch': 0.005446919766871834}
{'loss': 0.27523621916770935, 'perplexity': 1.316841721534729, 'iterations': 50, 'step_time': 155.27723860740662, 'epoch': 0.005446919766871834}
{'loss': 0.021796170622110367, 'perplexity': 1.0220354795455933, 'iterations': 50, 'step_time': 1.3078052997589111, 'epoch': 0.005446919766871834}
{'loss': 0.3054, 'grad_norm': 1.1640625, 'learning_rate': 0.00019800000000000002, 'epoch': 0.010893839533743668}
{'loss': 0.10975276678800583, 'perplexity': 1.116002082824707, 'iterations': 100, 'step_time': 155.51432919502258, 'epoch': 0.010893839533743668}
{'loss': 0.007229093462228775, 'perplexity': 1.0072553157806396, 'iterations': 100, 'step_time': 1.285993337

In [17]:
%load_ext tensorboard


In [19]:
%tensorboard --logdir ./logs

Reusing TensorBoard on port 6006 (pid 319862), started 0:00:04 ago. (Use '!kill 319862' to kill it.)

In [23]:
model_path = './output/checkpoint-375/'
trained_model_slimmed = AutoModelForCausalLM.from_pretrained(model_path , device_map="cuda:3")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
pipe =  get_pipeline(trained_model_slimmed, tokenizer = tokenizer)



Hello everyone! I'm excited to share with you my latest project, a new recipe for a delicious and healthy dessert that I like to call "Berry Bliss Bites." These bite-sized treats are perfect for hot summer days when you need a sweet and refreshing pick-me-up.

**Berry Bliss Bites Recipe**

**Ingredients:**

- 1 cup rolled oats
- 1/2 cup almond butter
- 1/4 cup honey
- 1/4 cup chopped fresh berries (such as blueberries, strawberries, or raspberries)
- 1/4 cup chopped nuts (such as almonds or walnuts)
- 1/4 cup shredded coconut (optional)
- Pinch of salt

**Instructions:**

1. In a large mixing bowl, combine the oats and almond butter. Mix until well combined and a dough forms.
2. Add the honey and mix until the dough is smooth and creamy.
3. Fold in the chopped berries and nuts.
4. If using


In [28]:
test_quality_pipeline(pipe, prompt = 'Hello, how are you?')


Hello, how are you? 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

