# **Импорты**

In [14]:
!pip install wandb ruff

Collecting ruff
  Downloading ruff-0.4.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ruff
Successfully installed ruff-0.4.1


In [15]:
from google.colab import userdata

# Bind the token to a name instead of leaving the call as the cell's last
# expression: a bare expression is echoed into the saved cell output, which
# exposes the secret to anyone who can read the notebook file.
hf_token = userdata.get("HF_TOKEN")

'<REDACTED: Hugging Face access token was printed here; revoke and rotate it>'

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import get_linear_schedule_with_warmup

from tqdm import tqdm

import wandb

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

# для теста
# from datasets import load_dataset

from torch.amp import autocast

In [17]:
# from huggingface_hub import notebook_login

# notebook_login()

# **Датасет - чек**

In [None]:
# `load_dataset` is imported here because the notebook-level import of it is
# commented out, so this cell would otherwise raise NameError on a fresh kernel.
# Requires the Hugging Face `datasets` package to be installed.
from datasets import load_dataset

train_data = load_dataset("SirNeural/flan_v2", split="train")

In [None]:
# Tokenize every (inputs, targets) pair into a fixed-length tensor of token ids.
# NOTE(review): `tokenizer` is not created above this cell in file order;
# presumably it comes from a cell executed earlier — confirm the run order.
emb = []
for row in tqdm(train_data):  # read each row once instead of indexing it twice
    inputs = row["inputs"]
    targets = row["targets"]

    train_row = f"{inputs}, {targets}, {tokenizer.eos_token}"
    # Pad or truncate to exactly 1024 tokens and return a PyTorch tensor.
    embeded_row = tokenizer.encode(
        train_row,
        padding="max_length",
        max_length=1024,
        truncation=True,
        return_tensors="pt",
    )

    emb.append(embeded_row)
# Show only the first few encoded rows.
emb[:5]

100%|██████████| 100000/100000 [05:02<00:00, 330.48it/s]


[tensor([[   2,    5, 1532,  ..., 4370, 2461,    3]]),
 tensor([[   2,    5, 1532,  ...,    0,    0,    0]]),
 tensor([[    2,     5, 10126,  ...,     5,   207,     3]]),
 tensor([[   2, 6330,  244,  ..., 2864,    5,    3]]),
 tensor([[   2,    5, 1532,  ...,    0,    0,    0]])]

In [None]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): `tokenizer` and `model` are not created above this cell in file
# order; presumably they come from a cell executed earlier — confirm.
tokenizer.pad_token = tokenizer.eos_token
# Resize the model's token embeddings to the tokenizer's current length; the
# returned embedding module is displayed as the cell output.
model.resize_token_embeddings(len(tokenizer))

Embedding(30000, 2048)

In [None]:
# Wrap the list of encoded rows as a single dataset, then serve it in shuffled
# batches of 16. Note that `train_dataset` is a DataLoader, not a Dataset.
emb_dataset = torch.utils.data.ConcatDataset(datasets=[emb])
train_dataset = DataLoader(dataset=emb_dataset, shuffle=True, batch_size=16)

# **LLM**

In [None]:
# Load the tokenizer and causal-LM weights for the Pythia-1B checkpoint.
checkpoint = "EleutherAI/pythia-1b"
tok = AutoTokenizer.from_pretrained(checkpoint)
mod = AutoModelForCausalLM.from_pretrained(checkpoint)
# Bare last expression: the notebook displays the model's repr.
mod

In [None]:
class EMA(nn.Module):
    """Exponential moving average over a model's trainable parameters.

    Every call updates a per-parameter shadow copy with
    ``shadow -= (1 - decay) * (shadow - param)`` and then writes the shadow
    values back into the model's parameters.

    Args:
        decay: Fraction of the previous shadow value kept on each update.
    """

    def __init__(self, decay: float):
        super().__init__()
        self.decay = decay
        # Parameter name -> shadow tensor; filled lazily on the first call.
        self.shadow_params = {}

    def forward(self, model: nn.Module):
        """Update the shadow values from ``model`` and copy them into it.

        Only parameters with ``requires_grad=True`` are tracked. The first time
        a parameter is seen its shadow is initialised to the current value, so
        the parameter itself is left unchanged on that call.

        Args:
            model: Module whose parameters are averaged and overwritten in place.
        """
        # The averaging is bookkeeping, not part of the loss: keep it out of autograd.
        with torch.no_grad():
            for name, params in model.named_parameters():
                if not params.requires_grad:
                    continue

                shadow = self.shadow_params.get(name)
                if shadow is None:
                    # Independent copy; the parameter already equals its shadow.
                    self.shadow_params[name] = params.detach().clone()
                    continue

                # shadow_variable -= (1 - decay) * (shadow_variable - variable)
                shadow.sub_((1 - self.decay) * (shadow - params))
                # Copy values instead of rebinding `.data`: rebinding would make the
                # parameter share storage with the shadow, so later optimizer steps
                # would overwrite the average and every further update would be a no-op.
                params.copy_(shadow)


ema = EMA(0.5)

In [None]:
def freeze(model: nn.Module):
    """Disable gradient tracking for every parameter of ``model``, in place."""
    model.requires_grad_(False)

In [None]:
# Optimizer and LR schedule: linear warmup over the first 10% of
# `training_steps`, as configured by the scheduler arguments below.
training_steps = 1000
optimizer = AdamW(mod.parameters())
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=training_steps // 10, num_training_steps=training_steps
)

# Experiment tracking: the W&B API key is read from the Colab secret store.
wandb.login(key=userdata.get("WANDB_KEY"), relogin=True)
wandb.init(sync_tensorboard=True, name="test", project="hse-project", entity="aid_")

# Use the first GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
mod.to(device)

# NOTE(review): this turns off requires_grad on every parameter of `mod`, which
# are the same parameters handed to the optimizer above. With nothing trainable,
# the backward pass in `one_epoch` would presumably fail — confirm the intent.
freeze(mod)


def one_epoch(model, data):
    """Run one optimisation pass over ``data`` and leave ``model`` in eval mode.

    Uses the notebook-level ``optimizer``, ``scheduler``, ``device`` and the
    active ``wandb`` run, all of which must exist before the call.

    Args:
        model: Module called as ``model(input_ids=..., labels=...)`` whose
            result exposes a ``"loss"`` entry.
        data: Iterable of token-id batches; each batch is viewed as
            ``(batch.shape[0], batch.shape[-1])`` before the forward pass.
    """
    model.train()

    for batch in data:
        # Drop the intermediate dimension(s) so the model sees a 2-D batch.
        batch = batch.view(batch.shape[0], batch.shape[-1])

        t = batch.to(device)

        optimizer.zero_grad()

        # Take the autocast device from the batch itself and enable it only on
        # CUDA; hard-coding "cuda" emits a warning on CPU-only machines.
        with autocast(device_type=t.device.type, enabled=t.is_cuda):
            loss = model(input_ids=t, labels=t)["loss"]

        # Log a plain float rather than a tensor still attached to the autograd graph.
        wandb.log({"loss": loss.item()})

        loss.backward()
        optimizer.step()
        scheduler.step()
        # ema(model)

    model.eval()

# **Experiments with different LLMs**

**Gemma-2b**

In [None]:
# Smoke test: load Gemma-2B, generate a continuation for one prompt, print it.
checkpoint = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# NOTE(review): the saved output under this cell shows a different prompt than
# this string, so that output looks stale — re-run the cell to refresh it.
input_text = "Какого ."
# Despite its name, `input_ids` holds the whole tokenizer result, which is
# unpacked into `generate` as keyword arguments.
input_ids = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))

# Drop the references so the names can be reused by the next experiment.
del tokenizer
del model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



<bos>Write me a poem about Machine Learning.

I’m not sure what you mean by “


**Phi-1.5**

In [12]:
# Smoke test: load Phi-1.5 and generate an answer to one English question.
checkpoint = "microsoft/phi-1_5"

# config = AutoConfig.from_pretrained(checkpoint, max_new_tokens = 128)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

input_text = "How many legs does a horse have?"
# Despite its name, `input_ids` holds the whole tokenizer result (the printed
# value below contains both `input_ids` and `attention_mask`).
input_ids = tokenizer(input_text, return_tensors="pt")

# Show the encoded prompt.
print(input_ids)

# Generation is limited to 32 new tokens.
outputs = model.generate(**input_ids, max_new_tokens=32)

# Show the raw generated token ids.
print(outputs)

# Show the decoded text of the first (only) sequence.
print(tokenizer.decode(outputs[0]))

{'input_ids': tensor([[2437,  867, 7405,  857,  257, 8223,  423,   30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[ 2437,   867,  7405,   857,   257,  8223,   423,    30,   198, 33706,
            25,   317,  8223,   468,  1440,  7405,    13,   198,   198,  3109,
         23697,   513,    25,  1867,   318,   262,  4007,   286,   257,  8223,
           338,  7894,    30,   198, 33706,    25,   317,  8223,   338,  7894]])
How many legs does a horse have?
Answer: A horse has four legs.

Exercise 3: What is the purpose of a horse's tail?
Answer: A horse's tail


In [10]:
# Try the same model on a Russian prompt ("What colour is the sky?").
# Relies on `tokenizer` and `model` still being loaded from the Phi-1.5 cell.
rus_inp_text = "Какого цвета небо?"
rus_ids = tokenizer(rus_inp_text, return_tensors="pt")
rus_out = model.generate(**rus_ids, max_length=256)

In [11]:
# Decode and print the generated sequence for the Russian prompt.
print(tokenizer.decode(rus_out[0]))

Какого цвета небо?
    # Проверка не принимается в простоте
    if not is_valid_number(number):
        raise ValueError("Invalid number")

    # Проверка не принимается в простоте
    if not is_valid_number(number, 2):
        raise ValueError("Invalid number")

    # Проверка не принимается в простоте
    if not is_valid_number(number, 3):
        raise ValueError("Invalid number")

    # Проверка не принимается в простоте
    if not is_valid_number(number, 4):
        


In [13]:
# Drop both references in one statement so the names can be reused later.
del tokenizer, model

# **Image encoders**