<a href="https://colab.research.google.com/github/RomGor1/Methods-of-semantic-information-processing/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [114]:
import numpy as np
import random

# Hyperparameters sized for a very small dataset
batch_size = 4  # number of windows sampled per batch in get_batch
block_size = 8  # context window length; kept very small so it fits this tiny corpus
max_iters = 200  # number of iterations of the training loop
eval_interval = 20  # losses are estimated and printed every eval_interval iterations
learning_rate = 1e-3  # step size handed to the optimizer
eval_iters = 5  # batches averaged per split for each loss estimate
n_embd = 32  # embedding width
n_head = 2  # attention heads per block
n_layer = 2  # number of blocks stacked in the model
dropout = 0.0  # NOTE(review): not referenced by any code visible in this file

# Enlarged training corpus
text = """
Вот пример текста для обучения нашей модели GPT.
Этот текст будет использован для создания словаря и обучения.
Чем больше текста, тем лучше будет работать модель.
Попробуем сгенерировать что-нибудь интересное.
И еще немного текста для разнообразия данных.
И еще
И еще
Еще
Сколько еще нужно?.
"""

# Character-level vocabulary: every distinct character of the corpus, sorted
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables: character -> id and id -> character
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of vocabulary ids for the characters of *s*."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of vocabulary ids *l*."""
    return ''.join(itos[i] for i in l)


# Data preparation: encode the whole corpus, first 80% for training,
# the remainder for validation
data = np.array(encode(text))
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of arrays of shape ``(batch_size, T)`` where each
        row of ``y`` is the corresponding row of ``x`` shifted one position
        forward in the source sequence. ``T`` is ``block_size`` when the
        split is long enough, otherwise it shrinks to ``len(data) - 1`` so
        that inputs and targets always have the same length.

    Raises:
        ValueError: If the selected split holds fewer than two tokens, in
            which case no input/target pair can be formed.
    """
    data = train_data if split == 'train' else val_data
    if len(data) < 2:
        raise ValueError(
            f"split {split!r} has {len(data)} token(s); at least 2 are required"
        )

    # A window needs one extra token after it for the shifted targets, so a
    # split of length L supports windows of at most L - 1 tokens.
    window = min(block_size, len(data) - 1)
    max_pos = len(data) - window  # exclusive upper bound for start offsets
    ix = np.random.randint(0, max_pos, size=(batch_size,))
    x = np.stack([data[i:i + window] for i in ix])
    y = np.stack([data[i + 1:i + window + 1] for i in ix])
    return x, y

def softmax(x):
    """Return the softmax of *x* taken over its last axis.

    The per-row maximum is subtracted before exponentiation so that large
    inputs do not overflow; this does not change the result.
    """
    peak = np.max(x, axis=-1, keepdims=True)
    exps = np.exp(x - peak)
    total = np.sum(exps, axis=-1, keepdims=True)
    return exps / total

def cross_entropy(logits, targets):
    """Return the mean cross-entropy between *logits* and integer *targets*.

    The log-probabilities are computed directly with the log-sum-exp trick
    rather than as ``log(softmax(logits))``; the latter underflows to
    ``log(0) = -inf`` when one logit dominates a row.

    Args:
        logits: Array of shape ``(N, num_classes)`` of unnormalized scores.
        targets: Sequence of ``N`` integer class indices.

    Returns:
        The negative log-likelihood of the target classes, averaged over
        the ``N`` rows.

    Raises:
        ValueError: If the number of rows in *logits* differs from the
            number of targets (previously this silently scored only a
            prefix of the rows).
    """
    logits = np.asarray(logits)
    targets = np.asarray(targets)
    if logits.shape[0] != len(targets):
        raise ValueError(
            f"logits has {logits.shape[0]} rows but {len(targets)} targets were given"
        )

    # log_softmax(z) = (z - max) - log(sum(exp(z - max)))
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=-1, keepdims=True))
    log_probs = shifted - log_norm

    picked = log_probs[np.arange(len(targets)), targets]
    return -picked.mean()



In [115]:
class Head:
    """Single head of causal (lower-triangular masked) self-attention.

    Holds three projection matrices of shape ``(n_embd, head_size)`` for
    keys, queries and values, plus a ``(block_size, block_size)``
    lower-triangular mask.
    """

    def __init__(self, head_size):
        self.key = np.random.randn(n_embd, head_size) * 0.01
        self.query = np.random.randn(n_embd, head_size) * 0.01
        self.value = np.random.randn(n_embd, head_size) * 0.01
        self.tril = np.tril(np.ones((block_size, block_size)))

    def __call__(self, x):
        """Apply masked self-attention to *x* of shape ``(B, T, n_embd)``.

        Returns an array of shape ``(B, T, head_size)``. ``T`` must not
        exceed ``block_size``, the size of the stored mask.
        """
        _, T, _ = x.shape
        k = x @ self.key    # (B, T, head_size)
        q = x @ self.query  # (B, T, head_size)
        v = x @ self.value  # (B, T, head_size)

        # Scaled dot-product attention divides by sqrt(d_k), the key/query
        # dimension (head_size), not by the width of the input embedding.
        scale = k.shape[-1] ** -0.5
        wei = (q @ k.transpose(0, 2, 1)) * scale  # (B, T, T)

        # Positions above the diagonal are the future; -inf makes their
        # softmax weight exactly zero.
        wei = np.where(self.tril[:T, :T] == 0, -np.inf, wei)
        wei = softmax(wei)

        return wei @ v


In [116]:
class MultiHeadAttention:
    """Several attention heads run on the same input, then mixed linearly.

    The head outputs are concatenated along the last axis and multiplied by
    a ``(num_heads * head_size, n_embd)`` projection matrix.
    """

    def __init__(self, num_heads, head_size):
        heads = []
        for _ in range(num_heads):
            heads.append(Head(head_size))
        self.heads = heads

        concat_width = num_heads * head_size
        self.proj = np.random.randn(concat_width, n_embd) * 0.01

    def __call__(self, x):
        """Return the projected concatenation of every head applied to *x*."""
        per_head = [head(x) for head in self.heads]
        stacked = np.concatenate(per_head, axis=-1)
        return stacked @ self.proj

In [117]:
class FeedForward:
    """Position-wise MLP: Linear -> ReLU -> Linear with a 4x wider hidden layer.

    The layers are kept in the list ``self.net`` and applied in order.
    """

    def __init__(self, n_embd):
        hidden = 4 * n_embd
        expand = Linear(n_embd, hidden)
        activation = ReLU()
        contract = Linear(hidden, n_embd)
        self.net = [expand, activation, contract]

    def __call__(self, x):
        """Feed *x* through every layer of ``self.net`` in sequence."""
        out = x
        for transform in self.net:
            out = transform(out)
        return out


In [118]:
class Linear:
    """Affine map ``x @ weight + bias``.

    ``weight`` has shape ``(in_features, out_features)`` and is drawn from
    a standard normal scaled by 0.01; ``bias`` starts at zero.
    """

    def __init__(self, in_features, out_features):
        shape = (in_features, out_features)
        self.weight = np.random.randn(*shape) * 0.01
        self.bias = np.zeros(out_features)

    def __call__(self, x):
        """Apply the affine map to the last axis of *x*."""
        projected = x @ self.weight
        return projected + self.bias


In [119]:
class ReLU:
    """Rectified linear unit: negative entries become zero."""

    def __call__(self, x):
        """Return the element-wise maximum of *x* and zero."""
        floor = 0
        rectified = np.maximum(floor, x)
        return rectified

In [120]:
class LayerNorm:
    """Normalize the last axis to zero mean and unit spread, then scale and shift.

    ``gamma`` (initialized to ones) multiplies and ``beta`` (initialized to
    zeros) is added. The small constant 1e-5 is added to the standard
    deviation itself to avoid division by zero.
    """

    def __init__(self, dim):
        self.gamma = np.ones(dim)
        self.beta = np.zeros(dim)

    def __call__(self, x):
        """Return the normalized, scaled and shifted version of *x*."""
        center = np.mean(x, axis=-1, keepdims=True)
        spread = np.std(x, axis=-1, keepdims=True)
        # Same evaluation order as gamma * (x - mean) / (std + eps) + beta.
        scaled = self.gamma * (x - center)
        normalized = scaled / (spread + 1e-5)
        return normalized + self.beta


In [121]:
class Block:
    """Transformer block: pre-norm self-attention then pre-norm feed-forward.

    Both sub-layers are wrapped in a residual connection. Each attention
    head gets ``n_embd // n_head`` channels.
    """

    def __init__(self, n_embd, n_head):
        self.ln1 = LayerNorm(n_embd)
        self.ln2 = LayerNorm(n_embd)
        per_head = n_embd // n_head
        # Attention is built before the feed-forward net so that random
        # weights are drawn in the same order as before.
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedForward(n_embd)

    def __call__(self, x):
        """Return *x* after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class AdamW:
    """Adam optimizer with decoupled weight decay, updating arrays in place.

    The previous implementation performed a plain ``p -= lr * g`` step
    despite its name; this one keeps running first/second moment estimates
    per parameter, applies bias correction, and decays the weights
    independently of the gradient.

    Args:
        params: List of NumPy arrays to optimize. They are modified in place.
        lr: Learning rate.
        betas: Decay rates for the first and second moment estimates.
        eps: Constant added to the denominator for numerical stability.
        weight_decay: Decoupled weight-decay coefficient; ``0`` disables it.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0.01):
        self.params = params
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.t = 0  # number of steps taken so far, used for bias correction
        self.m = [np.zeros_like(p, dtype=float) for p in params]
        self.v = [np.zeros_like(p, dtype=float) for p in params]

    def step(self, grads):
        """Apply one update using *grads*, one gradient per parameter.

        Raises:
            ValueError: If the number of gradients differs from the number
                of parameters (a bare ``zip`` would silently skip some).
        """
        grads = list(grads)
        if len(grads) != len(self.params):
            raise ValueError(
                f"expected {len(self.params)} gradients, got {len(grads)}"
            )

        self.t += 1
        bias1 = 1.0 - self.beta1 ** self.t
        bias2 = 1.0 - self.beta2 ** self.t

        for p, g, m, v in zip(self.params, grads, self.m, self.v):
            # Exponential moving averages of the gradient and its square.
            m *= self.beta1
            m += (1.0 - self.beta1) * g
            v *= self.beta2
            v += (1.0 - self.beta2) * np.square(g)

            # Decoupled weight decay: shrink the weights directly instead of
            # adding an L2 term to the gradient.
            if self.weight_decay:
                p *= 1.0 - self.lr * self.weight_decay

            p -= self.lr * (m / bias1) / (np.sqrt(v / bias2) + self.eps)


In [122]:
class GPT:
    """Small decoder-only transformer language model built on NumPy arrays.

    Token and position embeddings are summed, passed through ``n_layer``
    blocks and a final layer norm, then projected to vocabulary logits.
    """

    def __init__(self):
        emb_scale = 0.01
        self.token_embedding = np.random.randn(vocab_size, n_embd) * emb_scale
        self.position_embedding = np.random.randn(block_size, n_embd) * emb_scale
        self.blocks = [Block(n_embd, n_head) for _ in range(n_layer)]
        self.ln_f = LayerNorm(n_embd)
        self.lm_head = np.random.randn(n_embd, vocab_size) * emb_scale

    def forward(self, idx, targets=None):
        """Compute logits for the token ids *idx* and, optionally, the loss.

        Args:
            idx: Integer array of shape ``(B, T)``.
            targets: Optional array with one target id per position of *idx*.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None`` when *targets*
            is not given, otherwise the cross-entropy over all positions.
        """
        _, seq_len = idx.shape

        positions = np.arange(seq_len)
        hidden = self.token_embedding[idx] + self.position_embedding[positions]  # (B,T,C)

        for layer in self.blocks:
            hidden = layer(hidden)

        logits = self.ln_f(hidden) @ self.lm_head  # (B,T,vocab_size)

        if targets is None:
            return logits, None

        # Flatten batch and time so every position is one classification row.
        flat_logits = logits.reshape(-1, logits.shape[-1])
        flat_targets = targets.reshape(-1)
        return logits, cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Extend *idx* of shape ``(B, T)`` by *max_new_tokens* sampled ids.

        At each step only the last ``block_size`` tokens are fed to the
        model, and the next id of every row is sampled from the softmax of
        the final position's logits.
        """
        produced = 0
        while produced < max_new_tokens:
            window = idx[:, -block_size:]
            logits, _ = self.forward(window)
            last_step = logits[:, -1, :]
            distribution = softmax(last_step)
            sampled = [np.random.choice(vocab_size, p=row) for row in distribution]
            next_column = np.array(sampled)[:, None]
            idx = np.concatenate((idx, next_column), axis=1)
            produced += 1
        return idx

In [123]:
# Initialization and training
model = GPT()

# Collect every learnable array so the optimizer can update it in place.
# Besides the weight matrices this includes the Linear biases and the
# LayerNorm gamma/beta vectors, which were previously left out and would
# therefore never have been trained.
params = [
    model.token_embedding,
    model.position_embedding,
    model.lm_head,
    model.ln_f.gamma,
    model.ln_f.beta,
]
for block in model.blocks:
    params.append(block.sa.proj)
    params.extend(h.key for h in block.sa.heads)
    params.extend(h.query for h in block.sa.heads)
    params.extend(h.value for h in block.sa.heads)
    for layer in block.ffwd.net:
        # ReLU has no parameters; only Linear layers contribute.
        if isinstance(layer, Linear):
            params.extend([layer.weight, layer.bias])
    for norm in (block.ln1, block.ln2):
        params.extend([norm.gamma, norm.beta])

optimizer = AdamW(params, lr=learning_rate)

print(f"Размер словаря: {vocab_size}")
print(f"Длина обучающих данных: {len(train_data)}")
print(f"Длина валидационных данных: {len(val_data)}")

Размер словаря: 43
Длина обучающих данных: 236
Длина валидационных данных: 60


In [124]:
# Training loop. The loop variable is named `step` so the builtin `iter`
# is not shadowed for the rest of the session.
for step in range(max_iters):
    # Every eval_interval iterations, estimate the loss on both splits by
    # averaging eval_iters random batches.
    if step % eval_interval == 0:
        losses = {}
        for split in ['train', 'val']:
            try:
                batch_losses = []
                for _ in range(eval_iters):
                    xb, yb = get_batch(split)
                    _, loss = model.forward(xb, yb)
                    batch_losses.append(loss)
                losses[split] = np.mean(batch_losses)
            except Exception as e:
                # Best effort: report the failure and show the split as inf
                # instead of hiding the cause. Catching Exception (not a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                print(f"Ошибка при оценке '{split}' на итерации {step}: {e}")
                losses[split] = float('inf')
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    try:
        xb, yb = get_batch('train')
        _, loss = model.forward(xb, yb)

        # Simplified backward pass (for demonstration only): the "gradients"
        # are random noise shaped like the parameters, so the loss is not
        # expected to decrease. A real implementation must compute the
        # gradients of `loss` with respect to each parameter.
        grads = [0.01 * np.random.randn(*p.shape) for p in params]
        optimizer.step(grads)
    except Exception as e:
        print(f"Ошибка на итерации {step}: {e}")
        break

step 0: train loss 3.7676, val loss 3.7441
step 20: train loss 3.7645, val loss 3.7411
step 40: train loss 3.7557, val loss 3.7425
step 60: train loss 3.7693, val loss 3.7493
step 80: train loss 3.7672, val loss 3.7420
step 100: train loss 3.7625, val loss 3.7418
step 120: train loss 3.7656, val loss 3.7384
step 140: train loss 3.7592, val loss 3.7496
step 160: train loss 3.7588, val loss 3.7541
step 180: train loss 3.7593, val loss 3.7380


In [125]:
# Text generation
print("\nСгенерированный текст:")
start_id = stoi['В']  # the context starts with the single character 'В'
context = np.array([[start_id]])
sampled = model.generate(context, max_new_tokens=50)
print(decode(sampled[0]))


Сгенерированный текст:
Вуй.
бPаж,и.,ИчаСз
И
п.бнзтжяпнлВ?сщ-мххпв?.пЕВщзхн
