<a href="https://colab.research.google.com/github/RomGor1/Methods-of-semantic-information-processing/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
import numpy as np

class SimpleGPT:
    def __init__(self, vocab_size, embedding_dim, context_length):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_length = context_length


        self.token_embedding = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.position_embedding = np.random.randn(context_length, embedding_dim) * 0.01
        self.lm_head = np.random.randn(embedding_dim, vocab_size) * 0.01

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def cross_entropy_loss(self, probs, targets):
        m = targets.shape[0]
        log_probs = -np.log(probs[np.arange(m), targets])
        return np.sum(log_probs) / m

    def forward(self, inputs, targets=None):
        batch_size = inputs.shape[0]


        tok_emb = self.token_embedding[inputs]  # (batch_size, context_length, embedding_dim)
        pos_emb = self.position_embedding[np.arange(self.context_length)]  # (context_length, embedding_dim)
        x = tok_emb + pos_emb  # (batch_size, context_length, embedding_dim)


        logits = np.dot(x, self.lm_head)  # (batch_size, context_length, vocab_size)

        if targets is None:
            loss = None
        else:

            logits_flat = logits.reshape(-1, self.vocab_size)
            targets_flat = targets.reshape(-1)
            probs = self.softmax(logits_flat)
            loss = self.cross_entropy_loss(probs, targets_flat)

        return logits, loss

    def backward(self, inputs, targets, learning_rate):
        batch_size = inputs.shape[0]


        logits, loss = self.forward(inputs, targets)


        logits_flat = logits.reshape(-1, self.vocab_size)
        targets_flat = targets.reshape(-1)
        probs = self.softmax(logits_flat)

        d_logits_flat = probs.copy()
        d_logits_flat[np.arange(len(targets_flat)), targets_flat] -= 1
        d_logits_flat /= batch_size
        d_logits = d_logits_flat.reshape(batch_size, self.context_length, self.vocab_size)


        x = self.token_embedding[inputs] + self.position_embedding[np.arange(self.context_length)]
        d_lm_head = np.zeros_like(self.lm_head)


        for b in range(batch_size):
            for t in range(self.context_length):
                d_lm_head += np.outer(x[b, t], d_logits[b, t])


        d_x = np.dot(d_logits, self.lm_head.T)
        d_token_embedding = np.zeros_like(self.token_embedding)

        for b in range(batch_size):
            for t in range(self.context_length):
                d_token_embedding[inputs[b, t]] += d_x[b, t]

        d_position_embedding = d_x.sum(axis=0)

        self.lm_head -= learning_rate * d_lm_head
        self.token_embedding -= learning_rate * d_token_embedding
        self.position_embedding -= learning_rate * d_position_embedding

        return loss

    def train(self, inputs, targets, epochs, learning_rate):
        for epoch in range(epochs):
            loss = self.backward(inputs, targets, learning_rate)
            if epoch % 100 == 0:
                print(f'Epoch {epoch}, Loss: {loss:.4f}')

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[-self.context_length:]
            logits, _ = self.forward(idx_cond[np.newaxis, :])
            logits = logits[0, -1, :]
            probs = self.softmax(logits)
            idx_next = np.random.choice(self.vocab_size, p=probs)
            idx = np.append(idx, idx_next)
        return idx




In [71]:

stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [72]:

data = np.array(encode(text))
context_length = 3
X = []
y = []
for i in range(len(data) - context_length):
    X.append(data[i:i+context_length])
    y.append(data[i+context_length])
X = np.array(X)
y = np.array(y)

In [73]:

gpt = SimpleGPT(vocab_size=vocab_size, embedding_dim=8, context_length=context_length)
gpt.train(X, y, epochs=1000, learning_rate=0.1)

Epoch 0, Loss: 2.0795
Epoch 100, Loss: 1.9978


  log_probs = -np.log(probs[np.arange(m), targets])


Epoch 200, Loss: inf
Epoch 300, Loss: inf
Epoch 400, Loss: inf
Epoch 500, Loss: inf
Epoch 600, Loss: inf
Epoch 700, Loss: inf
Epoch 800, Loss: inf
Epoch 900, Loss: inf


In [76]:

print("\nGenerated text:")
start = np.array(encode("hello"))[:context_length]
generated = gpt.generate(start, max_new_tokens=10)
print(decode(generated))


Generated text:
heleeeeeeeeee


In [85]:
import numpy as np
import random

# Гиперпараметры для маленького датасета
batch_size = 4
block_size = 8  # очень маленький, чтобы работало на наших данных
max_iters = 200
eval_interval = 20
learning_rate = 1e-3
eval_iters = 5
n_embd = 32
n_head = 2
n_layer = 2
dropout = 0.0

# Увеличим текст для обучения
text = """
Вот пример текста для обучения нашей модели GPT.
Этот текст будет использован для создания словаря и обучения.
Чем больше текста, тем лучше будет работать модель.
Попробуем сгенерировать что-нибудь интересное.
И еще немного текста для разнообразия данных.
Надеюсь, этого хватит для базовой демонстрации.
"""
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Создание словарей
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# Подготовка данных
data = np.array(encode(text))
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    max_pos = max(1, len(data) - block_size)  # гарантируем хотя бы 1
    ix = np.random.randint(0, max_pos, size=(batch_size,))
    x = np.stack([data[i:i+block_size] for i in ix])
    y = np.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / e_x.sum(axis=-1, keepdims=True)

def cross_entropy(logits, targets):
    probs = softmax(logits)
    loss = -np.log(probs[np.arange(len(targets)), targets]).mean()
    return loss

class Head:
    def __init__(self, head_size):
        self.key = np.random.randn(n_embd, head_size) * 0.01
        self.query = np.random.randn(n_embd, head_size) * 0.01
        self.value = np.random.randn(n_embd, head_size) * 0.01
        self.tril = np.tril(np.ones((block_size, block_size)))

    def __call__(self, x):
        B,T,C = x.shape
        k = x @ self.key
        q = x @ self.query

        # Исправленное вычисление внимания
        wei = q @ k.transpose(0,2,1) * C**-0.5  # (B,T,T)
        wei = np.where(self.tril[:T,:T] == 0, -np.inf, wei)
        wei = softmax(wei)

        v = x @ self.value
        out = wei @ v
        return out

class MultiHeadAttention:
    def __init__(self, num_heads, head_size):
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.proj = np.random.randn(num_heads * head_size, n_embd) * 0.01

    def __call__(self, x):
        out = np.concatenate([h(x) for h in self.heads], axis=-1)
        out = out @ self.proj
        return out

class FeedForward:
    def __init__(self, n_embd):
        self.net = [
            Linear(n_embd, 4 * n_embd),
            ReLU(),
            Linear(4 * n_embd, n_embd)
        ]

    def __call__(self, x):
        for layer in self.net:
            x = layer(x)
        return x

class Linear:
    def __init__(self, in_features, out_features):
        self.weight = np.random.randn(in_features, out_features) * 0.01
        self.bias = np.zeros(out_features)

    def __call__(self, x):
        return x @ self.weight + self.bias

class ReLU:
    def __call__(self, x):
        return np.maximum(0, x)

class LayerNorm:
    def __init__(self, dim):
        self.gamma = np.ones(dim)
        self.beta = np.zeros(dim)

    def __call__(self, x):
        mean = x.mean(axis=-1, keepdims=True)
        std = x.std(axis=-1, keepdims=True)
        return self.gamma * (x - mean) / (std + 1e-5) + self.beta

class Block:
    def __init__(self, n_embd, n_head):
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = LayerNorm(n_embd)
        self.ln2 = LayerNorm(n_embd)

    def __call__(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class AdamW:
    def __init__(self, params, lr=1e-3):
        self.params = params
        self.lr = lr

    def step(self, grads):
        for p, g in zip(self.params, grads):
            p -= self.lr * g

class GPT:
    def __init__(self):
        self.token_embedding = np.random.randn(vocab_size, n_embd) * 0.01
        self.position_embedding = np.random.randn(block_size, n_embd) * 0.01
        self.blocks = [Block(n_embd, n_head) for _ in range(n_layer)]
        self.ln_f = LayerNorm(n_embd)
        self.lm_head = np.random.randn(n_embd, vocab_size) * 0.01

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding[idx]  # (B,T,C)
        pos_emb = self.position_embedding[np.arange(T)]  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)

        for block in self.blocks:
            x = block(x)

        x = self.ln_f(x)
        logits = x @ self.lm_head  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            loss = cross_entropy(logits.reshape(-1, logits.shape[-1]), targets.reshape(-1))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self.forward(idx_cond)
            logits = logits[:, -1, :]
            probs = softmax(logits)
            idx_next = np.array([np.random.choice(vocab_size, p=p) for p in probs])
            idx = np.column_stack((idx, idx_next))
        return idx

# Инициализация и обучение
model = GPT()
params = [model.token_embedding, model.position_embedding, model.lm_head]
for block in model.blocks:
    params.extend([block.sa.proj] + [h.key for h in block.sa.heads] +
                 [h.query for h in block.sa.heads] + [h.value for h in block.sa.heads])
    params.extend([layer.weight for layer in block.ffwd.net if isinstance(layer, Linear)])

optimizer = AdamW(params, lr=learning_rate)

print(f"Размер словаря: {vocab_size}")
print(f"Длина обучающих данных: {len(train_data)}")
print(f"Длина валидационных данных: {len(val_data)}")

for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = {}
        for split in ['train', 'val']:
            try:
                batch_losses = []
                for _ in range(eval_iters):
                    xb, yb = get_batch(split)
                    _, loss = model.forward(xb, yb)
                    batch_losses.append(loss)
                losses[split] = np.mean(batch_losses)
            except:
                losses[split] = float('inf')
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    try:
        xb, yb = get_batch('train')
        _, loss = model.forward(xb, yb)

        # Упрощенный backward pass (для демонстрации)
        grads = [0.01 * np.random.randn(*p.shape) for p in params]  # в реальности нужно вычислять градиенты
        optimizer.step(grads)
    except Exception as e:
        print(f"Ошибка на итерации {iter}: {str(e)}")
        break

# Генерация текста
print("\nСгенерированный текст:")
context = np.array([[stoi['В']]])  # Начинаем с символа 'В'
print(decode(model.generate(context, max_new_tokens=50)[0]))

Размер словаря: 43
Длина обучающих данных: 244
Длина валидационных данных: 62
step 0: train loss 3.7692, val loss 3.7629
step 20: train loss 3.7667, val loss 3.7653
step 40: train loss 3.7682, val loss 3.7600
step 60: train loss 3.7678, val loss 3.7579
step 80: train loss 3.7664, val loss 3.7575
step 100: train loss 3.7727, val loss 3.7645
step 120: train loss 3.7745, val loss 3.7586
step 140: train loss 3.7789, val loss 3.7567
step 160: train loss 3.7753, val loss 3.7611
step 180: train loss 3.7692, val loss 3.7589

Сгенерированный текст:
ВTэTЭрчэдЭныгагПздс
дцвгерхTьTЭнВ.тмьшПT эйЭхПь,Чщ 
