<a href="https://colab.research.google.com/github/RomGor1/Methods-of-semantic-information-processing/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import numpy as np

class SimpleGPT:
    def __init__(self, vocab_size, embedding_dim, context_length):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_length = context_length

        # Инициализация параметров
        self.token_embedding = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.position_embedding = np.random.randn(context_length, embedding_dim) * 0.01
        self.lm_head = np.random.randn(embedding_dim, vocab_size) * 0.01

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def cross_entropy_loss(self, probs, targets):
        m = targets.shape[0]
        log_probs = -np.log(probs[np.arange(m), targets])
        return np.sum(log_probs) / m

    def forward(self, inputs, targets=None):
        batch_size = inputs.shape[0]

        # Token and position embeddings
        tok_emb = self.token_embedding[inputs]  # (batch_size, context_length, embedding_dim)
        pos_emb = self.position_embedding[np.arange(self.context_length)]  # (context_length, embedding_dim)
        x = tok_emb + pos_emb  # (batch_size, context_length, embedding_dim)

        # Linear projection
        logits = np.dot(x, self.lm_head)  # (batch_size, context_length, vocab_size)

        if targets is None:
            loss = None
        else:
            # Calculate loss
            logits_flat = logits.reshape(-1, self.vocab_size)
            targets_flat = targets.reshape(-1)
            probs = self.softmax(logits_flat)
            loss = self.cross_entropy_loss(probs, targets_flat)

        return logits, loss

    def backward(self, inputs, targets, learning_rate):
        batch_size = inputs.shape[0]

        # Forward pass
        logits, loss = self.forward(inputs, targets)

        # Gradient of softmax + cross entropy
        logits_flat = logits.reshape(-1, self.vocab_size)
        targets_flat = targets.reshape(-1)
        probs = self.softmax(logits_flat)

        d_logits_flat = probs.copy()
        d_logits_flat[np.arange(len(targets_flat)), targets_flat] -= 1
        d_logits_flat /= batch_size
        d_logits = d_logits_flat.reshape(batch_size, self.context_length, self.vocab_size)

        # Gradient for lm_head
        x = self.token_embedding[inputs] + self.position_embedding[np.arange(self.context_length)]
        d_lm_head = np.zeros_like(self.lm_head)

        # Manually compute the gradient for lm_head to avoid dimension issues
        for b in range(batch_size):
            for t in range(self.context_length):
                d_lm_head += np.outer(x[b, t], d_logits[b, t])

        # Gradient for embeddings
        d_x = np.dot(d_logits, self.lm_head.T)
        d_token_embedding = np.zeros_like(self.token_embedding)

        for b in range(batch_size):
            for t in range(self.context_length):
                d_token_embedding[inputs[b, t]] += d_x[b, t]

        d_position_embedding = d_x.sum(axis=0)

        # Update parameters
        self.lm_head -= learning_rate * d_lm_head
        self.token_embedding -= learning_rate * d_token_embedding
        self.position_embedding -= learning_rate * d_position_embedding

        return loss

    def train(self, inputs, targets, epochs, learning_rate):
        for epoch in range(epochs):
            loss = self.backward(inputs, targets, learning_rate)
            if epoch % 100 == 0:
                print(f'Epoch {epoch}, Loss: {loss:.4f}')

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[-self.context_length:]
            logits, _ = self.forward(idx_cond[np.newaxis, :])
            logits = logits[0, -1, :]
            probs = self.softmax(logits)
            idx_next = np.random.choice(self.vocab_size, p=probs)
            idx = np.append(idx, idx_next)
        return idx




In [33]:
# Example usage
text = "hello world hello hello world"
chars = sorted(list(set(text)))
vocab_size = len(chars)

In [34]:
# Create mappings
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [35]:
# Prepare data
data = np.array(encode(text))
context_length = 3
X = []
y = []
for i in range(len(data) - context_length):
    X.append(data[i:i+context_length])
    y.append(data[i+context_length])
X = np.array(X)
y = np.array(y)

In [36]:
# Create and train model
gpt = SimpleGPT(vocab_size=vocab_size, embedding_dim=8, context_length=context_length)
gpt.train(X, y, epochs=1000, learning_rate=0.1)

Epoch 0, Loss: 2.0795
Epoch 100, Loss: 1.9875
Epoch 200, Loss: inf
Epoch 300, Loss: inf


  log_probs = -np.log(probs[np.arange(m), targets])


Epoch 400, Loss: inf
Epoch 500, Loss: inf
Epoch 600, Loss: inf
Epoch 700, Loss: inf
Epoch 800, Loss: inf
Epoch 900, Loss: inf


In [37]:
# Generate text
print("\nGenerated text:")
start = np.array(encode("hel"))[:context_length]  # Ensure correct length
generated = gpt.generate(start, max_new_tokens=10)
print(decode(generated))


Generated text:
hel          
