<a href="https://colab.research.google.com/github/SavageGinny/MLP-Jupiters/blob/main/lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Загружаем библиотеки

In [1]:
import numpy as np
import pandas as pd
import re
import random
import math
from collections import Counter

**Обработчик текста**

In [2]:
class TextProcessor:
    def __init__(self):
        self.vocab = {}
        self.word2idx = {}
        self.idx2word = {}

    def tokenize(self, text):
        text = text.lower()
        tokens = re.findall(r'\b\w+\b', text, re.UNICODE)
        return tokens

    def lemmatize(self, tokens):
        # Простая псевдо-лемматизация (можно заменить словарем)
        lemmas = [re.sub(r'(ами|ами|ами|ов|ев|ий|ый|ой|ого|ому|ым|ах|ях|е|у|ю|а|о|ы|и|я)$', '', word) for word in tokens]
        return lemmas

    def build_vocab(self, tokens):
        self.vocab = Counter(tokens)
        self.word2idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx2word = {i: word for word, i in self.word2idx.items()}

    def encode(self, tokens):
        return [self.word2idx[word] for word in tokens if word in self.word2idx]

    def decode(self, indices):
        return [self.idx2word[i] for i in indices]



**GPT нейросеть**

In [3]:
class SimpleGPT:
    def __init__(self, vocab_size, hidden_dim):
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.W1 = np.random.randn(vocab_size, hidden_dim) * 0.01
        self.W2 = np.random.randn(hidden_dim, vocab_size) * 0.01

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def train(self, X, y, epochs=100, lr=0.01):
        for epoch in range(epochs):
            loss = 0
            for context, target in zip(X, y):
                x_vec = np.sum(self.W1[context], axis=0)
                h = x_vec / len(context)
                u = np.dot(h, self.W2)
                y_pred = self.softmax(u)

                loss += -np.log(y_pred[target])

                # Градиенты
                e = y_pred
                e[target] -= 1

                dW2 = np.outer(h, e)
                dW1 = np.zeros_like(self.W1)
                for idx in context:
                    dW1[idx] += np.dot(self.W2, e) / len(context)

                # Обновление весов
                self.W1 -= lr * dW1
                self.W2 -= lr * dW2

            if epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {loss}")

    def predict(self, context):
        x_vec = np.sum(self.W1[context], axis=0)
        h = x_vec / len(context)
        u = np.dot(h, self.W2)
        y_pred = self.softmax(u)
        return np.argmax(y_pred)

    def generate(self, start_context, n_words=10):
        generated = start_context[:]
        for _ in range(n_words):
            idxs = self.encode(generated[-2:])  # bigram context
            if len(idxs) < 2:
                break
            next_idx = self.predict(idxs)
            generated.append(self.idx2word[next_idx])
        return ' '.join(generated)

    def encode(self, tokens):
        return [self.word2idx.get(t, 0) for t in tokens]

**Загрузка датасета**

In [7]:
with open("naruto_full_dataset.txt", encoding='utf-8') as f:
    text = f.read()

**Подготовка данных**


In [8]:
tp = TextProcessor()
tokens = tp.tokenize(text)
lemmas = tp.lemmatize(tokens)
tp.build_vocab(lemmas)

window_size = 2
data = []
for i in range(window_size, len(lemmas) - window_size):
    context = lemmas[i - window_size:i] + lemmas[i + 1:i + window_size + 1]
    target = lemmas[i]
    data.append((context, target))


X = [tp.encode(context) for context, target in data]
y = [tp.word2idx[target] for context, target in data]

**Обучение модели**

In [29]:
gpt = SimpleGPT(vocab_size=len(tp.vocab), hidden_dim=50)
gpt.word2idx = tp.word2idx
gpt.idx2word = tp.idx2word
gpt.train(X, y, epochs=500, lr=0.1)

Epoch 0, Loss: 11108.092303630261
Epoch 10, Loss: 2162.191874176721
Epoch 20, Loss: 125.99707077306324
Epoch 30, Loss: 46.20133178103384
Epoch 40, Loss: 26.985535773563473
Epoch 50, Loss: 18.707313462763402
Epoch 60, Loss: 14.172267630043033
Epoch 70, Loss: 11.335368763444858
Epoch 80, Loss: 9.404287857531752
Epoch 90, Loss: 8.01042730232998
Epoch 100, Loss: 6.9600241524079705
Epoch 110, Loss: 6.141864463320907
Epoch 120, Loss: 5.487729684960028
Epoch 130, Loss: 4.953537829603535
Epoch 140, Loss: 4.509583149350543
Epoch 150, Loss: 4.13514406141754
Epoch 160, Loss: 3.8153413682507926
Epoch 170, Loss: 3.539224664673514
Epoch 180, Loss: 3.298562257018554
Epoch 190, Loss: 3.087050938519558
Epoch 200, Loss: 2.8997853420542588
Epoch 210, Loss: 2.7328927851962557
Epoch 220, Loss: 2.583276510616129
Epoch 230, Loss: 2.4484316415781566
Epoch 240, Loss: 2.3263109683301444
Epoch 250, Loss: 2.215225543038686
Epoch 260, Loss: 2.1137700135125868
Epoch 270, Loss: 2.0207658169413554
Epoch 280, Loss: 1.

**Запуск**

In [34]:
user_input = input("Введите начальные 1-2 слова: ").strip().lower()
start_tokens = tp.tokenize(user_input)
start_lemmas = tp.lemmatize(start_tokens)
start_known = [lemma for lemma in start_lemmas if lemma in tp.word2idx]

if len(start_known) < 2:
    print("Недостаточно известных слов в вводе. Пожалуйста, введите минимум два известных слова.")
else:
    print("Сгенерированный текст:")
    print(gpt.generate(start_known, n_words=15))

Введите начальные 1-2 слова: Чакра — это энергия
Сгенерированный текст:
чакр эт энерги чакр природ тр чакр природ тр чакр природ тр чакр природ тр чакр природ тр
