<a href="https://colab.research.google.com/github/SavageGinny/MLP-Jupiters/blob/main/lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Загружаем библиотеки

In [20]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt

In [25]:
!pip install typing-extensions



In [29]:
import inspect
if not hasattr(inspect, 'getargspec'):
    import collections
    def getargspec(func):
        sig = inspect.signature(func)
        args = [
            p.name for p in sig.parameters.values()
            if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
        ]
        varargs = None
        varkw = None
        defaults = tuple(
            p.default for p in sig.parameters.values()
            if p.default is not p.empty
        ) or None
        return collections.namedtuple('ArgSpec', 'args varargs keywords defaults')(
            args, varargs, varkw, defaults
        )
    inspect.getargspec = getargspec

In [30]:
import numpy as np
import pandas as pd
import re
import random
import pymorphy2
import math
from collections import Counter

**Обработчик текста**

In [31]:
class TextProcessor:
    def __init__(self):
        self.vocab = {}
        self.word2idx = {}
        self.idx2word = {}
        self.morph = pymorphy2.MorphAnalyzer()

    def tokenize(self, text):
        text = text.lower()
        text = re.sub(r'[!?.,:;\-—"“”\(\)\[\]{}<>«»]', '', text)
        tokens = re.findall(r'\b\w+\b', text, re.UNICODE)
        return tokens

    def lemmatize(self, tokens):
        return [self.morph.parse(word)[0].normal_form for word in tokens]

    def build_vocab(self, tokens):
        self.vocab = Counter(tokens)
        self.word2idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx2word = {i: word for word, i in self.word2idx.items()}

    def encode(self, tokens):
        return [self.word2idx[word] for word in tokens if word in self.word2idx]

    def decode(self, indices):
        return [self.idx2word[i] for i in indices]



**GPT нейросеть**

In [47]:
class SimpleGPT:
    def __init__(self, vocab_size, hidden_dim):
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.U = np.random.randn(hidden_dim, vocab_size) * 0.01  # input -> hidden
        self.W = np.random.randn(hidden_dim, hidden_dim) * 0.01  # hidden -> hidden
        self.V = np.random.randn(vocab_size, hidden_dim) * 0.01  # hidden -> output

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / np.sum(e_x)

    def forward(self, inputs):
        h = np.zeros((len(inputs) + 1, self.hidden_dim))
        for t in range(len(inputs)):
            x_t = np.zeros(self.vocab_size)
            x_t[inputs[t]] = 1
            h[t + 1] = np.tanh(np.dot(self.U, x_t) + np.dot(self.W, h[t]))
        y = np.dot(self.V, h[len(inputs)])
        return y, h

    def train(self, sequences, targets, epochs=10, lr=0.01):
        for epoch in range(epochs):
            loss = 0
            for seq, target in zip(sequences, targets):
                y_pred, h = self.forward(seq)
                probs = self.softmax(y_pred)
                loss += -np.log(probs[target] + 1e-8)

                dV = np.outer(probs, h[-1])
                dV[target] -= h[-1]

                dh = np.dot(self.V.T, probs)
                for t in reversed(range(len(seq))):
                    dtanh = (1 - h[t + 1] ** 2) * dh
                    x_t = np.zeros(self.vocab_size)
                    x_t[seq[t]] = 1
                    dU = np.outer(dtanh, x_t)
                    dW = np.outer(dtanh, h[t])

                    self.U -= lr * dU
                    self.W -= lr * dW

                self.V -= lr * dV

            print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict(self, seq):
        y, _ = self.forward(seq)
        probs = self.softmax(y)
        return np.random.choice(len(probs), p=probs)

    def generate(self, start_tokens, n_words, word2idx, idx2word):
        result = start_tokens[:]
        seq = [word2idx.get(w, 0) for w in result]
        for _ in range(n_words):
            next_idx = self.predict(seq)
            result.append(idx2word[next_idx])
            seq.append(next_idx)
        return ' '.join(result)

**Загрузка датасета**

In [33]:
with open("dataset.txt", encoding='utf-8') as f:
    text = f.read()

**Подготовка данных**


In [42]:
processor = TextProcessor()
tokens = processor.tokenize(text)
lemmas = processor.lemmatize(tokens)
processor.build_vocab(lemmas)

seq_len = 5
X, y = [], []
encoded = processor.encode(lemmas)
for i in range(len(encoded) - seq_len):
    X.append(encoded[i:i+seq_len])
    y.append(encoded[i+seq_len])

**Обучение модели**

In [48]:
gpt = SimpleGPT(vocab_size=len(processor.vocab), hidden_dim=50)
gpt.train(X, y, epochs=10, lr=0.01)

Epoch 0, Loss: 11002.4103
Epoch 1, Loss: 11001.8999
Epoch 2, Loss: 11001.3780
Epoch 3, Loss: 11000.8345
Epoch 4, Loss: 11000.2590
Epoch 5, Loss: 10999.6407
Epoch 6, Loss: 10998.9687
Epoch 7, Loss: 10998.2313
Epoch 8, Loss: 10997.4164
Epoch 9, Loss: 10996.5108


**Запуск**

In [54]:
user_input = input("Введите начальные 1-2 слова: ").strip().lower()
start_tokens = tp.tokenize(user_input)
start_lemmas = tp.lemmatize(start_tokens)
start_known = [lemma for lemma in start_lemmas if lemma in tp.word2idx]

if len(start_known) < 2:
    print("Недостаточно известных слов в вводе. Пожалуйста, введите минимум два известных слова.")
else:
    print("Сгенерированный текст:")
    print(gpt.generate(start_known, n_words=random.randint(2, 5), word2idx=processor.word2idx, idx2word=processor.idx2word))

Введите начальные 1-2 слова: фуиндзюцу чакра
Сгенерированный текст:
фуиндзюца чакра война образование присущий для использовать
