# Загрузка датасета 

Загрузим датасет №5 с Wikibooks ([датасет](https://www.kaggle.com/datasets/dhruvildave/wikibooks-dataset))

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [2]:
!pip install tensorflow==2.12.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install kaggle
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!echo '{"username":"midlow","key":"19e4a7b3c26e4d040a5179c6b36318cd"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/wikibooks-dataset
!unzip wikibooks-dataset.zip
!rm wikibooks-dataset.zip
!rm *.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
rm: cannot remove '/root/.kaggle': No such file or directory
Downloading wikibooks-dataset.zip to /content
 99% 1.81G/1.82G [00:17<00:00, 116MB/s]
100% 1.82G/1.82G [00:17<00:00, 114MB/s]
Archive:  wikibooks-dataset.zip
  inflating: wikibooks.sqlite        
rm: cannot remove '*.zip': No such file or directory


Возьмём из базы англоязычные книги.

In [18]:
import sqlite3

# Read the English article bodies from the Wikibooks dump. The connection is
# closed in `finally` so the database file is released even if the query fails.
con = sqlite3.connect("wikibooks.sqlite")
try:
    cur = con.cursor()
    res = cur.execute("SELECT body_text FROM en")
    # Preview one row. fetchone() consumes it, so this row is not part of `data`.
    print(res.fetchone())
    data = res.fetchall()
finally:
    con.close()

# Source for the character-level model: the first 2000 articles concatenated.
# str.join builds the string in one pass instead of `text += ...` in a loop.
text = "".join(row[0] for row in data[:2000])
# Source for the word-level model: one string per article.
corpus = [row[0] for row in data]
# Drop the list of row tuples; only `text` and `corpus` are needed from here on.
del data

('Front Page: Radiation Oncology | RTOG Trials | Randomized Trials\n\n\n\n\nNon-Hodgkin lymphoma: Main Page  | Randomized\nOverview: Overview  | \nFollicular |\nDiffuse large B-cell |\nMALT |\nNodal marginal zone |\nMantle cell |\nCLL/SLL |\nLymphoblastic |\nBurkitt |\nNK/T cell |\nAnaplastic large cell |\nPrimary CNS Lymphoma\nTreatment:\nAggressive |\nSpecific sites |\nRadioimmunotherapy\n\n\n\nChronic Lymphocytic Leukemia and Small Lymphocytic Lymphoma (CLL/SLL)\n\n\nContents\n\n1 Overview\n2 Staging\n3 Classification\n4 Richter\'s transformation\n5 Radiation Therapy\n6 Reviews\n\n\n\nOverview[edit\xa0| edit source]\nCLL is the most common leukemia among adults in Western world\nIt is characterized by accumulation of mature B-cells\nCLL molecular phenotype: CD5+, CD23+, surface Ig weak, CD79b weak/absent, FMC7 neg.\nDiagnosis: lymphocytosis (often >5 x 10^9 / L, but not an absolute cutoff)\nRisk factors are male sex, advanced age, white race, and family history of CLL or lymphoproli

# Обучение модели с символьной токенизацией

In [4]:
text = text[:2000]  # keep only the first 2000 characters
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
# The Tokenizer API expects a list of texts. Passing the bare string made it
# treat every single character as a separate document, which only worked
# because iterating over a str yields its characters.
tokenizer.fit_on_texts([text])

# Flat list with one integer id per character of `text`.
char_ids = tokenizer.texts_to_sequences([text])[0]

# Next-character prediction pairs: input is character t, target is character
# t+1. Both arrays have shape (n_chars - 1, 1), i.e. the sequence length is 1,
# so the recurrent model trained on them only ever sees the current character.
input_data = np.array(char_ids[:-1]).reshape(-1, 1)
target_data = np.array(char_ids[1:]).reshape(-1, 1)

In [6]:
# Number of output classes: one per character known to the tokenizer plus
# index 0, which Keras reserves and never assigns to a token.
char_vocab_size = len(tokenizer.word_index) + 1
# Number of time steps per training sample, taken from the prepared data.
window_len = input_data.shape[1]

# Embedding -> SimpleRNN -> per-timestep softmax over the character vocabulary.
char_model = tf.keras.models.Sequential()
char_model.add(Embedding(input_dim=char_vocab_size, output_dim=10, input_length=window_len))
char_model.add(SimpleRNN(units=32, return_sequences=True))
char_model.add(TimeDistributed(Dense(units=char_vocab_size, activation='softmax')))

# Targets are integer ids, hence the sparse variant of the crossentropy loss.
char_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
char_model.fit(input_data, target_data, epochs=10, verbose=0)

<keras.callbacks.History at 0x7f912d1fc190>

In [12]:
# Greedy character-by-character generation starting from a one-letter seed.
generated_text = 'h'
window_len = input_data.shape[1]  # number of time steps the model was built for
for _ in range(5):
    # Encode the whole string as ONE text to get a flat list of character ids
    # (passing the bare string would yield one nested list per character).
    char_ids = tokenizer.texts_to_sequences([generated_text])[0]
    # Keep the most recent `window_len` ids and left-pad shorter inputs with 0,
    # so the last time step always holds the newest character. The result has
    # the (1, window_len) shape the model expects.
    model_input = pad_sequences([char_ids], maxlen=window_len, padding='pre', truncating='pre')
    prediction = char_model.predict(model_input, verbose=0)
    # Most probable class at the last time step.
    next_token = int(np.argmax(prediction[0][-1]))
    # Index 0 is the padding class and has no entry in index_word; stop instead
    # of raising KeyError.
    if next_token not in tokenizer.index_word:
        break
    generated_text += tokenizer.index_word[next_token]

print(generated_text)

he te 


# Обучение модели с пословной токенизацией

In [21]:
import re

raw_corpus = corpus[:5]  # only the first 5 texts are used
# Replace every character that is not an ASCII letter, a digit or a space
# (including newlines) with a space.
corpus = [re.sub(r'[^a-zA-Z0-9 ]', ' ', doc) for doc in raw_corpus]

# NOTE: this rebinds `tokenizer`, replacing the character-level tokenizer
# created earlier in the notebook.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = 1 + len(tokenizer.word_index)

# For each text, collect every prefix of its token list that has at least two
# tokens: the last token of a prefix is the label, the rest is the context.
input_sequences = []
for doc in corpus:
    doc_tokens = tokenizer.texts_to_sequences([doc])[0]
    input_sequences.extend(doc_tokens[:end] for end in range(2, len(doc_tokens) + 1))

# Left-pad all prefixes with zeros to the length of the longest one.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
# Split each row into the context (all but the last id) and the label (last id).
predictors = input_sequences[:, :-1]
label = input_sequences[:, -1]

In [22]:
# Embedding -> SimpleRNN -> softmax over the word vocabulary; the model
# predicts the word that follows a left-padded context of word ids.
word_model = tf.keras.Sequential()
word_model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
word_model.add(SimpleRNN(100))
word_model.add(Dense(total_words, activation='softmax'))

# The labels are integer word ids, so the sparse crossentropy is used directly.
# It computes the same loss as categorical crossentropy on one-hot targets but
# avoids materialising a dense (n_samples, total_words) one-hot matrix.
word_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
word_model.fit(predictors, label, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efb88fa34c0>

In [26]:
# Greedy word-by-word continuation of a seed phrase.
seed_text = "The professor wants to"
next_words = 1

for _ in range(next_words):
    # Words of the seed that are unknown to the tokenizer are dropped here.
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad (or truncate from the left) to the context length used in training.
    encoded = pad_sequences([encoded], maxlen=max_sequence_len-1, padding='pre')
    prob_distribution = word_model.predict(encoded, verbose=0)[0]
    prediction = int(np.argmax(prob_distribution))
    # index_word is the tokenizer's id -> word mapping; a direct lookup replaces
    # the linear scan over word_index.
    output_word = tokenizer.index_word.get(prediction)
    if output_word is None:
        # Index 0 (padding) has no word; stop rather than append an empty token.
        break
    seed_text += " " + output_word

print(seed_text)

The professor wants to edit
