## Загрузка датасета и его очистка

In [1]:
!pip install kaggle
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!echo '{"username":"midlow","key":"19e4a7b3c26e4d040a5179c6b36318cd"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/wikibooks-dataset
!unzip wikibooks-dataset.zip
!rm wikibooks-dataset.zip
!rm *.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
rm: cannot remove '/root/.kaggle': No such file or directory
Downloading wikibooks-dataset.zip to /content
100% 1.82G/1.82G [01:20<00:00, 25.2MB/s]
100% 1.82G/1.82G [01:20<00:00, 24.2MB/s]
Archive:  wikibooks-dataset.zip
  inflating: wikibooks.sqlite        
rm: cannot remove '*.zip': No such file or directory


In [2]:
!pip install keras-nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-nlp
  Downloading keras_nlp-0.5.2-py3-none-any.whl (527 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.7/527.7 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-text (from keras-nlp)
  Downloading tensorflow_text-2.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m108.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text, keras-nlp
Successfully installed keras-nlp-0.5.2 tensorflow-text-2.12.1


In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

In [40]:
import sqlite3

con = sqlite3.connect("wikibooks.sqlite")
df = pd.read_sql_query("SELECT body_text FROM en limit 10000", con)
df.head()

Unnamed: 0,body_text
0,Front Page: Radiation Oncology | RTOG Trials |...
1,Băuturi/Beverages[edit | edit source]\nTea : C...
2,Karrigell is an open Source Python web framewo...
3,setupUnitPanel[edit | edit source]\nHelper fun...
4,Contents\n\n1 The Concept\n2 The System\n3 The...


In [41]:
def preprocess_text(s):
    s = str(s)
    s = re.sub(r'[^a-zA-Z0-9 ]', ' ', s)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W,\s',' ',s)
    s = re.sub("\d+", "", s)
    s = re.sub('\s+',' ',s)
    s = re.sub('[!@#$_]', '', s)
    s = s.replace("co","")
    s = s.replace("https","")
    s = s.replace("[\w*"," ")
    return s

In [42]:
bodies = df['body_text'].values
bodies = list(map(preprocess_text, bodies))

In [43]:
df['body_text'] = pd.DataFrame(bodies)
del bodies
df.head()

Unnamed: 0,body_text
0,Front Page Radiation Onlogy RTOG Trials Random...
1,B uturi Beverages edit edit source Tea Ceai Mi...
2,Karrigell is an open Source Python web framewo...
3,setupUnitPanel edit edit source Helper functio...
4,Contents The Concept The System The Data LMI E...


In [44]:
ds_train, ds_test = train_test_split(df['body_text'], test_size=0.2, random_state=42)

In [45]:
print(f"Length of train dataset = {len(ds_train)}")
print(f"Length of test dataset = {len(ds_test)}")

Length of train dataset = 8000
Length of test dataset = 2000


## Создание и обучение модели

In [46]:
import os
import keras_nlp
import tensorflow as tf
from tensorflow import keras

In [83]:
BATCH_SIZE = 128
SEQ_LEN = 128
MIN_TRAINING_SEQ_LEN = 450
EMBED_DIM = 256
FEED_FORWARD_DIM = 256
NUM_HEADS = 3
NUM_LAYERS = 2
VOCAB_SIZE = 5000
EPOCHS = 50
NUM_TOKENS_TO_GENERATE = 80

In [65]:
raw_train_ds = (
    tf.data.Dataset.from_tensor_slices(ds_train.values)
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=256)
)

raw_val_ds = (
    tf.data.Dataset.from_tensor_slices(ds_test.values)
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(BATCH_SIZE)
)

In [66]:
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    vocabulary_size=VOCAB_SIZE,
    lowercase=True,
    reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],
)

In [59]:
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)

In [85]:
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=tokenizer.token_to_id("[BOS]"),
)


def preprocess(inputs):
    outputs = tokenizer(inputs)
    features = start_packer(outputs)
    labels = outputs
    return features, labels


train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch(
    tf.data.AUTOTUNE
)
val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch(
    tf.data.AUTOTUNE
)

In [86]:
inputs = keras.layers.Input(shape=(None,), dtype=tf.int32)
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)
for _ in range(NUM_LAYERS):
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer="adam", loss=loss_fn, metrics=['acc'])

In [87]:
model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 256)        1312768   
 g_5 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_decoder_13 (Tra  (None, None, 256)        394749    
 nsformerDecoder)                                                
                                                                 
 transformer_decoder_14 (Tra  (None, None, 256)        394749    
 nsformerDecoder)                                                
                                                                 
 dense_5 (Dense)             (None, None, 5000)        1285

In [88]:
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, verbose=2)

Epoch 1/50
98/98 - 33s - loss: 7.2174 - acc: 0.0354 - val_loss: 7.1251 - val_acc: 0.0359 - 33s/epoch - 334ms/step
Epoch 2/50
98/98 - 9s - loss: 6.8440 - acc: 0.0632 - val_loss: 6.1579 - val_acc: 0.1211 - 9s/epoch - 89ms/step
Epoch 3/50
98/98 - 10s - loss: 5.7062 - acc: 0.1473 - val_loss: 5.5261 - val_acc: 0.1609 - 10s/epoch - 99ms/step
Epoch 4/50
98/98 - 10s - loss: 5.1686 - acc: 0.1755 - val_loss: 5.3077 - val_acc: 0.1760 - 10s/epoch - 100ms/step
Epoch 5/50
98/98 - 10s - loss: 4.8435 - acc: 0.1963 - val_loss: 5.1957 - val_acc: 0.1883 - 10s/epoch - 97ms/step
Epoch 6/50
98/98 - 9s - loss: 4.5838 - acc: 0.2182 - val_loss: 5.1313 - val_acc: 0.1990 - 9s/epoch - 95ms/step
Epoch 7/50
98/98 - 9s - loss: 4.3601 - acc: 0.2392 - val_loss: 5.1049 - val_acc: 0.2065 - 9s/epoch - 89ms/step
Epoch 8/50
98/98 - 10s - loss: 4.1577 - acc: 0.2595 - val_loss: 5.1010 - val_acc: 0.2119 - 10s/epoch - 102ms/step
Epoch 9/50
98/98 - 10s - loss: 3.9731 - acc: 0.2802 - val_loss: 5.1267 - val_acc: 0.2153 - 10s/epoc

<keras.callbacks.History at 0x7f4caacfe410>

## Генерация текстов

In [91]:
prompt_tokens = start_packer(tokenizer([""]))
prompt_tokens

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>

In [92]:
def next(prompt, cache, index):
    logits = model(prompt)[:, index - 1, :]
    hidden_states = None
    return logits, hidden_states, cache

In [93]:
sampler = keras_nlp.samplers.GreedySampler()
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Greedy search generated text: \n{txt}\n")

Greedy search generated text: 
[b'[BOS] contents introduction history case studies areas index for hallo study for nas contents nas of ass and honorary devices for ho ho hoint three processing administrations overview note ho study questions motoroliethia to prohibites verbs indexybrotecasts bookiethroontine star constructional convention on edit edit source county contents edit edit source wikimedia objective of additional steps of additional information age procomics is a list of c programming languages abstract derived from cystic fa hpheritance majority of hous language skills largely warrion']



In [81]:
sampler = keras_nlp.samplers.BeamSampler(num_beams=5)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Beam search generated text: \n{txt}\n")

Beam search generated text: 
[b'[BOS] this page was imported and needs to be de wikified books should use wikilinks rather sparsely and only to reference technical or esoteric terms that are critical to understanding the ntent most if not all wikilinks should simply be removed please remove dewikify after the page is dewikified xmcd infinity of mass spectrospy depr nmr ray crystal condent solid ray crystallog ray crystallog ray crystallograp high spectangle mass spectrometry catal spectrospy debit romeign quanthelm ray beam buozz']



In [94]:
sampler = keras_nlp.samplers.RandomSampler()
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Random search generated text: \n{txt}\n")

Random search generated text: 
[b'[BOS] a reader requests expansion of this book to include more material you can help by adding new material learn how or ask for assistance in the reading room contents from computer science nouns predas and jennedance g references introduction edit edit source dvd refers to the ext is a device that a good question device to refer to the role quint phenyology bus the federal system analyslogists in national networkingiorating the federal software regarding the njunction status of status when operating system the teacher the money of others before describing different military lectures and methodifies or fashion the real authorized julizing the']



In [95]:
sampler = keras_nlp.samplers.TopKSampler(k=5)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-K search generated text: \n{txt}\n")

Top-K search generated text: 
[b'[BOS] contents introduction what is social resources how you can play advantages and the role of using glossary assignment changed the current sequencing methods further reading devices for a list of all of allrustrate demonialsis in reinformoinformatics management variables edit edit source eroearts management system information for example of interchial regnition zeroing is the system for a function is exec library sounds routines that the system is used to the system used to the system exception of interge of all polish allocprocessor notation programmethrogative relationship operation and family well defined of the']



In [78]:
sampler = keras_nlp.samplers.TopPSampler(p=0.7)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-P search generated text: \n{txt}\n")

Top-P search generated text: 
[b'[BOS] there are several catalogues together using rners editors prepositions this the acrdian of video editors like the webpages as a tool other words for reading this featureless you will discussing and more often because you can use to find the other contents adeque to learns other word entries print text editors preverbs usages as backspace selection tool writes of text edit edit source text here edit edit source when there is a bd for each page is no easy to say here you can write a gb bar left or left or rick the title hello world of the send tool script if you ve never']



In [96]:
class TopKTextGenerator(keras.callbacks.Callback):
    def __init__(self, k):
        self.sampler = keras_nlp.samplers.TopKSampler(k)

    def on_epoch_end(self, epoch, logs=None):
        output_tokens = self.sampler(
            next=next,
            prompt=prompt_tokens,
            index=1,
        )
        txt = tokenizer.detokenize(output_tokens)
        print(f"Top-K search generated text: \n{txt}\n")


text_generation_callback = TopKTextGenerator(k=10)
model.fit(train_ds.take(1), verbose=2, epochs=2, callbacks=[text_generation_callback])

Epoch 1/2
Top-K search generated text: 
[b'[BOS] contents introduction structure of enzymes mechanism of enzymes references introduction edit edit source enzymes many separate creating shared proteins and genetic processes are earactivity in cellular repdig reduce cystecharing to megulate different nutrition behaviour edit edit source an in a function is useful techniques for the mechanisms of the potential can be clearly ntribreas are essential stages of essential partic section protein that allow us toyste to protein molecules are functional for us these reactions to references to include a functional for the functional for example of the functional organs density prosector nurse protein nutrition in']

1/1 - 6s - loss: 0.9629 - acc: 0.7774 - 6s/epoch - 6s/step
Epoch 2/2
Top-K search generated text: 
[b'[BOS] this book is a list of the best things about the pages with peer target type of control structures on a system list of external links to relation permissionsuse and rule and or 

<keras.callbacks.History at 0x7f4c78774cd0>