In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import urllib.request
import re




In [2]:
# Project-local helper module; its implementation is not visible in this notebook.
from datasets_analizer import DataSetsAnalizer

DSA = DataSetsAnalizer()

# The training corpus is the analyzer's string form.
# NOTE(review): assumes DataSetsAnalizer.__str__ returns the full raw text — confirm in datasets_analizer.
text = str(DSA)

In [3]:
# Character vocabulary: every distinct symbol of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
",".join(vocab)

'\n, ,!,",#,$,%,&,\',(,),*,+,,,-,.,/,0,1,2,3,4,5,6,7,8,9,:,;,<,=,>,?,@,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,[,],^,_,`,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,©,\xad,°,Ё,А,Б,В,Г,Д,Е,Ж,З,И,Й,К,Л,М,Н,О,П,Р,С,Т,У,Ф,Х,Ц,Ч,Ш,Щ,Ъ,Ы,Ь,Э,Ю,Я,а,б,в,г,д,е,ж,з,и,й,к,л,м,н,о,п,р,с,т,у,ф,х,ц,ч,ш,щ,ъ,ы,ь,э,ю,я,ё,’,“,”,€,№,�'

In [4]:
# Two-way lookup between characters and their integer ids
# (an id is the character's position in the sorted vocabulary).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of character ids.
text_as_int = np.array([char2idx[symbol] for symbol in text])

# Preview the first 20 vocabulary entries together with their ids.
[f"{char} = {idx}" for idx, char in enumerate(vocab[:20])]

['\n = 0',
 '  = 1',
 '! = 2',
 '" = 3',
 '# = 4',
 '$ = 5',
 '% = 6',
 '& = 7',
 "' = 8",
 '( = 9',
 ') = 10',
 '* = 11',
 '+ = 12',
 ', = 13',
 '- = 14',
 '. = 15',
 '/ = 16',
 '0 = 17',
 '1 = 18',
 '2 = 19']

In [5]:
# Number of characters in one model input; each chunk holds one extra
# character so that a shifted target of the same length can be cut from it.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

# Stream the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode the first five ids back to characters as a sanity check.
[idx2char[char_id.numpy()] for char_id in char_dataset.take(5)]

['Ц', 'Б', ' ', 'Р', 'Ф']

In [6]:
# Group the character stream into chunks of seq_length + 1 ids,
# discarding the incomplete chunk at the end of the corpus.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Show the first five chunks decoded back to text.
[repr("".join(idx2char[chunk.numpy()])) for chunk in sequences.take(5)]

["'ЦБ РФ объявил о переходе от плавающего курса рубля к тонущему.\\n- Ваши политические взгляды?- Самец.\\nЯ'",
 "' ужинать хочу. Куда враги все подевались?\\nПо утрам бегают те, кто ночью бездельничал.\\n- Кого ты боишь'",
 "'ся больше всего на свете?- Темноты и стоматологов.- Ну, стоматологов - ясно, а темноты-то почему?- А '",
 "'кто знает, сколько их там, в темноте-то стоматологов!\\nИдет первая русско-украинская война в интернете'",
 "'. Убитых нет, но много раненых в голову.\\nПочему россияне не боятся антироссийских санкций? Людей, кот'"]

In [7]:
@tf.autograph.experimental.do_not_convert
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[t] is the element that
    follows input[t] in the original chunk.
    """
    return chunk[:-1], chunk[1:]


dataset = sequences.map(split_input_target)

In [8]:
# Decode and print the first (input, target) pair; the loop variables are
# left in scope on purpose so they can be inspected afterwards.
for input_example, target_example in dataset.take(1):
    decoded_input = ''.join(idx2char[input_example.numpy()])
    decoded_target = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(decoded_input))
    print('Target data:', repr(decoded_target))

Input data:  'ЦБ РФ объявил о переходе от плавающего курса рубля к тонущему.\n- Ваши политические взгляды?- Самец.\n'
Target data: 'Б РФ объявил о переходе от плавающего курса рубля к тонущему.\n- Ваши политические взгляды?- Самец.\nЯ'


In [9]:
# Walk through the first five positions of the example pair drawn in the
# previous cell, printing each input id next to the id the model is expected
# to predict for it.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    # idx2char turns the integer id back into its character for display.
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 117 ('Ц')
  expected output: 96 ('Б')
Step    1
  input: 96 ('Б')
  expected output: 1 (' ')
Step    2
  input: 1 (' ')
  expected output: 111 ('Р')
Step    3
  input: 111 ('Р')
  expected output: 115 ('Ф')
Step    4
  input: 115 ('Ф')
  expected output: 1 (' ')


In [10]:
# Training-pipeline settings.
BATCH_SIZE = 512     # sequences per training batch
BUFFER_SIZE = 10000  # size of the shuffle buffer, in sequences

# Derive the batched dataset from `sequences` instead of re-wrapping `dataset`:
# the previous `dataset = dataset.shuffle(...).batch(...)` form batched an
# already-batched dataset whenever this cell was executed a second time.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Model hyperparameters.
vocab_size = len(vocab)  # number of distinct characters in the corpus
embedding_dim = 256      # width of the character embedding
rnn_units = 1024         # units in the first GRU layer
rnn_units_2 = 512        # units in the second GRU layer

In [11]:
# Character-level language model used for training:
# embedding -> two stacked stateful GRU layers -> per-step logits over the vocabulary.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[BATCH_SIZE, None],
    ),
    *[
        tf.keras.layers.GRU(
            units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        )
        for units in (rnn_units, rnn_units_2)
    ],
    tf.keras.layers.Dense(vocab_size),
])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (512, None, 256)          42496     
                                                                 
 gru (GRU)                   (512, None, 1024)         3938304   
                                                                 
 gru_1 (GRU)                 (512, None, 512)          2362368   
                                                                 
 dense (Dense)               (512, None, 166)          85158     
                                                                 
Total params: 6428326 (24.52 MB)
Trainable params: 6428326 (24.52 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is required because the final Dense layer has no
    activation, so the model outputs unnormalized scores.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )


model.compile(loss=loss, optimizer='adam')




In [13]:
# Directory the checkpoint callback writes into; the inference cell later
# looks up the latest checkpoint in this same directory.
checkpoint_dir = './training_checkpoints'
# "{epoch}" is left as a literal placeholder in the path for the callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Keras callback configured to store the weights only, not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [14]:
# Train for 3 epochs; the checkpoint callback is passed so weights are written during training.
# NOTE(review): the recorded run below was interrupted during epoch 1 (ETA of about two hours),
# so the cells that follow may find no checkpoint in `checkpoint_dir`.
history = model.fit(dataset, epochs=3, callbacks=[checkpoint_callback])

Epoch 1/3

  5/318 [..............................] - ETA: 2:00:16 - loss: 4.7653

KeyboardInterrupt: 

In [None]:
def generate_text(model, start_string, temp, gen_chars):
  """Generate text one character at a time, seeded with `start_string`.

  Args:
    model: stateful Keras model that is called on a batch of one id sequence
      and returns logits with a leading batch dimension of 1.
    start_string: non-empty seed text; every character must be a key of the
      module-level `char2idx` mapping.
    temp: sampling temperature, must be > 0. Logits are divided by it before
      sampling, so values below 1 sharpen the distribution and values above
      1 flatten it.
    gen_chars: number of characters to generate after the seed.

  Returns:
    `start_string` followed by the `gen_chars` sampled characters.

  Raises:
    ValueError: if `start_string` is empty or `temp` is not positive.
    KeyError: if `start_string` contains a character missing from `char2idx`.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temp <= 0:
    # Dividing logits by zero or a negative value yields inf/NaN or an inverted distribution.
    raise ValueError("temp must be positive, got {!r}".format(temp))

  unknown_chars = sorted(set(start_string) - set(char2idx))
  if unknown_chars:
    raise KeyError(
        "start_string contains characters missing from the vocabulary: {!r}".format(unknown_chars))

  # Encode the seed and add the batch dimension expected by the model.
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak into this one.
  model.reset_states()
  for _ in range(gen_chars):
    # Drop the batch dimension: one row of logits per input position.
    predictions = tf.squeeze(model(input_eval), 0)
    # Only the last position predicts the next character, so sample from that row alone.
    last_logits = predictions[-1:] / temp
    predicted_id = tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy()
    # Feed only the sampled character back in; the stateful layers keep the context.
    input_eval = tf.expand_dims([predicted_id], 0)
    text_generated.append(idx2char[predicted_id])
  return start_string + ''.join(text_generated)

: 

In [None]:
# Rebuild the network for inference with a batch size of 1 (the training model
# is fixed to BATCH_SIZE), then restore the most recent training weights.
# The architecture must match the training model layer for layer.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocab_size, embedding_dim,
                            batch_input_shape=[1, None]),
  tf.keras.layers.GRU(rnn_units,
                      return_sequences=True,
                      stateful=True,
                      recurrent_initializer='glorot_uniform'),
  tf.keras.layers.GRU(rnn_units_2,
                      return_sequences=True,
                      stateful=True,
                      recurrent_initializer='glorot_uniform'),
  tf.keras.layers.Dense(vocab_size)
])
# Create the variables before restoring into them.
model.build(tf.TensorShape([1, None]))

# latest_checkpoint returns None when the directory holds no checkpoint
# (for example when training was interrupted before the first epoch finished).
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise FileNotFoundError(
      "No checkpoint found in {!r}; train the model for at least one full epoch first.".format(
          checkpoint_dir))
model.load_weights(latest_checkpoint)
model.summary()

: 

In [None]:
# Sample 200 characters from the restored model at temperature 1.0.
generate_text(
    model,
    start_string=u"Появился в зоне чёрни сталкер",
    temp=1.0,
    gen_chars=200,
)

: 

In [None]:
# Optional: uncomment to save the current `model` to disk.
# model.save("NLP_gen_illiad_200")

: 