In [1]:
from enum import unique
import json
import re
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, GRU
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os

In [2]:
with open("../input/pacane/q.json", "r", encoding='utf8') as read_file:
    data = json.load(read_file)

In [3]:
num_words = 5000
oov_token = '<UNK>'
pad_type = 'post'
trunc_type = 'post'

tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(data)

word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(data)
maxlen = max([len(x) for x in train_sequences])
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)


In [4]:

input_len = maxlen
words_len = len(word_index.keys())
print(f"{input_len} inp, {words_len} words")

# I started to use tf dataset to simplify the process
# First it got a all text as list of numbers
print(train_padded)
input_dataset = tf.data.Dataset.from_tensor_slices(train_padded)
seq_length = maxlen+1

# now it holds tuples ((seq_length), (seq_length)) where first is x, second is y
def split_input_target(chunk):
  input_text = chunk[:-1]
  target_text = chunk[1:]
  return input_text, target_text
dataset = input_dataset.map(split_input_target)


195 inp, 8460 words
[[   5   28  151 ...    0    0    0]
 [  11  298 3183 ...    0    0    0]
 [  36   15  872 ...    0    0    0]
 ...
 [   7   29  415 ...    0    0    0]
 [ 396    4    2 ...    0    0    0]
 [   3   37  417 ...    0    0    0]]


In [5]:
# shuffle and split to batches again
batch_size = 16
embedding_dim = 54
rnn_units = 128

dataset = dataset.shuffle(1000).batch(batch_size, drop_remainder=True)
print(dataset)

<BatchDataset shapes: ((16, 194), (16, 194)), types: (tf.int32, tf.int32)>


In [6]:
def build_model(words_len, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(words_len, embedding_dim,
                              batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True),
    tf.keras.layers.Dense(words_len)
    ])
    return model

In [7]:
model = build_model(words_len, embedding_dim, rnn_units, batch_size)
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

In [8]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

model.fit(dataset, epochs=2, callbacks=[checkpoint_callback])
model.save('model1')

Epoch 1/2
Epoch 2/2


In [9]:
model = build_model(words_len, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))
model.save('/model1')

In [10]:

def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
    num_generate = 20
    
  # Converting our start string to numbers (vectorizing)
    input_eval = start_string
    
    input_eval = tf.expand_dims(input_eval, 0)
    
  # Empty string to store our results
    text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
    temperature = 0.4

  # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
    # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(predicted_id)
    
    idx2word = np.array(list(word_index.keys()))
    return ( ' '.join([idx2word[i] for i in text_generated]))
print(generate_text(model, [word_index['когда']]))

ваше анон тишина объясняй —во слабостью наша унас называла прекрасен пять рулю похоронить приняли была оценок вернется прекрасном настоящего миру
