In [2]:
import tensorflow as tf

import numpy as np
import os
import time

def split_input_target(chunk):
    """Turn one sequence into an (input, target) pair shifted by one position.

    The input is every element of ``chunk`` except the last and the target is
    every element except the first, so ``target[i]`` is the element that
    follows ``input[i]`` in ``chunk``.

    Returns:
        A 2-tuple ``(input_text, target_text)``.
    """
    return chunk[:-1], chunk[1:]


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> GRU -> Dense character model.

    Args:
        vocab_size: Size of the embedding vocabulary and number of units in
            the final Dense layer.
        embedding_dim: Output dimension of the embedding layer.
        rnn_units: Number of units in the GRU layer.
        batch_size: Batch dimension fixed in the embedding's
            ``batch_input_shape``; the sequence dimension is left as ``None``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # The GRU is stateful and emits an output for every timestep.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # No activation is set on the Dense layer; the notebook's loss is called
    # with from_logits=True.
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])


In [3]:


# Read the corpus as raw bytes and decode it explicitly as UTF-8. The `with`
# block closes the file handle as soon as the read finishes instead of leaving
# it open for the lifetime of the kernel.
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
with open("/home/retr0/tweets.txt", "rb") as corpus_file:
    txt = corpus_file.read().decode(encoding='utf-8')
print("Txt len: {}".format(len(txt)))

# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(txt))
print("{} Unique chars".format(len(vocab)))

unique = len(vocab)

# Vectorizing: map each character to its index in `vocab` (char2idx) and back
# (idx2char), then encode the whole corpus as an integer array.
char2idx = {u:i for i,u in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in txt])
print(text_as_int)


# Length, in characters, of each model input sequence.
seq_length = 280
# Number of whole (seq_length + 1)-character chunks that fit in the corpus.
examples_per_epoch = len(txt)//(seq_length+1)

# Create training examples / targets:
# stream the encoded corpus one character id at a time, then group it into
# chunks of seq_length + 1 ids (a trailing chunk that is too short is dropped).
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Each chunk becomes an (input, target) pair where the target is the input
# shifted by one character.
dataset = sequences.map(split_input_target)

# Preview the first pair, decoded back to text.
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))


# Walk through the first five timesteps of that same pair; this relies on
# `input_example` / `target_example` still being bound from the loop above.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))





Txt len: 214994
144 Unique chars
[135  50  65 ...  61   2   0]
Input data:  '“There has never been a president in the White House who has been more supportive of HBCUs and their mission than President Trump”\n....the possible exception of another Republican President, the late, great, Abraham Lincoln...and it’s not even close. The Democrats know this, and '
Target data: 'There has never been a president in the White House who has been more supportive of HBCUs and their mission than President Trump”\n....the possible exception of another Republican President, the late, great, Abraham Lincoln...and it’s not even close. The Democrats know this, and s'
Step    0
  input: 135 ('“')
  expected output: 50 ('T')
Step    1
  input: 50 ('T')
  expected output: 65 ('h')
Step    2
  input: 65 ('h')
  expected output: 62 ('e')
Step    3
  input: 62 ('e')
  expected output: 75 ('r')
Step    4
  input: 75 ('r')
  expected output: 62 ('e')


In [4]:
# Batches
# Number of (input, target) sequence pairs per training batch.
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than rebinding
# `dataset` from itself. The first run yields exactly the same pipeline as
# before, but re-running this cell no longer batches an already batched
# dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

print(dataset)


# Model hyperparameters.
vocab_size = len(vocab)   # number of distinct characters in the corpus
embedding_dim = 256       # output dimension of the embedding layer
rnn_units = 1024          # number of GRU units

# Training model; its batch dimension is fixed to BATCH_SIZE.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

# Smoke test: run a single batch through the model and print the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


<BatchDataset shapes: ((64, 280), (64, 280)), types: (tf.int64, tf.int64)>
(64, 280, 144) # (batch_size, sequence_length, vocab_size)


In [5]:
# Sample one character id per timestep from the logits of the first example in
# the batch. The model has not been fitted yet at this point in the notebook.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array of ids.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()


# Decode both the input ids and the sampled ids back to text for comparison.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'L! GOD BLESS THE\n\nRobert, we will end up stronger than ever before. Thank you!\nWe will be guided by the wishes of Prime Minister Abe of Japan, a great friend of the United States and a man who has done a magnificent job on the Olympic Venue, as to attending the Olympic Games in J'

Next Char Predictions: 
 '=qं="खठुऔ—🇮P2़pl\u202ft–tー़3uP;चXbMते🇳wEO\'@_नअkIb—#YshलvKW🇺ड1रr🇺ैघ3इaय\'.एdुa.fड@s?,&ि ष4Jठ–नघऔ5‘!वM\u202fलYlएー=़Pँーटद#\nूーधरै”झ8@\u202fLप=षड🇺Yह”इउघjcूंv\n7ँम0🇳\'+छेुं%VंM🇺गv🇮0ािघXग/आउऔxL=;नGगमw”C-खcहnआ“डmउ0!nA🇸N5jvs8ल?🇸PAधडझपR”नWqषtबDwKीF@–k0\'\'”1C–ष%—SOZe"#6\u202fEUMlNSPइ\nnसSLंऔुस,aद9Eड-घ6d।चB_प🇸Yu–NU“'


In [6]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is passed because the values are handed to the loss
    as logits rather than as probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Sanity-check the loss on the example batch taken earlier. The result is
# reduced to a single number with .mean() for display. For reference, a
# uniform guess over 144 characters would give ln(144) ~= 4.97.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())


Prediction shape:  (64, 280, 144)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.970621


In [12]:
# Directory where the checkpoints will be saved (relative to the working
# directory of the kernel).
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder left in the path
# for the checkpoint callback (the saved files end up named ckpt_1, ckpt_2, ...).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; save_weights_only=True stores just the
# weights rather than the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [14]:
# Configure training: Adam optimizer with the custom `loss` function defined
# earlier in the notebook (sparse categorical cross-entropy on logits).
model.compile(optimizer='adam', loss=loss)


In [15]:
# Number of passes over the batched dataset.
EPOCHS=10
# Train the model; the checkpoint callback is invoked during fitting and the
# returned History object is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


Train for 11 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Display the path of the most recent checkpoint found in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)


'./training_checkpoints/ckpt_10'

In [17]:
# Rebuild the same architecture with a batch size of 1 for text generation,
# then restore the weights from the most recent training checkpoint.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))


In [18]:
# Print the layer-by-layer summary of the batch-size-1 model rebuilt above.
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            36864     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 144)            147600    
Total params: 4,122,768
Trainable params: 4,122,768
Non-trainable params: 0
_________________________________________________________________


In [19]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text by sampling from the model one character at a time.

    The seed string is converted to ids and fed to the model as a batch of
    one; after that, each sampled character id is fed back as the next input.

    Args:
        model: Model that is called on a ``(1, seq_len)`` tensor of character
            ids and exposes ``reset_states()``; its output is treated as
            per-timestep logits with a leading batch dimension of 1.
        start_string: Non-empty seed text. Every character must be a key of
            the notebook-level ``char2idx`` mapping.
        num_generate: Number of characters to sample (default 1000).
        temperature: Positive value the logits are divided by before
            sampling. Low temperatures result in more predictable text,
            higher temperatures in more surprising text (default 1.0).

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            strictly positive.
        KeyError: If ``start_string`` contains a character that is missing
            from ``char2idx``.
    """
    # Validate up front: an empty seed gives the model nothing to predict
    # from, and dividing logits by a non-positive temperature yields inf/NaN
    # or an inverted distribution.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {}".format(temperature))

    # Convert the start string to ids (vectorizing) and add a batch dimension.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # Characters sampled so far.
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        # Sample from the categorical distribution defined by the scaled
        # logits; only the sample for the last timestep is used.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character becomes the next input; the model was reset
        # once before the loop and is not reset between steps.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)


In [20]:
# Generate text from the restored batch-size-1 model, seeded with "The".
print(generate_text(model, start_string=u"The"))


Thean 5. Dere &aowdsimlecags. @Sthets G He; s ho basudderren, whox they ate furibp hoon, wold he toedicucilh and thandike Ahant yada” Jong the sall Sechan. Ne memo rcrient Fes ouk sa ciust he ond toures hliwramfif Neativathe sesa bune,!
MU.0..). Fers.. N Thanpewey ar DEY on Direnein thist weouber St oard comes, ton mouud yyen tha af buppplesingres bem.
C in mergoft!
@ReNeVest wate rodhat, un sibgion @pasy pmouly ind, Tfhin frekeys, Pren’tind ture at arsitind!
To tery vabeenteuy fheora. Nerigre &ade ther, ory als angse powe th Nedy Rasing moucle porske Fanstre ollerika pe pant ngy raverty in wigrty ur Con the ghar yt3 Wompibs Fogpif wallly avef tathang Coon wing sis Gordkoce ty yous overonke gramares if hot Netat bug Thin P Poto weint. Brmeas he whe andas, dat Cureper thesors atk nam @N, Nheade clisg is Seigitn the OPqrinmer, ths wat bor tocono ghered was, Wemer to be ing bat elin tha cof as, Fust on tr bouk! Svao dof ofly oveind therere qotasi. ny Iy fon @EIUेOins yen deen s oullast to