In [2]:
import os
#get current working directory
path = os.getcwd()
#read the raw lavrov json dump and decode it as UTF-8; the context manager
#guarantees the file handle is closed once the bytes have been read
with open('/lavrov.json', 'rb') as raw_file:
    text = raw_file.read().decode(encoding='utf-8')
print("Text is {} characters long".format(len(text)))
#rough word count over the raw file: split on single spaces and keep every
#chunk that has visible characters, plus chunks that are exactly one newline
words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
print("Text is {} words long".format(len(words)))

Text is 27019858 characters long
Text is 4260425 words long


In [3]:
import json
import string
# Site-navigation boilerplate that is stripped out of every article body,
# in the order the replacements are applied.
boilerplate_fragments = (
    'Toggle navigation      / / Asset Publisher',
    'Advanced settings         Switcher      6 Photos close',
)

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (the raw-statistics cell decodes the same file as UTF-8).
with open("/lavrov.json", encoding="utf-8") as f:
    jsonString = f.read()
jsonData = json.loads(jsonString)

cleaned_articles = []
for article in jsonData:
    cleaned_articletext = article['ArticleText']
    for fragment in boilerplate_fragments:
        cleaned_articletext = cleaned_articletext.replace(fragment, '')
    cleaned_articles.append(cleaned_articletext)

# Every article is preceded by a single space (including the first one).
# Joining once avoids re-copying the growing corpus on every iteration.
articles_string = ''.join(' ' + article_text for article_text in cleaned_articles)

def clean_doc(doc):
    """Turn a document into a list of lower-case, purely alphabetic tokens.

    The text is split on whitespace, every character listed in
    ``string.punctuation`` is deleted from each piece, and pieces that are
    not entirely alphabetic afterwards (empty, or containing digits or other
    symbols) are discarded. Surviving tokens are returned lower-cased, in
    their original order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw_token in doc.split():
        candidate = raw_token.translate(strip_punctuation)
        # isalpha() is False for '' too, so punctuation-only pieces vanish here
        if candidate.isalpha():
            cleaned.append(candidate.lower())
    return cleaned

def save_doc(lines, filename):
    """Write ``lines`` to ``filename`` as UTF-8 text, one entry per line.

    Entries are joined with a newline character; no trailing newline is
    appended. An existing file is overwritten.

    Args:
        lines: iterable of strings to write.
        filename: path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager closes the file even if the write fails. The
    # explicit encoding keeps the output readable by the UTF-8 decode used
    # elsewhere in this notebook, whatever the platform default is.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)
 
# Tokenise the cleaned corpus, then report its length and vocabulary size.
tokens = clean_doc(articles_string)
print(
    'Total Tokens: {}'.format(len(tokens)),
    'Unique Tokens: {}'.format(len(set(tokens))),
    sep='\n',
)


# save sequences to file
#out_filename = 'lavrov_sequences.txt'
#save_doc(sequences, out_filename)

#load file from save
#lavrov_sequences = open('/lavrov_sequences.txt', 'rb').read().decode(encoding='utf-8')

Total Tokens: 4074027
Unique Tokens: 34640


In [4]:
import numpy as np
# From here on the model works on characters of the cleaned article corpus.
text = articles_string

# Vocabulary: every distinct character in sorted order. char2int maps a
# character to its position in vocab; int2char is the inverse lookup array.
vocab = sorted(set(text))
char2int = dict(zip(vocab, range(len(vocab))))
int2char = np.array(vocab)

# Encode the whole corpus as int32 character ids.
text_as_int = np.array(list(map(char2int.__getitem__, text)), dtype=np.int32)
print ('{}\n mapped to integers:\n {}'.format(repr(text[:100]), text_as_int[:100]))

# Fixed split point: the first 21,615,872 characters are used for training
# and everything after that offset is held out for validation.
tr_text, val_text = text_as_int[:21615872], text_as_int[21615872:]

print(text_as_int.shape, tr_text.shape, val_text.shape)


'  18 November 2019 2364-18-11-2019 Mr Makei, Mr Rapota, Colleagues, friends, Welcome to a joint meet'
 mapped to integers:
 [ 0  0 17 24  0 46 77 84 67 75 64 67 80  0 18 16 17 25  0 18 19 22 20 13
 17 24 13 17 17 13 18 16 17 25  0 45 80  0 45 63 73 67 71 12  0 45 80  0
 50 63 78 77 82 63 12  0 35 77 74 74 67 63 69 83 67 81 12  0 68 80 71 67
 76 66 81 12  0 55 67 74 65 77 75 67  0 82 77  0 63  0 72 77 71 76 82  0
 75 67 67 82]
(25515589,) (21615872,) (3899717,)


In [0]:
# Training hyperparameters and model dimensions shared by the cells below.
batch_size = 128  # sequences per batch; also the fixed batch dimension of the training model
buffer_size = 10000  # shuffle buffer size used when building the datasets
embedding_dim = 256  # output dimension of the character embedding layer
epochs = 50  # maximum number of epochs passed to model.fit
seq_length = 200  # characters per input sequence (chunks are cut at seq_length+1)
examples_per_epoch = len(text)//seq_length  # NOTE(review): not referenced by any other visible cell
#lr = 0.001 #not set: the Adam optimizer is used with its default learning rate
rnn_units = 1024  # units in each LSTM layer
vocab_size = len(vocab)  # number of distinct characters in the corpus

In [6]:
import tensorflow as tf

# Wrap each integer-encoded split as a dataset of single character ids.
tr_char_dataset, val_char_dataset = (
    tf.data.Dataset.from_tensor_slices(encoded_split)
    for encoded_split in (tr_text, val_text)
)
print(tr_char_dataset, val_char_dataset)

# Cut each stream into fixed chunks of seq_length+1 ids (one extra id so the
# chunk can later be split into an input and a shifted target); a short
# trailing chunk is dropped.
tr_sequences, val_sequences = (
    char_dataset.batch(seq_length+1, drop_remainder=True)
    for char_dataset in (tr_char_dataset, val_char_dataset)
)
print(tr_sequences, val_sequences)

# Show the first chunk of each split, decoded back to text and as raw ids.
for split_sequences in (tr_sequences, val_sequences):
    for item in split_sequences.take(1):
        print(repr(''.join(int2char[item.numpy()])))
        print(item)

<TensorSliceDataset shapes: (), types: tf.int32> <TensorSliceDataset shapes: (), types: tf.int32>
<BatchDataset shapes: (201,), types: tf.int32> <BatchDataset shapes: (201,), types: tf.int32>
'  18 November 2019 2364-18-11-2019 Mr Makei, Mr Rapota, Colleagues, friends, Welcome to a joint meeting of the collegiums of the foreign ministries of Russia and the Republic of Belarus. This meeting i'
tf.Tensor(
[ 0  0 17 24  0 46 77 84 67 75 64 67 80  0 18 16 17 25  0 18 19 22 20 13
 17 24 13 17 17 13 18 16 17 25  0 45 80  0 45 63 73 67 71 12  0 45 80  0
 50 63 78 77 82 63 12  0 35 77 74 74 67 63 69 83 67 81 12  0 68 80 71 67
 76 66 81 12  0 55 67 74 65 77 75 67  0 82 77  0 63  0 72 77 71 76 82  0
 75 67 67 82 71 76 69  0 77 68  0 82 70 67  0 65 77 74 74 67 69 71 83 75
 81  0 77 68  0 82 70 67  0 68 77 80 67 71 69 76  0 75 71 76 71 81 82 80
 71 67 81  0 77 68  0 50 83 81 81 71 63  0 63 76 66  0 82 70 67  0 50 67
 78 83 64 74 71 65  0 77 68  0 34 67 74 63 80 83 81 14  0 52 70 71 81  0
 75 67 67

In [7]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element; the target is the chunk
    without its first element, so target[i] is the element that follows
    input[i].
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair, shuffle the pairs, and group
# them into fixed-size batches; an incomplete final batch is dropped.
tr_dataset = (
    tr_sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)
val_dataset = (
    val_sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)
print(tr_dataset, val_dataset)

<BatchDataset shapes: ((128, 200), (128, 200)), types: (tf.int32, tf.int32)> <BatchDataset shapes: ((128, 200), (128, 200)), types: (tf.int32, tf.int32)>


In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level language model.

    Layer order: embedding -> dropout -> LSTM -> dropout -> LSTM -> dropout
    -> dense. Every dropout layer uses rate 0.2. Both LSTMs are stateful and
    return the full output sequence, and the final dense layer has no
    activation and produces ``vocab_size`` values per time step.

    Args:
        vocab_size: number of distinct character ids (embedding input size
            and dense output size).
        embedding_dim: width of the embedding vectors.
        rnn_units: number of units in each LSTM layer.
        batch_size: fixed batch dimension declared on the embedding layer's
            input shape; the sequence dimension is left as None.

    Returns:
        The assembled ``tf.keras.Sequential`` model (not compiled).
    """
    layers = tf.keras.layers

    def stateful_lstm():
        # Both recurrent layers share exactly the same configuration.
        return layers.LSTM(rnn_units,
                           return_sequences=True,
                           stateful=True,
                           recurrent_initializer='glorot_uniform')

    layer_stack = [
        layers.Embedding(vocab_size, embedding_dim,
                         batch_input_shape=[batch_size, None]),
        layers.Dropout(0.2),
        stateful_lstm(),
        layers.Dropout(0.2),
        stateful_lstm(),
        layers.Dropout(0.2),
        layers.Dense(vocab_size),
    ]
    return tf.keras.Sequential(layer_stack)

In [0]:
# Training model: built with the training batch size as its fixed batch dimension.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size)

In [10]:
# Sanity check: run a single training batch through the freshly built model
# and print the shape of its output.
for input_example_batch, target_example_batch in tr_dataset.take(1):
    # The loop variables and the predictions stay bound after the loop ends;
    # the following cells read them.
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "respectively: batch_size, sequence_length, vocab_size")

(128, 200, 89) respectively: batch_size, sequence_length, vocab_size


In [11]:
# Draw one character id per time step from the model's output for the first
# sequence of the example batch, then drop the trailing sample axis.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

# Decode both the input sequence and the sampled ids back to text.
print("Input: \n", repr("".join(int2char[input_example_batch[0]])))
print()
print("Predictions: \n", repr("".join(int2char[sampled_indices ])))

Input: 
 's. These are steps in the right direction. We will try and help them become a reality. As soon as it becomes clear that the decisions of the October 2016 summit in Berlin are fulfilled, I think that a'

Predictions: 
 "cEeesQ9%2C'lJJg2FWNMhav<4nmEvY>7b8s)@'o9]0*PBtd*?Djz<D@G9n;#l>; <DU-+03tC`dGR/r<MAp%UAFUf vsJ DWG9Z.lIYTe!4o&$di3vdUjaBd8zJ&FKx.xpYv_Hx?n/CEg;<@h#hhdqef(X?_fr6M@<A%h5rqi-dv<E]d(6bT,YLU6)vrUEjeJ4JetswC"


In [12]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
def accuracy(labels, logits):
    """Sparse categorical accuracy of the model outputs against integer labels."""
    return tf.keras.metrics.sparse_categorical_accuracy(
        y_true=labels,
        y_pred=logits,
    )

# Score the example batch with both functions (loss first, then accuracy)
# and report the mean of each over the batch.
example_batch_loss, example_batch_acc = (
    score_fn(target_example_batch, example_batch_predictions)
    for score_fn in (loss, accuracy)
)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Loss:      ", example_batch_loss.numpy().mean())
print("Accuracy:      ", example_batch_acc.numpy().mean())

Prediction shape:  (128, 200, 89)  # (batch_size, sequence_length, vocab_size)
Loss:       4.488257
Accuracy:       0.020117188


In [0]:
# Adam is created with no arguments, i.e. with the library's default
# hyperparameters, and the model is compiled against the custom loss above.
optimizer = tf.keras.optimizers.Adam()
model.compile(loss=loss, optimizer=optimizer)

In [0]:
# Early stopping watches validation loss and is configured with this patience.
patience = 2
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=patience,
    monitor='val_loss',
)

In [0]:
import datetime
# Each run writes to its own timestamped checkpoint directory.
# NOTE: the timestamp contains ':' characters, which are not valid in
# Windows paths.
checkpoint_dir = './checkpoints'+ datetime.datetime.now().strftime("_%Y.%m.%d-%H:%M:%S")
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Only write a checkpoint when validation loss improves. The generation cell
# loads the most recent checkpoint in checkpoint_dir; with a checkpoint after
# every epoch that would be the weights from the final epoch, which under
# early stopping come after validation loss stopped improving. With
# save_best_only the most recent checkpoint is the best one seen.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True)

In [15]:
# Train with per-epoch validation; checkpointing and early stopping are
# handled by the callbacks.
history = model.fit(tr_dataset, epochs=epochs, callbacks=[checkpoint_callback, early_stop] , validation_data=val_dataset)

# history.history holds one loss value per completed epoch, so fewer entries
# than `epochs` means training ended before the epoch limit. Only then is
# the "stopped" message accurate.
epochs_run = len(history.history['loss'])
if epochs_run < epochs:
    print ("Training stopped as there was no improvement after {} epochs".format(patience))
else:
    print ("Training completed all {} epochs".format(epochs))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Training stopped as there was no improvement after 2 epochs


In [25]:
import matplotlib.pyplot as plt

# Plot training loss per epoch, plus validation loss when it was recorded.
# Requires the `history` object returned by model.fit in this session.
fig, ax = plt.subplots(figsize=(12,9))
ax.plot(history.history['loss'], 'g', label='Train')
if 'val_loss' in history.history:
    ax.plot(history.history['val_loss'], 'rx', label='Validation')
ax.set_title('Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
# A single legend built from the line labels; it lists exactly what was drawn.
ax.legend(loc='upper right')
plt.show()

NameError: ignored

<Figure size 864x648 with 0 Axes>

In [26]:
# Rebuild the network with a batch size of 1 for text generation and load the
# trained weights from the most recent checkpoint of this run.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # latest_checkpoint returns None when the directory holds no checkpoint;
    # fail with a clear message instead of an obscure error in load_weights.
    raise FileNotFoundError("No checkpoint found in {}".format(checkpoint_dir))
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text one character at a time, seeded with ``start_string``.

    The seed is encoded with the notebook-level ``char2int`` mapping and fed
    to the model. After that, each sampled character id is fed back in as the
    next input, and ids are decoded with ``int2char``.

    Args:
        model: the network to sample from; it is called with inputs whose
            batch dimension is 1 and its states are reset before generating.
        start_string: non-empty seed text; every character must be a key of
            ``char2int``.
        num_generate: number of characters to generate after the seed.
        temperature: positive divisor applied to the model outputs before
            sampling. Values below 1 sharpen the distribution (more
            predictable text); values above 1 flatten it.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: if ``start_string`` contains a character missing from
            ``char2int``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    unknown_chars = sorted(set(start_string) - set(char2int))
    if unknown_chars:
        raise KeyError(
            "start_string contains characters not in the vocabulary: {!r}".format(unknown_chars))

    print('Generating with seed: "' + start_string + '"')

    # Encode the seed and add a leading batch dimension of 1.
    input_eval = tf.expand_dims([char2int[s] for s in start_string], 0)
    text_generated = []

    # Start from clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension and apply the temperature.
        predictions = tf.squeeze(predictions, 0) / temperature
        # Sample at every time step but keep only the last one: that is the
        # prediction for the character following the current input.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        # The model is stateful, so only the newly sampled id is fed back in.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(int2char[predicted_id])
    return (start_string + ''.join(text_generated))
# Sample from the reloaded model, seeding generation with "In".
print(generate_text(model, "In"))

Generating with seed: "In"
Independent article so far, as the commission, is still in the process of solving the problem of e of Russian culture.  We paid special attention to the fordation of a new document where the decision made by US group of states in personnel that could make it possible to create a shift to the synthesis of the conflicts in Iraq, where Abkhazia and South Ossetia cannot be notified. We presume that we want to give the goal of preventing a provocation and help curb this in Libya now. I hope all what is happening in Abkhazia and South Ossetia.  We agreed to have enough organized state auspices.  We regret that the centre of document Development Special Tows and the choice the all of the real financial system in Europe is already a  interview his attention to this issue.  We do not understand why the conference worked as a result of your decision to commence the decisions of the UN Security Council, which centralizes it are being clearly decided to come.   At the sa