In [1]:
import os
# Set the TensorFlow C++ log-level variable before any TensorFlow import in this notebook.
# NOTE(review): TensorFlow documents the values 0-3 for this variable; '15' is outside
# that range (presumably meant as "silence everything") — confirm it is intended.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '15'

In [2]:
# Path of the CSV file that the next cell opens (a file path, despite the `_dir` suffix).
base_dir = 'archive/shahname.csv'

In [3]:
import csv

# Read the fifth CSV column (index 4) of every data row into `text`.
# - encoding='utf-8' makes decoding of the Persian text independent of the
#   platform's default locale encoding.
# - newline='' is what the csv module expects so that quoted fields containing
#   line breaks are parsed correctly.
text = []
with open(base_dir, 'r', encoding='utf-8', newline='') as file:
    csvreader = csv.reader(file)
    header = next(csvreader, None)  # first row holds the column names; keep it out of `text`
    for row in csvreader:
        if not row:
            # csv.reader yields an empty list for blank lines; there is no column 4 to read.
            continue
        text.append(row[4])


# Ordered (old, new) substitutions applied to every line. The order is kept exactly
# as listed because later entries operate on the output of earlier ones.
replacements = (
    ('\xa0', ' '),
    ('\u200c', ' '),
    ('آ', 'ا'),
    ('َ', ''),
    ('ُ', ''),
    ('ِ', ''),
    ('ة', 'ه'),
    ('هٔ', 'ه'),
    ('ك', 'ک'),
    ('ئ', 'ی'),
    ('؛', ''),
    ('ّ', ''),
    ('ْ', ''),
    ('،', ''),
    ('ء', ''),
    ('«', ''),
    ('»', ''),
    ('أ', 'ا'),
    (')', ''),
    ('(', ''),
    ('ؤ', 'و'),
    ('؟', ''),
    ('!', ''),
    (':', ''),
    ('ي', 'ی'),
)
for old, new in replacements:
    text = [line.replace(old, new) for line in text]

# Collapse the list of lines into one space-separated string; from here on
# `text` is a single str, not a list.
text = " ".join(text)

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a word-level tokenizer on the whole corpus (passed as a single document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# word -> integer id, as produced by the tokenizer.
vocab = tokenizer.word_index
# Number of distinct words plus one.
total_words = len(vocab) + 1
# integer id -> word, the inverse of `vocab`.
reversed_dict = {index: word for word, index in vocab.items()}

In [5]:
import numpy as np
import tensorflow as tf

In [6]:
# Encode the corpus as an array of word ids; empty tokens produced by
# consecutive spaces are dropped before the vocabulary lookup.
text_as_int = np.array([vocab[token] for token in filter(None, text.split(' '))])

In [7]:
# Show the first 13 word ids of the encoded corpus.
text_as_int[:13]

array([   2,   78,  364,  113,    1,   99,  140,   25,  670,  219,    6,
       2395,  364])

In [8]:
# Number of input words per training example.
seq_length = 10
# One dataset element per word id (word-level, despite the `char_` prefix).
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Decode the first ten elements back to words as a sanity check.
for word_id in char_dataset.take(10):
    print(reversed_dict[word_id.numpy()])

به
نام
خداوند
جان
و
خرد
کز
این
برتر
اندیشه


In [9]:
# Cut the id stream into consecutive chunks of seq_length + 1 ids; a trailing
# chunk shorter than that is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)
# Print the first two chunks decoded back to words.
for chunk in sequences.take(2):
    decoded = ' '.join(reversed_dict[int(word_id)] for word_id in chunk.numpy())
    print(repr(decoded))
    print("***" * 5)

'به نام خداوند جان و خرد کز این برتر اندیشه بر'
***************
'نگذرد خداوند نام و خداوند جای خداوند روزی ده رهنمای خداوند'
***************


In [10]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair via split_input_target.
dataset = sequences.map(split_input_target)

In [11]:
# Display the dataset's element spec.
dataset

<_MapDataset element_spec=(TensorSpec(shape=(10,), dtype=tf.int64, name=None), TensorSpec(shape=(10,), dtype=tf.int64, name=None))>

In [12]:
# Decode the first (input, target) pair back to words to check the one-word offset.
for input_example, target_example in dataset.take(1):
    for label, example in (('Input data: ', input_example),
                           ('Target data:', target_example)):
        print(label)
        words = [reversed_dict[int(word_id)] for word_id in example.numpy()]
        print(repr(' '.join(words)))

Input data: 
'به نام خداوند جان و خرد کز این برتر اندیشه'
Target data:
'نام خداوند جان و خرد کز این برتر اندیشه بر'


In [13]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 32
# Group pairs into fixed-size batches; an incomplete final batch is dropped.
# NOTE: this rebinds `dataset`, so re-running this cell on its own would batch
# the already-batched dataset a second time.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<_BatchDataset element_spec=(TensorSpec(shape=(32, 10), dtype=tf.int64, name=None), TensorSpec(shape=(32, 10), dtype=tf.int64, name=None))>

In [14]:
vocab_size = len(vocab)
embedding_dim = 50
rnn_units = 1024

In [15]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [16]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [17]:
# Run the untrained model on the first batch and print the shape of its output.
# `input_example_batch` and `example_batch_predictions` are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model.predict(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(32, 10, 16862) # (batch_size, sequence_length, vocab_size)


In [18]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (32, None, 50)            843100    
                                                                 
 lstm (LSTM)                 (32, None, 1024)          4403200   
                                                                 
 dense (Dense)               (32, None, 16862)         17283550  
                                                                 
Total params: 22529850 (85.94 MB)
Trainable params: 22529850 (85.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
# Sample one id per timestep from the logits of the first example in the batch,
# then drop the trailing sample axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

In [20]:
# Display the ids sampled from the untrained model.
sampled_indices

array([ 5128,  5201, 10879, 15791,  3937,  7601, 14318,  5974,  7011,
       13822])

In [21]:
# Print the first input sequence, then the words sampled for it by the untrained model.
for id_sequence in (input_example_batch[0].numpy(), sampled_indices):
    words = [reversed_dict[int(word_id)] for word_id in id_sequence]
    print(repr(' '.join(words)))

'به نام خداوند جان و خرد کز این برتر اندیشه'
'دوکدان جادوپرست کاسپم پرنداوری براشفته بهرمزد جرنگ گزیدم ازاددل خوردشان'


In [22]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [23]:
# Train with the Adam optimizer and the logits-based `loss` function defined in this notebook.
model.compile(optimizer='adam', loss=loss)

In [24]:
# Directory that receives the training checkpoints.
checkpoint_dir = './training_checkpoints_WordLevel'
# Checkpoint file-name pattern; the `{epoch}` placeholder is left for Keras to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; it saves the model weights only, not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [25]:
# Train for 20 epochs, writing checkpoints through `checkpoint_callback`.
history = model.fit(dataset, epochs=20, callbacks=[checkpoint_callback])

Epoch 1/20


I0000 00:00:1717782487.808257   57880 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [28]:
# Directory to restore weights from; same path string as `checkpoint_dir` used during training.
training_checkpoints = './training_checkpoints_WordLevel'

In [29]:
# Rebuild the network with batch size 1 for generation (the stateful LSTM fixes
# the batch size per model instance) and restore the most recent trained weights.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
latest_checkpoint = tf.train.latest_checkpoint(training_checkpoints)
if latest_checkpoint is None:
    # latest_checkpoint returns None when the directory holds no checkpoint;
    # fail here with a clear message instead of passing None to load_weights.
    raise FileNotFoundError(
        f"No checkpoint found in {training_checkpoints!r}; run the training cell first."
    )
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [30]:
def generate_text(model, start_string, num_generate=100, temperature=1.0):
    """Generate text word by word, seeded with ``start_string``.

    Args:
        model: Stateful model built with batch size 1 that maps a ``(1, T)``
            tensor of word ids to ``(1, T, vocab)`` logits.
        start_string: Seed text. It is split on whitespace and every word must
            be present in ``vocab``.
        num_generate: Number of words to sample (default 100).
        temperature: Logits are divided by this value before sampling; 1.0
            samples from the model's unmodified distribution, lower values
            make the output more conservative.

    Returns:
        ``start_string`` followed by the generated words, joined by spaces.

    Raises:
        KeyError: If ``start_string`` contains no words, or contains a word
            that is not in ``vocab``.
    """
    # split() (rather than split(' ')) avoids empty tokens on repeated spaces.
    seed_words = start_string.split()
    if not seed_words:
        raise KeyError("start_string contains no words")
    unknown_words = [word for word in seed_words if word not in vocab]
    if unknown_words:
        raise KeyError(f"words not in vocabulary: {unknown_words}")

    input_eval = tf.expand_dims([vocab[word] for word in seed_words], 0)
    text_generated = []
    # Start from a clean recurrent state for every generation run.
    model.reset_states()
    for _ in range(num_generate):
        # (1, T, vocab) -> (T, vocab)
        predictions = tf.squeeze(model(input_eval), 0)
        # Only the last timestep predicts the next word. Column 0 is dropped
        # because id 0 has no entry in reversed_dict (word ids start at 1),
        # so sampling it would raise KeyError below.
        last_logits = predictions[-1:, 1:] / temperature
        sampled = tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy()
        predicted_id = int(sampled) + 1  # shift back after removing column 0
        # Feed only the new word; the stateful LSTM carries the context.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(reversed_dict[predicted_id])
    return (start_string + ' ' + ' '.join(text_generated))

In [34]:
# Generate and print a sample; every word of the seed must be in `vocab`.
print(generate_text(model, start_string="سلام"))

سلام تدبیر نگارش چنویی دستور چوخشنود فروریختند یازد زودی ز افراسیاب بیاورد زاغ اندر امد به گردش بهشت کمان را هزار همی رخش گویی بروبر دراز یکی گفت ایدر شما از خدای به نخچیر گردی و گردنکشان بماند ازیشان تو این با تو داشتم اگر صد و پولاد بیرون مده به اشتاد فرمود تا تیره شد هم به نزدیک شهر چنین است با فر یزدان نگه کرد نیز یکی انک بر سر و مهتر کنند تو با من خرد باد زین چنین مدار از کهان و مهان برین رنج شادست و شمشیر زن ز هر بد که بد نامدار کزو جنگ
