In [24]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [25]:
def load_text_data(url):
    response = requests.get(url)

    if response.status_code == 200:
        text_data = response.text
        return text_data
    else:
        raise Exception(f"Failed to retrieve the content. Status code: {response.status_code}")


In [26]:
def preprocess_text_data(text_data, seq_length):
    chars = sorted(list(set(text_data)))
    char_to_index = {char: i for i, char in enumerate(chars)}
    index_to_char = {i: char for i, char in enumerate(chars)}

    sequences = []
    next_chars = []

    for i in range(len(text_data) - seq_length):
        seq = text_data[i:i + seq_length]
        next_char = text_data[i + seq_length]
        sequences.append(seq)
        next_chars.append(next_char)

    X = np.zeros((len(sequences), seq_length, len(chars)), dtype=np.bool)
    y = np.zeros((len(sequences), len(chars)), dtype=np.bool)

    for i, seq in enumerate(sequences):
        for t, char in enumerate(seq):
            X[i, t, char_to_index[char]] = 1
        y[i, char_to_index[next_chars[i]]] = 1

    return X, y, char_to_index, index_to_char

In [27]:
def build_model(seq_length, num_chars):
    model = Sequential()
    model.add(LSTM(128, input_shape=(seq_length, num_chars)))
    model.add(Dense(num_chars, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [28]:
def train_model(model, X_train, y_train, epochs=20, batch_size=64, validation_split=0.1):
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)


In [29]:
def generate_text(model, start_text, char_to_index, index_to_char, seq_length, temperature=1.0, num_chars=100):
    generated_text = start_text

    for _ in range(num_chars):
        x_pred = np.zeros((1, seq_length, len(char_to_index)))
        for t, char in enumerate(start_text):
            x_pred[0, t, char_to_index[char]] = 1.0

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = index_to_char[next_index]

        generated_text += next_char
        start_text = start_text[1:] + next_char

    return generated_text

In [30]:
url = "https://raw.githubusercontent.com/abdoelsayed2016/HKR_Dataset/master/LICENSE.CC-BY-NC-ND-4.0"
text_data = load_text_data(url)

In [31]:
seq_length = 50

In [32]:
X, y, char_to_index, index_to_char = preprocess_text_data(text_data, seq_length)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = np.zeros((len(sequences), seq_length, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sequences), len(chars)), dtype=np.bool)


In [33]:
def build_model(seq_length, num_chars):
    model = Sequential()
    model.add(LSTM(128, input_shape=(seq_length, num_chars)))
    model.add(Dense(num_chars, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [34]:
def train_model(model, X_train, y_train, epochs=20, batch_size=64, validation_split=0.1):
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)


In [35]:
def generate_text(model, start_text, char_to_index, index_to_char, seq_length, temperature=1.0, num_chars=100):
    generated_text = start_text

    for _ in range(num_chars):
        x_pred = np.zeros((1, seq_length, len(char_to_index)))
        for t, char in enumerate(start_text):
            x_pred[0, t, char_to_index[char]] = 1.0

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = index_to_char[next_index]

        generated_text += next_char
        start_text = start_text[1:] + next_char

    return generated_text


In [36]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [37]:
seq_length = X.shape[1]
num_chars = X.shape[2]
model = build_model(seq_length, num_chars)
train_model(model, X, y)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [44]:
start_text = "Lorem ipsum "
generated_text = generate_text(model, start_text, char_to_index, index_to_char, seq_length, temperature=0.5, num_chars=200)


In [45]:
print(generated_text)

Lorem ipsum ardteceeete t teateui rteet uaaae ttiieeeeteo iorree tedeeecgdeiadorecterettaiaee ttgctt  teiote ieeeoeeyrrg tgeat tiaotueeetrige ttetw ieertriiatasttigt eiiteoeurc erteereteomtettteeetretd
tgpig,ers 
