In [None]:
import os
import numpy as np
import tensorflow as tf

class DataProvider:
    """Character-level batch provider backed by a single text file.

    The file is read in full, every distinct character is assigned an
    integer id (in sorted order), and the encoded text is cut into
    ``num_batches`` consecutive batches of shape
    ``(batch_size, sequence_length)``.  Targets are the inputs shifted by one
    character, i.e. ``targets[i]`` is the character following ``inputs[i]``.

    Attributes set by the constructor:
        chars, vocabulary_size, char_to_idx, idx_to_char: the vocabulary.
        tensor / target_tensor: flat encoded inputs and shifted targets.
        input_batches / target_batches: the same data reshaped to
            ``(batch_size, num_batches * sequence_length)``.
        num_batches: number of batches available per epoch.
        pointer: index of the batch the next ``next_batch`` call returns.
    """

    DEFAULT_FILE_NAME = "10-million-password-list-top-100000.txt"

    def __init__(self, data_dir, batch_size, sequence_length,
                 file_name=DEFAULT_FILE_NAME):
        """Load, encode and batch the text file ``data_dir/file_name``.

        Args:
            data_dir: Directory containing the text file.
            batch_size: Number of sequences per batch (must be positive).
            sequence_length: Number of characters per sequence (must be
                positive).
            file_name: Name of the text file inside ``data_dir``.

        Raises:
            ValueError: If ``batch_size`` or ``sequence_length`` is not
                positive, or if the file holds fewer than
                ``batch_size * sequence_length`` characters.
            OSError: If the file cannot be opened.
        """
        if batch_size <= 0 or sequence_length <= 0:
            raise ValueError("batch_size and sequence_length must be positive.")

        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.pointer = 0

        # Read the data
        file_path = os.path.join(data_dir, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            data = file.read()

        # Create a character dictionary
        self.chars = sorted(set(data))
        self.vocabulary_size = len(self.chars)
        self.char_to_idx = {ch: idx for idx, ch in enumerate(self.chars)}
        self.idx_to_char = dict(enumerate(self.chars))

        # Encode data
        encoded = np.array([self.char_to_idx[ch] for ch in data], dtype=np.int32)

        # Calculate number of batches (integer division: partial batches are dropped)
        chars_per_batch = self.batch_size * self.sequence_length
        self.num_batches = encoded.size // chars_per_batch
        if self.num_batches == 0:
            raise ValueError("Not enough data. Make the batch size and sequence length smaller.")

        # Keep only the characters that fill complete batches.
        usable = self.num_batches * chars_per_batch
        self.tensor = encoded[:usable]

        # Targets are the inputs shifted by one character.  When the file has
        # at least one character beyond the usable part, the final target is
        # that real next character; only when the data fits exactly do we wrap
        # around to the first character.
        if encoded.size > usable:
            self.target_tensor = encoded[1:usable + 1]
        else:
            self.target_tensor = np.roll(self.tensor, -1)

        # Split input and target tensor into batches: one row per sequence
        # stream, batches are consecutive column windows of those rows.
        self.input_batches = np.reshape(self.tensor, [self.batch_size, -1])
        self.target_batches = np.reshape(self.target_tensor, [self.batch_size, -1])

        print("Tensor size:", self.tensor.size)
        print("Batch size:", self.batch_size)
        print("Sequence length:", self.sequence_length)
        print("Number of batches:", self.num_batches)
        print("")

    def next_batch(self):
        """Return the next ``(inputs, targets)`` pair and advance the pointer.

        Both arrays have shape ``(batch_size, sequence_length)``.  After the
        last batch the pointer wraps back to the first one.
        """
        start = self.pointer * self.sequence_length
        end = start + self.sequence_length
        inputs = self.input_batches[:, start:end]
        targets = self.target_batches[:, start:end]
        self.pointer += 1
        if self.pointer >= self.num_batches:
            self.pointer = 0
        return inputs, targets

    def reset_batch_pointer(self):
        """Rewind so that the next ``next_batch`` call returns the first batch."""
        self.pointer = 0

class RNNModel(tf.keras.Model):
    """Character-level language model: embedding -> stacked LSTM -> dense logits.

    Args:
        vocabulary_size: Number of distinct characters (size of the logits).
        sequence_length: Training sequence length; stored on the instance.
        hidden_layer_size: Units of every LSTM cell and of the embedding.
        cells_size: Number of stacked LSTM cells.
        gradient_clip: Global-norm threshold used in ``train_step``.
    """

    def __init__(self, vocabulary_size, sequence_length, hidden_layer_size, cells_size, gradient_clip=5.):
        super().__init__()

        self.sequence_length = sequence_length
        self.batch_size = tf.Variable(0, dtype=tf.int32, trainable=False)

        # We stack multiple LSTM layers (if cells_size > 1)
        lstm_cells = [tf.keras.layers.LSTMCell(hidden_layer_size) for _ in range(cells_size)]
        stacked_cells = tf.keras.layers.StackedRNNCells(lstm_cells)
        self.rnn_layer = tf.keras.layers.RNN(stacked_cells, return_sequences=True, return_state=True)

        self.embedding_layer = tf.keras.layers.Embedding(vocabulary_size, hidden_layer_size)
        # Produces unnormalised logits; softmax is applied by the loss / sampler.
        self.softmax_layer = tf.keras.layers.Dense(vocabulary_size)

        self.gradient_clip = gradient_clip

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of character ids.

        Args:
            inputs: Character ids of shape ``(batch, time)``.  Any numeric
                dtype is accepted; the ids are cast to int32 before the
                embedding lookup.
            states: Optional RNN state from a previous call.  ``None`` lets the
                RNN layer start from its own zero state.
            return_state: When true, also return the final RNN state.
            training: Forwarded to the RNN layer.

        Returns:
            ``logits`` of shape ``(batch, time, vocabulary_size)``, or
            ``(logits, state)`` when ``return_state`` is true.
        """
        token_ids = tf.cast(inputs, tf.int32)
        embedded = self.embedding_layer(token_ids)

        # Passing initial_state=None makes the RNN layer create the zero state
        # itself, so the state never depends on the dtype of the raw inputs.
        outputs, *state = self.rnn_layer(embedded, initial_state=states, training=training)
        logits = self.softmax_layer(outputs)

        if return_state:
            return logits, state
        return logits

    def train_step(self, data):
        """Run one optimisation step with global-norm gradient clipping.

        Args:
            data: An ``(inputs, targets)`` pair.

        Returns:
            Dict mapping each metric name to its current value.
        """
        x, y = data
        with tf.GradientTape() as tape:
            logits = self(x, training=True)  # Forward pass
            loss = self.compiled_loss(y, logits, regularization_losses=self.losses)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Clip gradients and update
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.gradient_clip)
        self.optimizer.apply_gradients(zip(clipped_gradients, trainable_vars))

        # Update the metrics configured in 'compile'.
        self.compiled_metrics.update_state(y, logits)

        # Return a dict mapping metric names to current value.
        return {m.name: m.result() for m in self.metrics}

    def sample(self, char, char_to_idx, idx_to_char, length):
        """Generate ``length`` characters, seeded with ``char``.

        Each step feeds the previous character and the carried RNN state back
        into the model and draws the next character from the softmax of the
        logits.  The seed character itself is not part of the result.

        Args:
            char: Seed character; must be a key of ``char_to_idx``.
            char_to_idx: Mapping from character to integer id.
            idx_to_char: Mapping from integer id to character.
            length: Number of characters to generate.

        Returns:
            The generated text as a string of ``length`` characters.

        Raises:
            KeyError: If ``char`` is not in ``char_to_idx``.
        """
        state = None
        generated = []
        for _ in range(length):
            x = np.array([[char_to_idx[char]]], dtype=np.int32)
            logits, state = self(x, states=state, return_state=True, training=False)

            # Renormalise in float64: np.random.choice requires the
            # probabilities to sum to 1 within a tight tolerance.
            probs = tf.nn.softmax(logits[-1, -1]).numpy().astype(np.float64)
            probs /= probs.sum()

            idx = int(np.random.choice(len(idx_to_char), p=probs))
            char = idx_to_char[idx]
            generated.append(char)
        return "".join(generated)

# Settings for the model and dataset.
batch_size = 32
sequence_length = 25
hidden_layer_size = 1024
cells_size = 3
num_epochs = 75
# Print progress every `log_every` batches (and on the last batch of an epoch)
# instead of on every batch, which floods the notebook output.
log_every = 100

data_provider = DataProvider("/content", batch_size, sequence_length)

# Build the model.
model = RNNModel(data_provider.vocabulary_size, sequence_length, hidden_layer_size, cells_size)

# Compile the model.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model.
for epoch in range(num_epochs):
    data_provider.reset_batch_pointer()
    # train_step reports the running value of each metric; reset them so the
    # logged loss/accuracy describe the current epoch only rather than an
    # average over the whole run.
    model.reset_metrics()

    last_batch = data_provider.num_batches - 1
    for b in range(data_provider.num_batches):
        x_batch, y_batch = data_provider.next_batch()

        # Convert batches to tensors
        x_batch = tf.convert_to_tensor(x_batch, dtype=tf.float32)
        y_batch = tf.convert_to_tensor(y_batch, dtype=tf.int32)

        result = model.train_step((x_batch, y_batch))
        if b % log_every == 0 or b == last_batch:
            print(f'Epoch [{epoch}] Batch [{b}] Loss: {result["loss"]:.4f}, Accuracy: {result["accuracy"]:.4f}')

# Generate 300 characters starting from the seed character 'h'.
sampled_text = model.sample('h', data_provider.char_to_idx, data_provider.idx_to_char, 300)
print("Sampled text:", sampled_text)


Tensor size: 781600
Batch size: 32
Sequence length: 25
Number of batches: 977





[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
Epoch [47] Batch [746] Loss: 0.6952, Accuracy: 0.7719
Epoch [47] Batch [747] Loss: 0.6952, Accuracy: 0.7719
Epoch [47] Batch [748] Loss: 0.6952, Accuracy: 0.7719
Epoch [47] Batch [749] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [750] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [751] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [752] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [753] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [754] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [755] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [756] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [757] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [758] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [759] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [760] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [761] Loss: 0.6951, Accuracy: 0.7719
Epoch [47] Batch [762] Loss: 0.6951, Accuracy: 0.7719
Epo