<a href="https://colab.research.google.com/github/Shivagharehzad95/Train-GPT-/blob/main/DSR_43_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import glob
import random
import shutil
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras import models, layers
from tqdm import tqdm

In [2]:
# Where the text files are going to live.
dataset_path = "dataset"
dataset_path_all = os.path.join(dataset_path, "all")
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_valid = os.path.join(dataset_path, "valid")

# Just use 20 files.
file_number = 100

# Gather the corpus if it has not been gathered yet.
if not os.path.exists(dataset_path):

    # Create all the folders.
    for path in [dataset_path, dataset_path_all, dataset_path_train, dataset_path_valid]:
        if not os.path.exists(path):
            os.mkdir(path)

    # Clone the repo.
    !git clone https://github.com/vilmibm/lovecraftcorpus

    # Find all the files.
    paths_all = glob.glob("lovecraftcorpus/*.txt")
    print(sorted(paths_all))

    # Standardize.
    for path in paths_all:
        content = open(path).read()
        content = content.lower()
        for punctuation in ".,:;?!":
            content = content.replace(punctuation, " " + punctuation)
        open(path, "w").write(content)

    # Do not use all.
    paths_all = paths_all[:file_number]

    # Split 80/20.
    split_index = int(len(paths_all) * 0.8)
    paths_train = paths_all[:split_index]
    paths_valid = paths_all[split_index:]

    # Copy files.
    def copy(paths, destination):
        for path in paths:
            shutil.copy2(path, destination)
    copy(paths_all, dataset_path_all)
    copy(paths_train, dataset_path_train)
    copy(paths_valid, dataset_path_valid)

    # Delete repo.
    !rm -rf lovecraftcorpus

    # Done.
    print("Corpus downloaded.")

Cloning into 'lovecraftcorpus'...
remote: Enumerating objects: 74, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 74 (delta 0), reused 3 (delta 0), pack-reused 70 (from 1)[K
Receiving objects: 100% (74/74), 1.12 MiB | 2.82 MiB/s, done.
['lovecraftcorpus/alchemist.txt', 'lovecraftcorpus/arthur_jermyn.txt', 'lovecraftcorpus/azathoth.txt', 'lovecraftcorpus/beast.txt', 'lovecraftcorpus/beyond_wall_of_sleep.txt', 'lovecraftcorpus/book.txt', 'lovecraftcorpus/celephais.txt', 'lovecraftcorpus/charles_dexter_ward.txt', 'lovecraftcorpus/clergyman.txt', 'lovecraftcorpus/colour_out_of_space.txt', 'lovecraftcorpus/cool_air.txt', 'lovecraftcorpus/crawling_chaos.txt', 'lovecraftcorpus/cthulhu.txt', 'lovecraftcorpus/dagon.txt', 'lovecraftcorpus/descendent.txt', 'lovecraftcorpus/doorstep.txt', 'lovecraftcorpus/dreams_in_the_witch.txt', 'lovecraftcorpus/dunwich.txt', 'lovecraftcorpus/erich_zann.txt', 'lovecraftcorpus/ex_oblivione.

In [3]:
batch_size = 32 # !!! Not for training!
seed = 42 # Seed for the random number generator.

def create_dataset(dataset_path):
    dataset = preprocessing.text_dataset_from_directory(
        dataset_path,
        labels=None,
        batch_size=batch_size,
        seed=seed
    )
    return dataset

dataset_original_all   = create_dataset(dataset_path_all)
dataset_original_train = create_dataset(dataset_path_train)
dataset_original_valid = create_dataset(dataset_path_valid)

Found 67 files.
Found 53 files.
Found 14 files.


In [4]:
for short_story_list in dataset_original_all:
    for short_story in short_story_list:
        print(short_story)

tf.Tensor(b"the music of erich zann\n\ni have examined maps of the city with the greatest care , yet have never again found the rue d'auseil . these maps have not been modern maps alone , for i know that names change . i have , on the contrary , delved deeply into all the antiquities of the place , and have personally explored every region , of whatever name , which could possibly answer to the street i knew as the rue d'auseil . but despite all i have done , it remains an humiliating fact that i cannot find the house , the street , or even the locality , where , during the last months of my impoverished life as a student of metaphysics at the university , i heard the music of erich zann .\n\nthat my memory is broken , i do not wonder ; for my health , physical and mental , was gravely disturbed throughout the period of my residence in the rue d'auseil , and i recall that i took none of my few acquaintances there . but that i cannot find the place again is both singular and perplexing 

# Sliding window - Dataset for Autoregression

In [5]:
vocabulary_size = 10_000

encoder = layers.TextVectorization(
    max_tokens=vocabulary_size,
    standardize=None, # Already done!
    split="whitespace",
    output_mode="int" # Word indices. As usual. SOTA!
)
encoder.adapt(dataset_original_all)

vocabulary = encoder.get_vocabulary()
vocabulary[:10]

['',
 '[UNK]',
 np.str_('the'),
 np.str_(','),
 np.str_('and'),
 np.str_('of'),
 np.str_('.'),
 np.str_('to'),
 np.str_('a'),
 np.str_('in')]

In [6]:
sequence_length = 32
padding_token_id = 0

def create_dataset_for_autoregression(dataset, hop_length=1):
    x_inputs = []
    y_outputs = []

    for books in dataset:
        books = encoder(books).numpy()

        for book in tqdm(books):

            # Remove the padding token.
            book = [int(index) for index in list(book) if index != padding_token_id]

            for start_index in range(0, len(book) - sequence_length, hop_length):
                x = book[start_index:start_index + sequence_length]
                assert len(x) == sequence_length
                y = book[start_index + 1:start_index + sequence_length + 1] # Plot twist!
                assert len(y) == sequence_length
                x_inputs.append(x)
                y_outputs.append(y)

    return tf.data.Dataset.from_tensor_slices((x_inputs, y_outputs))

dataset_train = create_dataset_for_autoregression(dataset_original_train)
dataset_valid = create_dataset_for_autoregression(dataset_original_valid)

100%|██████████| 32/32 [00:01<00:00, 21.68it/s]
100%|██████████| 21/21 [00:00<00:00, 23.57it/s]
100%|██████████| 14/14 [00:00<00:00, 35.53it/s]


In [7]:
def decode(indices):
    return " ".join([str(vocabulary[index]) for index in indices])


for x, y in dataset_train.shuffle(100):
    print(decode(x))
    print(decode(y))
    break # TODO: Remove later.

what he did . that abominable society took charge at last , and we don't know where he is buried . there was no way the law or anything else could reach
he did . that abominable society took charge at last , and we don't know where he is buried . there was no way the law or anything else could reach the


In [8]:
import matplotlib.pyplot as plt

def render_history(history):
    plt.title("Training loss vs. validation loss")
    plt.plot(history.history["loss"], label="loss")
    plt.plot(history.history["val_loss"], label="val_loss")
    plt.legend()
    plt.show()
    plt.close()

    plt.title("Training accuracy vs. validation accuracy")
    plt.plot(history.history["accuracy"], label="accuracy")
    plt.plot(history.history["val_accuracy"], label="val_accuracy")
    plt.legend()
    plt.show()
    plt.close()

# Train GPT!


In [11]:

def create_transformer(sequence_length=32, embedding_size=128, layers_number=4, num_heads=4):

    # Start with input embeddings.
    inputs = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    embedding = layers.Embedding(vocabulary_size, embedding_size)(inputs)

    # Add positional encoding.
    positions = tf.range(start=0, limit=sequence_length, delta=1)
    positional_embedding = layers.Embedding(sequence_length, embedding_size)(positions)
    embedding += positional_embedding

    # Create decoders.
    for _ in range(layers_number):

        # Normalize at the beginning. Because 2018.
        x1 = layers.LayerNormalization(epsilon=1e-6)(embedding)

        # Do the attention.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embedding_size,
            dropout=0.1 # Optional.
        )(x1, x1, x1, use_causal_mask=True)

        # Skip connection.
        x2 = embedding + attention_output

        # Normalize,
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)

        # Feed Forward. MLP.
        x4 = layers.Dense(embedding_size * 2, activation="gelu")(x3)
        x4 = layers.Dense(embedding_size, activation="gelu")(x4)

        # Skip connection.
        embedding = x2 + x4

    # Head.
    outputs = layers.Dense(vocabulary_size, activation="softmax")(embedding)

    # Functional API!
    model = models.Model(
        inputs=inputs,
        outputs=outputs
    )
    return model

model = create_transformer()
model.summary()

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    dataset_train.shuffle(10_000).batch(1024),
    epochs=10,
    validation_data=dataset_valid.batch(1024)
)
render_history(history)

Epoch 1/10


ResourceExhaustedError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start

  File "/usr/local/lib/python3.12/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.12/asyncio/base_events.py", line 645, in run_forever

  File "/usr/lib/python3.12/asyncio/base_events.py", line 1999, in _run_once

  File "/usr/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 499, in process_one

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/tmp/ipython-input-1028589103.py", line 57, in <cell line: 0>

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 377, in fit

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 220, in function

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 133, in multi_step_on_iterator

Out of memory while trying to allocate 5259657216 bytes.
	 [[{{node StatefulPartitionedCall}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_multi_step_on_iterator_44873]