In [None]:
import tensorflow as tf
from transformers import GPT2Tokenizer
import requests

# Load the pre-trained GPT-2 tokenizer (only the tokenizer is loaded in this cell)
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# URL of the Shakespeare dataset
dataset_url = 'https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt'

# Download the dataset. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being silently tokenized as if it were the corpus.
response = requests.get(dataset_url, timeout=30)
response.raise_for_status()
data = response.text

# Keep only the first `max_sequence_length` characters of the corpus.
# NOTE: this slice counts characters, not tokens, so the number of tokens
# produced below is decided by the tokenizer, not by this constant.
max_sequence_length = 1024
truncated_text = data[:max_sequence_length]

# Print the first 100 characters of the truncated text
print("First 100 characters of the truncated text:", truncated_text[:100])

# Tokenize the truncated text into a flat 1-D array of token ids
tokenized_text = tokenizer.encode(truncated_text, return_tensors='tf').numpy().flatten()

# Print the first 10 tokens of the dataset
print("First 10 tokens of the dataset:", tokenized_text[:10])

# Create input-target pairs for next-token prediction:
# the target at position i is the token that follows input position i.
input_sequences = tokenized_text[:-1]
target_sequences = tokenized_text[1:]

# Single source of truth for the batch size. The dataset below was previously
# batched with a hard-coded 64 while this variable said 32; they now agree.
batch_size = 64

# Create a TensorFlow dataset of (input token, target token) pairs and group it
# into fixed-size batches, dropping the final partial batch.
train_dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)


First 100 characters of the truncated text: This is the 100th Etext file presented by Project Gutenberg, and
is presented in cooperation with Wo
First 10 tokens of the dataset: [1212  318  262 1802  400  412 5239 2393 5545  416]


In [None]:
import tensorflow as tf
from transformers import GPT2Tokenizer
import requests

# Load the pre-trained GPT-2 tokenizer (only the tokenizer is loaded in this cell)
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# URL of the Shakespeare dataset
dataset_url = 'https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt'

# Download the dataset. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from
# being silently tokenized as if it were the corpus.
response = requests.get(dataset_url, timeout=30)
response.raise_for_status()
data = response.text

# Keep only the first `max_sequence_length` characters of the corpus.
# NOTE: this slice counts characters, not tokens.
max_sequence_length = 1024
truncated_text = data[:max_sequence_length]

# Tokenize the truncated text into a flat 1-D array of token ids
tokenized_text = tokenizer.encode(truncated_text, return_tensors='tf').numpy().flatten()

# Create input-target pairs for next-token prediction:
# the target at position i is the token that follows input position i.
input_sequences = tokenized_text[:-1]
target_sequences = tokenized_text[1:]

# Determine batch size (adjust as needed)
batch_size = 64

# Number of tokens each of the `batch_size` rows will hold after the reshape
# below. (The name is historical: it becomes the per-row sequence length.)
num_batches = len(input_sequences) // batch_size
if num_batches == 0:
    # Without this guard the reshape would yield zero-length sequences and the
    # training loop would run on empty tensors instead of failing clearly.
    raise ValueError(
        f"Need at least {batch_size} tokens to fill one batch, got {len(input_sequences)}"
    )
num_full_batches = num_batches * batch_size

# Drop the trailing tokens that do not fit evenly into `batch_size` rows
input_sequences = input_sequences[:num_full_batches]
target_sequences = target_sequences[:num_full_batches]

# Reshape the flat token stream to (batch_size, num_batches, 1): each row is a
# contiguous chunk of `num_batches` tokens. The trailing axis of size 1 is kept
# because the training loop squeezes it off the targets when computing accuracy.
input_sequences = tf.reshape(input_sequences, [batch_size, -1, 1])
target_sequences = tf.reshape(target_sequences, [batch_size, -1, 1])

# Create a TensorFlow dataset with one element per row, then group the rows
# into batches of `batch_size` (exactly one full batch for this reshape).
train_dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Define the model: token embedding -> stateful LSTM -> per-position softmax
# over the vocabulary. `batch_input_shape` pins the batch dimension, which a
# stateful LSTM requires.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=50257, output_dim=768, batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(units=768, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(50257, activation='softmax')
])

# Compile the model. The custom loop below only uses the optimizer attached
# here (via `model.optimizer`); loss and accuracy are computed by hand.
model.compile(optimizer=tf.optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
num_epochs = 30
for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0
    batches = 0

    # The LSTM is stateful, so its hidden state survives between calls. Each
    # epoch restarts from the first batch, so state left over from the previous
    # pass is not valid preceding context and must be cleared.
    model.reset_states()

    for input_batch, target_batch in train_dataset:
        # Forward pass and loss for the batch, recorded for backpropagation
        with tf.GradientTape() as tape:
            predictions = model(input_batch, training=True)
            # The model already outputs probabilities (softmax), hence from_logits=False
            loss = tf.keras.losses.sparse_categorical_crossentropy(target_batch, predictions, from_logits=False)
            loss = tf.reduce_mean(loss)

        gradients = tape.gradient(loss, model.trainable_variables)
        model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Compute accuracy: fraction of positions where the arg-max prediction
        # equals the target (the targets' trailing size-1 axis is squeezed first)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.squeeze(target_batch, axis=-1), tf.argmax(predictions, axis=-1, output_type=tf.int32)), tf.float32))

        total_loss += loss.numpy()
        total_accuracy += accuracy.numpy()
        batches += 1

    if batches > 0:
        average_loss = total_loss / batches
        average_accuracy = total_accuracy / batches

        print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {average_loss:.4f} - Accuracy: {average_accuracy:.4f}")
    else:
        # Reached when the dataset yields no full batch (e.g. too little data)
        print(f"Epoch {epoch + 1}/{num_epochs} - No batches processed")

Epoch 1/30 - Loss: 10.8249 - Accuracy: 0.0000
Epoch 2/30 - Loss: 10.8177 - Accuracy: 0.3906
Epoch 3/30 - Loss: 10.8082 - Accuracy: 0.4531
Epoch 4/30 - Loss: 10.7967 - Accuracy: 0.4609
Epoch 5/30 - Loss: 10.7821 - Accuracy: 0.4414
Epoch 6/30 - Loss: 10.7616 - Accuracy: 0.4180
Epoch 7/30 - Loss: 10.7278 - Accuracy: 0.3359
Epoch 8/30 - Loss: 10.6585 - Accuracy: 0.1250
Epoch 9/30 - Loss: 10.4725 - Accuracy: 0.0742
Epoch 10/30 - Loss: 9.9274 - Accuracy: 0.0703
Epoch 11/30 - Loss: 8.8152 - Accuracy: 0.0703
Epoch 12/30 - Loss: 7.5854 - Accuracy: 0.0703
Epoch 13/30 - Loss: 6.5770 - Accuracy: 0.0703
Epoch 14/30 - Loss: 5.8944 - Accuracy: 0.0703
Epoch 15/30 - Loss: 5.4270 - Accuracy: 0.0703
Epoch 16/30 - Loss: 5.1618 - Accuracy: 0.0703
Epoch 17/30 - Loss: 5.0658 - Accuracy: 0.0703
Epoch 18/30 - Loss: 5.0721 - Accuracy: 0.0195
Epoch 19/30 - Loss: 5.1150 - Accuracy: 0.0195
Epoch 20/30 - Loss: 5.1566 - Accuracy: 0.0156
Epoch 21/30 - Loss: 5.1723 - Accuracy: 0.0156
Epoch 22/30 - Loss: 5.1586 - Accur

In [None]:
import os

# Persist the trained weights to disk (weights only, not the full model)
save_dir = "./shakespeare_finetuned"
# Make sure the target directory is there; a pre-existing one is fine
os.makedirs(save_dir, exist_ok=True)
weights_path = os.path.join(save_dir, "model_weights.h5")
model.save_weights(weights_path)