In [1]:
import requests

# Download the Tiny Shakespeare corpus from the karpathy/char-rnn GitHub
# repository. NOTE: despite the "sonnets" file names used throughout this
# notebook, this is not a Project Gutenberg sonnets file; the text begins with
# "First Citizen:" dialogue (see the tokenizer output further down).
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of saving an error page as the corpus.
response.raise_for_status()
text = response.text

# Save the raw text to a file
with open("/home/itachi/Mini-GPT/data/raw/shakespeare_sonnets.txt", "w", encoding="utf-8") as f:
    f.write(text)

In [2]:
import re

def clean_text(text):
    """Strip non-ASCII characters and wrap the text in start/end markers.

    Args:
        text: The raw corpus as a single string.

    Returns:
        The input with every character outside the 0x00-0x7f range removed,
        prefixed with "<start> " and suffixed with " <end>".
    """
    # Anything outside the 7-bit ASCII range is dropped, not transliterated.
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', text)
    # The markers are ordinary text separated from the body by one space.
    return " ".join(("<start>", ascii_only, "<end>"))

# Read the raw download back from disk and run it through clean_text
with open("/home/itachi/Mini-GPT/data/raw/shakespeare_sonnets.txt", "r", encoding="utf-8") as raw_file:
    raw_text = raw_file.read()

cleaned_text = clean_text(raw_text)

# Persist the cleaned text so later stages can start from this file
with open("/home/itachi/Mini-GPT/data/processed/cleaned_sonnets.txt", "w", encoding="utf-8") as cleaned_file:
    cleaned_file.write(cleaned_text)

In [3]:
# Build the character-level vocabulary: every distinct character of the
# cleaned text, sorted so that index assignment is deterministic.
vocab = sorted(set(cleaned_text))
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}, {vocab}")

# Two lookup tables over the same ordering: index -> character, and its inverse.
idx_to_char = dict(enumerate(vocab))
char_to_idx = {ch: i for i, ch in idx_to_char.items()}

# Tokenize the text
def tokenize(text):
    """Convert a string into a list of integer ids, one per character.

    Each character is looked up in the module-level `char_to_idx` mapping.

    Args:
        text: The string to encode.

    Returns:
        A list of ids with the same length as `text`.

    Raises:
        KeyError: If `text` contains a character missing from `char_to_idx`.
    """
    return list(map(char_to_idx.__getitem__, text))

import json

# Encode the whole cleaned corpus and show a short prefix next to its source text
tokenized_text = tokenize(cleaned_text)
print(f"Tokenized text: {tokenized_text[:50]}, cleaned_text: {cleaned_text[:50]}")

# Save tokenized text and vocabulary.
# JSON object keys are always strings, so the integer keys of idx_to_char are
# written as strings and have to be converted back to int when reloaded.
json_outputs = (
    ("/home/itachi/Mini-GPT/data/processed/tokenized_sonnets.json", tokenized_text),
    ("/home/itachi/Mini-GPT/data/processed/vocab.json", {"char_to_idx": char_to_idx, "idx_to_char": idx_to_char}),
)
for out_path, payload in json_outputs:
    with open(out_path, "w") as out_file:
        json.dump(payload, out_file)

Vocabulary size: 67, ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Tokenized text: [12, 59, 60, 41, 58, 60, 13, 1, 20, 49, 58, 59, 60, 1, 17, 49, 60, 49, 66, 45, 54, 10, 0, 16, 45, 46, 55, 58, 45, 1, 63, 45, 1, 56, 58, 55, 43, 45, 45, 44, 1, 41, 54, 65, 1, 46, 61, 58, 60, 48], cleaned_text: <start> First Citizen:
Before we proceed any furth


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the token stream for validation. shuffle=False keeps the
# tokens in their original order, so each split is a contiguous stretch of text.
train_data, val_data = train_test_split(tokenized_text, shuffle=False, test_size=0.2)

# Write each split to its own JSON file
split_outputs = (
    ("/home/itachi/Mini-GPT/data/processed/train_data.json", train_data),
    ("/home/itachi/Mini-GPT/data/processed/val_data.json", val_data),
)
for split_path, split_tokens in split_outputs:
    with open(split_path, "w") as split_file:
        json.dump(split_tokens, split_file)

In [5]:
def create_sequences(tokenized_text, seq_length):
    """Slice a token sequence into overlapping next-token training pairs.

    A window of `seq_length` tokens is taken at every start offset (stride 1).
    The target for each window is the same window shifted right by one token.

    Args:
        tokenized_text: Sliceable sequence of token ids.
        seq_length: Number of tokens per window.

    Returns:
        A tuple `(inputs, targets)` of two equally long lists of windows.
        Both are empty when the sequence has no more than `seq_length` tokens.
    """
    # One start offset per window; the last window's target ends on the final token.
    starts = range(len(tokenized_text) - seq_length)
    inputs = [tokenized_text[s:s + seq_length] for s in starts]
    targets = [tokenized_text[s + 1:s + seq_length + 1] for s in starts]
    return inputs, targets

import torch

# Define sequence length
seq_length = 64  # Adjust based on your dataset and memory constraints

# Create sequences for training and validation
train_inputs, train_targets = create_sequences(train_data, seq_length)
val_inputs, val_targets = create_sequences(val_data, seq_length)
print(f"Number of training sequences: {len(train_inputs)}")
print(f"Number of validation sequences: {len(val_inputs)}")
print(f"Example training input: {train_inputs[0]}")
print(f"Example training target: {train_targets[0]}")

# Pack the windows as int64 tensors for saving.
# These dicts get their own names on purpose: rebinding train_data / val_data
# (the token lists from the split cell) to dicts would make this cell fail when
# re-run, because create_sequences would then receive a 2-key dict and produce
# no windows at all.
train_sequences = {
    "inputs": torch.tensor(train_inputs, dtype=torch.long),
    "targets": torch.tensor(train_targets, dtype=torch.long)
}
val_sequences = {
    "inputs": torch.tensor(val_inputs, dtype=torch.long),
    "targets": torch.tensor(val_targets, dtype=torch.long)
}

# Save sequences
torch.save(train_sequences, "/home/itachi/Mini-GPT/data/processed/train_sequences.pt")
torch.save(val_sequences, "/home/itachi/Mini-GPT/data/processed/val_sequences.pt")

Number of training sequences: 892262
Number of validation sequences: 223018
Example training input: [12, 59, 60, 41, 58, 60, 13, 1, 20, 49, 58, 59, 60, 1, 17, 49, 60, 49, 66, 45, 54, 10, 0, 16, 45, 46, 55, 58, 45, 1, 63, 45, 1, 56, 58, 55, 43, 45, 45, 44, 1, 41, 54, 65, 1, 46, 61, 58, 60, 48, 45, 58, 6, 1, 48, 45, 41, 58, 1, 53, 45, 1, 59, 56]
Example training target: [59, 60, 41, 58, 60, 13, 1, 20, 49, 58, 59, 60, 1, 17, 49, 60, 49, 66, 45, 54, 10, 0, 16, 45, 46, 55, 58, 45, 1, 63, 45, 1, 56, 58, 55, 43, 45, 45, 44, 1, 41, 54, 65, 1, 46, 61, 58, 60, 48, 45, 58, 6, 1, 48, 45, 41, 58, 1, 53, 45, 1, 59, 56, 45]


In [6]:
# Reload the token ids from the JSON file written earlier
with open("/home/itachi/Mini-GPT/data/processed/tokenized_sonnets.json", "r") as tokens_file:
    tokenized_text = json.load(tokens_file)

# Reload both lookup tables
with open("/home/itachi/Mini-GPT/data/processed/vocab.json", "r") as vocab_file:
    vocab = json.load(vocab_file)
char_to_idx = vocab["char_to_idx"]
# JSON turned the integer keys into strings; convert them back to int
idx_to_char = {int(key): char for key, char in vocab["idx_to_char"].items()}

# Round-trip check: decode the first 50 token ids back into text
sample_tokens = tokenized_text[:50]
sample_text = "".join(idx_to_char[token] for token in sample_tokens)
print("Sample text:", sample_text)

Sample text: <start> First Citizen:
Before we proceed any furth


: 