In [None]:
import sys
# Show which Python interpreter is executing this kernel, so it is obvious
# which environment the imports further down are resolved against.
print(sys.executable)

In [None]:
import sys
import os
import json

# Resolve the parent of the current working directory. This equals the parent
# of the notebook's folder only when the kernel was started from that folder.
base_folder = os.path.abspath("..")
print(f"Your base folder is: {base_folder}")

# Make the project-local modules importable. The membership check keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
if base_folder not in sys.path:
    sys.path.append(base_folder)

In [None]:
import torch
# Pick the compute device once, in priority order: Apple Silicon GPU (MPS),
# then CUDA, then CPU as the fallback. This is the same outcome the previous
# chain of overwriting assignments produced.
if torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Silicon GPU
elif torch.cuda.is_available():
    device = torch.device("cuda")  # CUDA-enabled devices
else:
    device = torch.device("cpu")  # If everything else fails

print(f"Using device: {device}")

import multiprocessing
from tqdm import tqdm
import numpy as np

from data import get_wikitext_data, get_fineweb_data, save_data, load_data, clean_textdata
from tokenization import get_tiktoken_tokenizer
import matplotlib.pyplot as plt

In [None]:
# Target location for a saved copy of the dataset. In the visible cells it is
# only referenced by the disabled save_data call below.
DATA_PATH = f"{base_folder}/data/tiktoken_tokenized_wikitext"

# Worker count handed to the dataset map() calls as num_proc.
num_cores = multiprocessing.cpu_count()

# Project helpers; the tokenizer is built before the dataset is loaded.
tokenizer = get_tiktoken_tokenizer()
dataset = get_wikitext_data()
# save_data(dataset, DATA_PATH)

# Plain-text summaries of both objects, one per line.
print(tokenizer, dataset, sep="\n")

In [None]:
def clean_batch(examples):
    """Apply ``clean_textdata`` to every entry of a batch's "text" column.

    Args:
        examples: Batch mapping with a "text" key holding an iterable of
            strings (the form a batched ``map`` call passes in).

    Returns:
        dict: ``{"text": [...]}`` with one cleaned string per input string,
        in the same order.
    """
    return {"text": list(map(clean_textdata, examples["text"]))}

# Clean the entire dataset
cleaned_dataset = dataset.map(
    clean_batch,
    batched=True,
    batch_size=1000,
    num_proc=num_cores,
    desc="Cleaning text"
)

# Before/after preview of the first training example (first 100 characters).
# Only the last expression of a cell is displayed, so the bare expressions
# used previously showed nothing; print them explicitly. repr() keeps empty
# strings and whitespace visible.
print(repr(dataset["train"][0]["text"][:100]))
print(repr(cleaned_dataset["train"][0]["text"][:100]))

print(cleaned_dataset)


In [None]:
# Peek at the first few cleaned test examples instead of printing every text
# in the split, which floods the cell output.
print(cleaned_dataset["test"][:5]["text"])
import numpy as np

def analyze_lengths(examples, tokenizer):
    """Count the tokens each text in a batch encodes to.

    Args:
        examples: Batch mapping with a "text" key holding an iterable of
            strings.
        tokenizer: Object exposing ``encode(text)`` that returns a sized
            sequence of tokens.

    Returns:
        dict: ``{"lengths": [...]}`` with one token count per input text,
        in input order.
    """
    encoded_batch = []
    for text in examples["text"]:
        encoded_batch.append(tokenizer.encode(text))
    return {"lengths": list(map(len, encoded_batch))}

# Add a "lengths" column holding the token count of every text.
length_dataset = cleaned_dataset.map(
    analyze_lengths,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    batch_size=1000,
    num_proc=num_cores,
    desc="Analyzing sequence lengths"
)

# Read the column once. "zeros" keeps every row, "non_zeros" drops rows whose
# text encoded to zero tokens.
max_lengths_zeros = list(length_dataset["train"]["lengths"])
max_lengths_non_zeros = [item for item in max_lengths_zeros if item != 0]

lengths_array = np.array(max_lengths_non_zeros)
lengths_array_zeros = np.array(max_lengths_zeros)

# Summary statistics over the non-empty sequences only.
print(f"Min length: {np.min(lengths_array)}")
print(f"Max length: {np.max(lengths_array)}")
print(f"Average length: {np.mean(lengths_array):.2f}")
print(f"Median length: {np.median(lengths_array):.2f}")
# The label is derived from the percentile actually computed, so the two
# cannot drift apart (the last line used to say "99th" while computing the
# 99.9995th percentile).
for percentile in (90, 95, 99.9995):
    print(f"{percentile:g}th percentile: {np.percentile(lengths_array, percentile):.0f}")
print(f"Number of sequences: {len(lengths_array)}")

# Passed to hist() as bins=, i.e. the number of bins per histogram.
bin_size = 100
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

# One panel per variant of the length data: left without the zero-length
# rows, right with them.
histogram_panels = (
    (lengths_array, 'Distribution of Sequence Lengths Without Zeros'),
    (lengths_array_zeros, 'Distribution of Sequence Lengths With Zeros'),
)
for panel_ax, (panel_lengths, panel_title) in zip(axs, histogram_panels):
    panel_ax.hist(panel_lengths, bins=bin_size)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Length')
    panel_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
def tokenize_batch(examples, tokenizer, max_length=1024, pad_token_id=0):
    """Encode a batch of texts into fixed-length token id lists.

    Each text is encoded with ``tokenizer.encode``, truncated to at most
    ``max_length`` ids, and right-padded with ``pad_token_id`` so that every
    row has the same length.

    Args:
        examples: Batch mapping with a "text" key holding an iterable of
            strings.
        tokenizer: Object exposing ``encode(text)`` that returns an iterable
            of token ids.
        max_length: Length of every returned row.
        pad_token_id: Id appended to rows shorter than ``max_length``.
            Defaults to 0, the value that was previously hard-coded.
            NOTE(review): 0 may also be a real vocabulary id for this
            tokenizer, in which case padding is indistinguishable from that
            token -- confirm, and pass a dedicated id if so.

    Returns:
        dict: ``{"tokens": [...]}`` with one list of ids per input text, in
        input order.
    """
    padded_tokens = []
    for text in examples["text"]:
        # list() copies the encoder output and also accepts non-list results
        # such as tuples.
        seq = list(tokenizer.encode(text))[:max_length]
        # A non-positive repeat count yields an empty list, so rows that are
        # already max_length long are left unchanged.
        seq.extend([pad_token_id] * (max_length - len(seq)))
        padded_tokens.append(seq)

    return {"tokens": padded_tokens}

# Replace the "text" column of every split with fixed-length "tokens" rows.
tokenize_map_options = dict(
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    batch_size=1000,
    num_proc=num_cores,
    remove_columns=["text"],
    desc="Tokenizing",
)
tokenized_dataset = cleaned_dataset.map(tokenize_batch, **tokenize_map_options)

# Keep only rows that contain at least one non-zero id. Rows made up entirely
# of 0 (the padding value used by tokenize_batch's defaults) are dropped.
tokenized_dataset_text = tokenized_dataset.filter(
    lambda row: not all(token == 0 for token in row["tokens"])
)


In [None]:
print(tokenized_dataset_text)

# Materialize the train split's token rows as a single int64 NumPy array and
# wrap that array as a torch tensor.
train_split = tokenized_dataset_text["train"]
tokens_array = np.array(train_split["tokens"], dtype=np.int64)
data = torch.from_numpy(tokens_array)

# Sanity check: tensor shape and dtype, then the leading rows.
print(data.shape, data.dtype)
print(data[:100])

In [None]:
# --- Model shape (per the original notes: GPT-2 "small" sizing) ---
n_layer = 12       # transformer blocks
n_head = 12        # attention heads
n_embd = 768       # embedding width (notes list 1024 / 1280 / 1600 for medium / large / XL)
block_size = 1024  # context length in tokens; same value as tokenize_batch's default max_length
dropout = 0.1      # dropout rate

# --- Optimization / schedule ---
batch_size = 64        # could be adjusted based on hardware
learning_rate = 5e-5
max_iters = 50_000     # total training iterations
eval_interval = 1_000  # presumably iterations between evaluations -- confirm against the training loop
eval_iters = 500       # presumably batches averaged per evaluation -- confirm against the training loop
