In [2]:
from datasets import load_dataset
from itertools import islice

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Hugging Face dataset to sample from, and how many leading records to keep.
DATASET_NAME = "EleutherAI/the_pile_deduplicated"
NUM_SAMPLES_TO_LOAD = 100_000

# Open the train split in streaming mode.
print("Loading dataset stream...")
pile_stream = load_dataset(DATASET_NAME, split="train", streaming=True)

# Take only the first NUM_SAMPLES_TO_LOAD records and hold them in a list.
print(f"Loading first {NUM_SAMPLES_TO_LOAD:,} samples into memory...")
subset_iterable = islice(pile_stream, NUM_SAMPLES_TO_LOAD)
pile_subset_list = [sample for sample in subset_iterable]

print(f"Loaded {len(pile_subset_list):,} samples.")


Loading dataset stream...
Loading first 100,000 samples into memory...
Loaded 100,000 samples.


In [None]:
# Cell 2: Define standardize_text function
import re

def standardize_text(text):
    """Normalise raw text into lowercase, single-space-separated form.

    Non-string input yields "". Otherwise the text is lowercased, every
    '.', '?' or '!' is surrounded by spaces, any character other than a-z,
    0-9, whitespace, apostrophe or those three punctuation marks is removed,
    and whitespace runs are collapsed to one space with both ends trimmed.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Pad sentence punctuation so it later splits off as its own token.
        spaced = re.sub(r'([.?!])', r' \1 ', lowered)
        filtered = re.sub(r"[^a-z0-9\s'.?!]", '', spaced)
        return re.sub(r'\s+', ' ', filtered).strip()
    return ""

In [5]:

print(f"Standardizing {len(pile_subset_list):,} documents...")

# One output record per input record: cleaned 'text' plus the original 'meta'.
standardized_pile_subset = [
    {'text': standardize_text(record.get('text', '')), 'meta': record.get('meta')}
    for record in pile_subset_list
]

print(f"Standardization complete. Created list with {len(standardized_pile_subset)} documents.")


Standardizing 100,000 documents...
Standardization complete. Created list with 100000 documents.


In [6]:
# Cell 4: Define tokenize function

def tokenize(text):
    """Split ``text`` on whitespace into a list of tokens.

    Anything that is not a ``str`` produces an empty list.
    """
    return text.split() if isinstance(text, str) else []

In [7]:
print(f"Tokenizing {len(standardized_pile_subset):,} documents...")

# One output record per standardized record: its token list plus 'meta'.
tokenized_pile_subset = [
    {'tokens': tokenize(record.get('text', '')), 'meta': record.get('meta')}
    for record in standardized_pile_subset
]

print(f"Tokenization complete. Created list with {len(tokenized_pile_subset)} documents.")

Tokenizing 100,000 documents...
Tokenization complete. Created list with 100000 documents.


In [8]:
from collections import Counter
import itertools

# Number of most-frequent corpus tokens to keep. The vocabulary additionally
# holds the unknown-token entry, which is always assigned id 0.
VOCAB_SIZE = 30000
UNK_TOKEN = "<UNK>"

# Count every token across all documents; the chained generator feeds the
# Counter lazily, one token at a time.
all_tokens_iterator = itertools.chain.from_iterable(doc.get('tokens', []) for doc in tokenized_pile_subset)
word_counts = Counter(all_tokens_iterator)
most_common_tokens = word_counts.most_common(VOCAB_SIZE)

# Ids are handed out in descending frequency order, starting right after UNK.
word_to_id = {UNK_TOKEN: 0}
current_id = 1

for token, count in most_common_tokens:
    # Skip a corpus token that happens to equal UNK_TOKEN so id 0 stays reserved.
    if token not in word_to_id:
        word_to_id[token] = current_id
        current_id += 1


# Reverse lookup (id -> token). The loop variable is not called `id` so the
# builtin of that name is not shadowed.
id_to_word = {token_id: word for word, token_id in word_to_id.items()}

# Number of vocabulary entries, UNK included. This used to be bound to the
# dictionary itself rather than to its length.
actual_vocab_size = len(word_to_id)



In [9]:
print(f"Vectorizing {len(tokenized_pile_subset):,} documents...")

# Id substituted for every token that is absent from the vocabulary.
unknown_token_id = word_to_id[UNK_TOKEN]

# Replace each document's tokens by their ids; 'meta' is carried over as-is.
vectorized_data = [
    {
        'ids': [word_to_id.get(tok, unknown_token_id) for tok in record.get('tokens', [])],
        'meta': record.get('meta'),
    }
    for record in tokenized_pile_subset
]

print(f"Vectorization complete. Created list with {len(vectorized_data)} documents.")

# Show the first document before and after vectorization as a sanity check.
if len(vectorized_data) > 0:
    first_tokens = tokenized_pile_subset[0].get('tokens', [])
    first_ids = vectorized_data[0].get('ids', [])
    print("\n--- Original Tokens (First 50) ---")
    print(first_tokens[:50])
    print("\n--- Vectorized IDs (First 50) ---")
    print(first_ids[:50])

Vectorizing 100,000 documents...
Vectorization complete. Created list with 100000 documents.

--- Original Tokens (First 50) ---
['it', 'is', 'done', 'and', 'submitted', '.', 'you', 'can', 'play', 'survival', 'of', 'the', 'tastiest', 'on', 'android', 'and', 'on', 'the', 'web', '.', 'playing', 'on', 'the', 'web', 'works', 'but', 'you', 'have', 'to', 'simulate', 'multitouch', 'for', 'table', 'moving', 'and', 'that', 'can', 'be', 'a', 'bit', 'confusing', '.', 'theres', 'a', 'lot', 'id', 'like', 'to', 'talk', 'about']

--- Vectorized IDs (First 50) ---
[12, 8, 298, 4, 2825, 1, 15, 39, 357, 3777, 5, 2, 0, 14, 1580, 4, 14, 2, 813, 1, 889, 14, 2, 813, 655, 29, 15, 25, 3, 12353, 0, 10, 465, 1128, 4, 9, 39, 19, 6, 435, 7209, 1, 1039, 6, 233, 290, 68, 3, 626, 47]


In [10]:
# Cell 8: Save Vectorized Data to File
import json
import time
import os

# Destination for the vectorized documents, one JSON object per line.
# Keep the count in the file name in sync with the number of samples loaded.
output_filename_vec = "pile_subset_vectorized_100k.jsonl"
output_path_vec = os.path.join(".", output_filename_vec)

# Nothing to write if the vectorization cell has not run or produced no data.
if 'vectorized_data' not in locals() or not vectorized_data:
    print("\nError: 'vectorized_data' not found or empty. Cannot save.")
else:
    print(f"\nSaving {len(vectorized_data):,} vectorized documents to {output_path_vec}...")
    start_time = time.time()

    # Serialize every document onto its own line.
    with open(output_path_vec, 'w', encoding='utf-8') as f:
        for record in vectorized_data:
            f.write(f"{json.dumps(record)}\n")

    end_time = time.time()
    elapsed = end_time - start_time
    print(f"Vectorized data successfully saved. Time taken: {elapsed:.2f} seconds.")
    print(f"Data saved to: {output_path_vec}")


Saving 100,000 vectorized documents to .\pile_subset_vectorized_100k.jsonl...
Vectorized data successfully saved. Time taken: 5.57 seconds.
Data saved to: .\pile_subset_vectorized_100k.jsonl


In [4]:
import json
import os

# Number of context ids per sample, and how many (X, y) pairs are buffered
# in memory before being flushed to the output file.
window_size = 20
max_pair_per_batch = 100_000
OUTPUT_FILE = "pile_training_windows_100k.jsonl"

input_output_pairs = []

print("Loading vectorized data from 'pile_subset_vectorized_100k.jsonl'...")


with open("pile_subset_vectorized_100k.jsonl", "r", encoding="utf-8") as f, \
    open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:

    line_num = 0
    for line in f:
        line_num += 1
        token_ids = json.loads(line).get("ids", [])

        # A document yields one sample per position that has a full window
        # of context followed by a target id; shorter documents are skipped
        # entirely (including their progress message).
        n_windows = len(token_ids) - window_size
        if n_windows > 0:
            for start in range(n_windows):
                stop = start + window_size
                input_output_pairs.append({"X": token_ids[start:stop], "y": token_ids[stop]})

                # Flush the buffer once it reaches the configured size.
                if len(input_output_pairs) >= max_pair_per_batch:
                    out_f.writelines(json.dumps(pair) + "\n" for pair in input_output_pairs)
                    input_output_pairs = []

            if line_num % 500 == 0:
                print(f"Processed {line_num} documents...")

    # Write whatever is still buffered after the last document.
    out_f.writelines(json.dumps(pair) + "\n" for pair in input_output_pairs)

print(f"✅ Sliding window dataset saved as {OUTPUT_FILE}")

Loading vectorized data from 'pile_subset_vectorized_100k.jsonl'...
Processed 500 documents...
Processed 1000 documents...
Processed 1500 documents...
Processed 2000 documents...
Processed 2500 documents...
Processed 3000 documents...
Processed 3500 documents...
Processed 4000 documents...
Processed 4500 documents...
Processed 5000 documents...
Processed 5500 documents...
Processed 6000 documents...
Processed 6500 documents...
Processed 7000 documents...
Processed 7500 documents...
Processed 8000 documents...
Processed 8500 documents...
Processed 9000 documents...
Processed 9500 documents...
Processed 10000 documents...
Processed 10500 documents...
Processed 11000 documents...
Processed 11500 documents...
Processed 12000 documents...
Processed 12500 documents...
Processed 13000 documents...
Processed 13500 documents...
Processed 14000 documents...
Processed 14500 documents...
Processed 15000 documents...
Processed 15500 documents...
Processed 16000 documents...
Processed 16500 document

In [1]:
import json

# Path of the sliding-window JSONL file to inspect.
file_path = "pile_training_windows_100k.jsonl"

# Print up to the first three records. Iterating the file handle stops at
# end-of-file, whereas calling readline() a fixed number of times hands an
# empty string to json.loads (and raises) when the file has fewer lines.
with open(file_path, "r", encoding="utf-8") as f:
    for _, line in zip(range(3), f):
        print(json.loads(line))


{'X': [12, 8, 298, 4, 2825, 1, 15, 39, 357, 3777, 5, 2, 0, 14, 1580, 4, 14, 2, 813, 1], 'y': 889}
{'X': [8, 298, 4, 2825, 1, 15, 39, 357, 3777, 5, 2, 0, 14, 1580, 4, 14, 2, 813, 1, 889], 'y': 14}
{'X': [298, 4, 2825, 1, 15, 39, 357, 3777, 5, 2, 0, 14, 1580, 4, 14, 2, 813, 1, 889, 14], 'y': 2}


In [2]:
import torch

# Report the installed PyTorch build and which compute device is visible.
cuda_ok = torch.cuda.is_available()
if cuda_ok:
    device_name = torch.cuda.get_device_name(0)
else:
    device_name = "CPU only"

print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_ok)
print("Device name:", device_name)

PyTorch version: 2.5.1+cu121
CUDA available: True
Device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [4]:
import torch
import json
from torch.utils.data import Dataset, DataLoader

class SlidingWindowDataset(Dataset):
    """Map-style dataset over a JSONL file of ``{"X": [...], "y": int}`` records.

    The file is scanned once at construction time to record the byte offset
    of every line; individual samples are then read on demand by seeking to
    their offset, so the file contents are never held in memory.

    Args:
        file_path: path of the JSONL file, one sample per line.
    """

    def __init__(self, file_path):
        # Local import so the class does not rely on an import elsewhere.
        from array import array

        self.file_path = file_path

        # Byte offset at which each line starts. Binary mode is used so the
        # offsets are exact byte positions regardless of newline style.
        self._line_offsets = array("q")
        position = 0
        with open(file_path, "rb") as f:
            for raw_line in f:
                self._line_offsets.append(position)
                position += len(raw_line)

        self.num_samples = len(self._line_offsets)

    def __len__(self):
        """Return the number of lines (samples) in the file."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return ``(X, y)`` for sample ``idx`` as ``torch.long`` tensors.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        idx = int(idx)
        if idx < 0:
            idx += self.num_samples
        if not 0 <= idx < self.num_samples:
            raise IndexError(f"sample index {idx} out of range for {self.num_samples} samples")

        # Jump straight to the requested line instead of re-reading the
        # file from the top for every sample.
        with open(self.file_path, "rb") as f:
            f.seek(self._line_offsets[idx])
            item = json.loads(f.readline().decode("utf-8"))

        X = torch.tensor(item["X"], dtype=torch.long)
        y = torch.tensor(item["y"], dtype=torch.long)
        return X, y

    def get_item(self, idx):
        """Backward-compatible wrapper around ``__getitem__``."""
        return self[idx]
        
    

dataset = SlidingWindowDataset("data/pile_training_windows_100k.jsonl")

# 90% of the samples go to training; the remainder form the test set.
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so the same samples land in train/test on every run;
# an unseeded random_split changes the test set between runs.
SPLIT_SEED = 42
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

BATCH_SIZE = 32
# Only the training loader shuffles; the test loader keeps the split order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class LSTMModel(nn.Module):
    """Token embedding -> stacked LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: number of distinct token ids; sizes both the embedding
            table and the output layer.
        embed_dim: width of each token embedding.
        hidden_dim: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self,  vocab_size, embed_dim = 256, hidden_dim = 512, num_layers = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first = True)
        # Output head used by forward(); maps each hidden state to one
        # score per vocabulary entry.
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Compute vocabulary logits for every position of the input.

        Args:
            x: integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Float tensor of shape (batch, seq_len, vocab_size). For a single
            next-token target per window, take ``logits[:, -1, :]``.
        """
        x = self.embedding(x)
        out, _ = self.lstm(x)
        logits = self.fc(out)
        return logits
