In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset, Split
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [2]:
current_path = os.path.abspath('.')
project_name = 'TinyStoriesProject'
project_path = os.path.join(current_path.split(project_name)[0], project_name)
print(project_path)

C:\Users\peter\Documents\SJSU\DeepLearning\TinyStoriesProject


In [3]:
#Load Small Sample for testing
df = pd.read_json("./TinyStories_all_data/data00.json")

In [4]:
vocab_size = 8000

# Tokenizer

In [5]:
texts = df['story'].dropna().tolist()
# Keep only non-empty, non-whitespace-only strings
texts = [text for text in texts if len(text.strip().split()) > 0]

In [19]:
df['story'].str.len().mean()

774.86885

In [6]:
from tokenizers import BertWordPieceTokenizer

# Initialize
custom_tokenizer = BertWordPieceTokenizer()

# Train
custom_tokenizer.train_from_iterator(
    texts,
    vocab_size=vocab_size,  # You can go lower for a small model
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)

In [7]:
from transformers import PreTrainedTokenizerFast

# Wrap Tokenizer
custom_tokenizerfast = PreTrainedTokenizerFast(tokenizer_object=custom_tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>"
)

In [6]:
import random

def mask_random_word(sentence):
    words = sentence.split()
    random_idx = random.randint(0, len(words) - 1)
    
    words[random_idx] = '[MASK]'
    return ' '.join(words)

masked_sentences = [mask_random_word(sentence) for sentence in texts]

In [None]:
# Tokenize your masked dataset
#encodings = custom_tokenizerfast(masked_sentences, padding=True)
encodings = custom_tokenizerfast(masked_sentences, padding=True, max_length=512)

# Model

In [47]:
class SimpleMLM(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_layers, num_heads, max_len):
        super(SimpleMLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.position_embeddings = nn.Embedding(2048, hidden_dim)  # generous upper bound
        #max_len = custom_tokenizerfast.model_max_length  # Often 512 for many models
        #self.position_embeddings = nn.Embedding(max_len, hidden_dim)
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.lm_head = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        seq_length = input_ids.size(1)
        positions = torch.arange(0, seq_length, device=input_ids.device).unsqueeze(0)
        x = self.embedding(input_ids) + self.position_embeddings(positions)
        
        x = self.transformer_encoder(x)  # shape: [batch_size, seq_len, hidden_dim]
        logits = self.lm_head(x)
        return logits

In [9]:
def mask_tokens(inputs, tokenizer, mask_prob=0.15):
    labels = inputs.clone()
    mask_token_id = tokenizer.convert_tokens_to_ids("<mask>")
    probability_matrix = torch.full(labels.shape, mask_prob)
    special_tokens_mask = torch.zeros_like(inputs).bool()
    
    masked_indices = torch.bernoulli(probability_matrix).bool() & ~special_tokens_mask
    labels[~masked_indices] = -100  # Only compute loss on masked tokens

    # Replace 80% with [MASK], 10% with random token, 10% unchanged
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = mask_token_id

    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    return inputs, labels

In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# Sample toy data
# Assume tokenizer already gives input_ids
tokenized = custom_tokenizerfast(texts, 
                       padding=True, max_length=512)
input_ids = torch.tensor(tokenized["input_ids"])

In [20]:
input_ids, labels = mask_tokens(input_ids, custom_tokenizerfast)

In [23]:
dataset = TensorDataset(input_ids, labels)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [48]:
# Instantiate model
model = SimpleMLM(vocab_size=len(custom_tokenizerfast), hidden_dim=128, num_layers=2, num_heads=4, max_len=20)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

In [34]:
print("Input shape:", input_ids.shape)
print("Max position index:", input_ids.shape[1] - 1)
print("Position embedding size:", self.position_embeddings.num_embeddings)

Input shape: torch.Size([2, 1053])
Max position index: 1052


NameError: name 'self' is not defined

In [55]:
from tqdm import tqdm  # Progress bar

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc=f"Epoch {epoch + 1}/{5}"):
        input_ids, labels = batch
        logits = model(input_ids)
        loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch + 1} average loss: {avg_loss:.4f}, Loss: {loss.item():.4f}")

Epoch 1/5:   0%|                                                                 | 35/49999 [00:28<11:23:12,  1.22it/s]


KeyboardInterrupt: 