In [1]:
import re


In [2]:
text = """Taehyung is trying to divide 21/11 liters (L) of water into 7/11 liters (L) per cup, and Hoseok is trying to divide 8/17 liters (L) of water into 2/17 liters (L) per cup. How many cups will Taehyung and Hoseok need in order to divide all the water they each have into the cups?"""

In [3]:
# Split on every non-letter character (the capture group keeps the separator in
# the result), then drop empty strings and whitespace so that only words, single
# digits and punctuation marks remain.
# The pattern is a raw string so that `\s` reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape sequence.
tokens = [i for i in re.split(r'(\s|[^a-zA-Z])', text) if i and not i.isspace()]
print(tokens)

['Taehyung', 'is', 'trying', 'to', 'divide', '2', '1', '/', '1', '1', 'liters', '(', 'L', ')', 'of', 'water', 'into', '7', '/', '1', '1', 'liters', '(', 'L', ')', 'per', 'cup', ',', 'and', 'Hoseok', 'is', 'trying', 'to', 'divide', '8', '/', '1', '7', 'liters', '(', 'L', ')', 'of', 'water', 'into', '2', '/', '1', '7', 'liters', '(', 'L', ')', 'per', 'cup', '.', 'How', 'many', 'cups', 'will', 'Taehyung', 'and', 'Hoseok', 'need', 'in', 'order', 'to', 'divide', 'all', 'the', 'water', 'they', 'each', 'have', 'into', 'the', 'cups', '?']


In [4]:
!pip install datasets





In [5]:
from datasets import load_dataset

# Load the Orca math word-problem dataset by its dataset identifier.
ds = load_dataset("microsoft/orca-math-word-problems-200k")

In [6]:
# Write the train split to a local JSON file; a later cell reloads it through the 'json' loader.
ds['train'].to_json("dataset_question.json")

Creating json from Arrow format:   0%|          | 0/201 [00:00<?, ?ba/s]

236118589

In [7]:
# Show the available splits together with their column names and row counts.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 200035
    })
})


In [8]:
from datasets import load_dataset

# Rebind `ds` to a dataset whose 'train' split is read back from the local JSON dump.
ds = load_dataset('json', data_files={'train': 'dataset_question.json'})

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
import re

def tokenizer(text):
    """Split ``text`` into word, digit and punctuation tokens.

    The string is split on every character that is not an ASCII letter; the
    capture group keeps those separator characters in the result. Runs of
    letters therefore stay together, while every other character (each digit,
    each punctuation mark) becomes a token of its own. Empty strings and
    whitespace are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty, non-whitespace token strings in their original order.
    """
    pieces = re.split(r'(\s|[^a-zA-Z])', text)
    # strip() leaves an empty (falsy) string for both '' and whitespace-only pieces.
    return [piece for piece in pieces if piece.strip()]

def build_vocab(dataset):
    """Build token <-> index lookup tables from the training questions.

    Args:
        dataset: Object for which ``dataset['train']['question']`` is an
            iterable of question strings. Falsy entries (empty strings,
            ``None``) are skipped.

    Returns:
        A ``(token_to_idx, idx_to_token)`` pair of dicts. Indices follow the
        sorted order of the vocabulary, which is made of the lower-cased
        question tokens plus the sentinel token ``'EOQ'``.
    """
    # Seed the vocabulary with the sentinel so it is always present, even when
    # no question contributes any token.
    vocab = {'EOQ'}
    for string in dataset['train']['question']:
        if not string:
            continue
        vocab.update(tokenizer(string.lower()))

    # Sorting makes the token -> index assignment deterministic across runs.
    token_to_idx = {token: idx for idx, token in enumerate(sorted(vocab))}
    idx_to_token = {idx: token for token, idx in token_to_idx.items()}
    return token_to_idx, idx_to_token


# Build the lookup tables once from the loaded dataset and report how many
# distinct tokens they contain.
token_to_idx, idx_to_token = build_vocab(ds)
vocab_size = len(token_to_idx)
print("Vocabulary size:", vocab_size)

Vocabulary size: 21644


In [10]:
# Spot check: look up the index of one lower-cased token (KeyError if it is not in the vocabulary).
print(token_to_idx['jungkook'])

13659


In [11]:
class TokenizedDataset(Dataset):
    """Next-token prediction samples built from a collection of texts.

    Every non-empty text is stripped, lower-cased, tokenized and mapped to
    vocabulary indices (tokens missing from the vocabulary map to the index of
    ``'EOQ'``). The index sequence is padded with ``context_size`` ``'EOQ'``
    indices on the left and one on the right, and every window of
    ``context_size`` consecutive indices is stored together with the index
    that follows it.

    Args:
        text_data: Sized iterable of strings; falsy entries are skipped.
        token_to_idx: Mapping from token string to integer index. Must contain
            the key ``'EOQ'``.
        context_size: Number of indices in each context window.

    Raises:
        KeyError: If ``token_to_idx`` has no ``'EOQ'`` entry.
    """

    def __init__(self, text_data, token_to_idx, context_size=5):
        self.samples = []
        self.context_size = context_size
        self.token_to_idx = token_to_idx

        # Look the sentinel index up once instead of once per token and per text.
        eoq_idx = token_to_idx['EOQ']
        left_padding = [eoq_idx] * context_size

        counter = 0

        for text in text_data:
            if not text:
                continue

            counter += 1
            if counter % 10000 == 0:
                print(f'{counter*100/len(text_data)}% processed')

            tokens = tokenizer(text.strip().lower())
            indexed_tokens = (
                left_padding
                + [token_to_idx.get(token, eoq_idx) for token in tokens]
                + [eoq_idx]
            )

            # Slide a window over the padded sequence: the first window is all
            # padding (target = first real token) and the last target is the
            # trailing 'EOQ' index.
            for start in range(len(indexed_tokens) - context_size):
                context = indexed_tokens[start: start + context_size]
                target = indexed_tokens[start + context_size]
                self.samples.append((context, target))

    def __len__(self):
        """Return the number of stored (context, target) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(context_tensor, target_tensor)`` pair."""
        context, target = self.samples[idx]
        return torch.tensor(context), torch.tensor(target)

In [12]:
# Hyperparameters shared by the dataset construction and the model.
context_size = 5  # number of preceding tokens in each context window
emb_dim = 64  # embedding size passed to NextWord
activation = 'relu'  # activation name passed to NextWord, which handles 'sine' and 'relu'

In [13]:
# Echo the context window length currently in effect.
context_size

5

In [14]:
# Turn every training question into (context, target) samples and batch them.
train_texts = ds['train']['question']
tokenized_dataset = TokenizedDataset(train_texts, token_to_idx, context_size=context_size)
# Samples are shuffled and served in batches of 4096.
train_loader = DataLoader(tokenized_dataset, batch_size=4096, shuffle=True)

4.999125153098208% processed
9.998250306196416% processed
14.997375459294624% processed
19.996500612392833% processed
24.995625765491038% processed
29.994750918589247% processed
34.99387607168745% processed
39.993001224785665% processed
44.99212637788387% processed
49.991251530982076% processed
54.99037668408029% processed
59.989501837178494% processed
64.9886269902767% processed
69.9877521433749% processed
74.98687729647311% processed
79.98600244957133% processed
84.98512760266954% processed
89.98425275576774% processed
94.98337790886595% processed
99.98250306196415% processed


In [15]:
# Total number of (context, target) samples produced from all questions.
len(tokenized_dataset)

11892783

In [16]:
# Peek at a single batch to sanity-check the loader. `context` and `target`
# remain bound after the loop and are inspected again in a later cell.
for context, target in train_loader:
    print("Context:", context)
    print("Target:", target)
    break  # one batch is enough for inspection

Context: tensor([[20342, 11552,  3788, 18250,  8917],
        [18364,    25, 11105, 16171,  7070],
        [ 2175, 11726,  8917,  7070,    25],
        ...,
        [20342, 15964,  8833, 17158,  8072],
        [20342,  9047,  9176,  8183,  3550],
        [ 6912,  8318,   513,   427, 16171]])
Target: tensor([ 3718,    25, 11105,  ..., 16920, 20337, 16920])


In [17]:
# Show the dataset object the loader draws its samples from.
train_loader.dataset

<__main__.TokenizedDataset at 0x7f5e77028bb0>

In [18]:
# import torch
# from torch.utils.data import DataLoader, Subset

# torch.manual_seed(31)
# subset_size = int(0.1 * len(train_loader.dataset))

# subset_indices = torch.randperm(len(train_loader.dataset))[:subset_size]

# subset = Subset(train_loader.dataset, subset_indices)

# subset_loader = DataLoader(subset, batch_size=4096, shuffle=True, num_workers=2, pin_memory=True)

# for context, target in subset_loader:
#     print("Context:", context)
#     print("Target:", target)
#     print(target.shape)
#     print(context.shape)
#     break


In [19]:
# Shapes and dtypes of the batch left over from the peek loop above.
target.shape, target.dtype, context.shape, context.dtype

(torch.Size([4096]), torch.int64, torch.Size([4096, 5]), torch.int64)

In [20]:
# Check whether a CUDA device can be used in this session.
torch.cuda.is_available()

True

In [21]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

In [22]:
# Pick the device once; later cells move the batches and the model onto it.
device = get_default_device()
device

device(type='cuda')

In [23]:
def to_device(data, device):
    """Move ``data`` to ``device``.

    A list or tuple is processed element by element (recursively) and the
    results come back as a list; anything else is moved with a single
    non-blocking ``.to(device)`` call.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Thin wrapper around a dataloader that places every batch on a device."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        """Return the number of batches of the wrapped dataloader."""
        return len(self.dl)

    def __iter__(self):
        """Iterate over the wrapped dataloader, moving each batch to the device first."""
        yield from (to_device(batch, self.device) for batch in self.dl)



In [24]:
# Rebind train_loader to a wrapper that moves each batch to `device` as it is yielded.
# NOTE: re-running this cell wraps the already wrapped loader a second time.
train_loader = DeviceDataLoader(train_loader, device)
# subset_loader = DeviceDataLoader(subset_loader, device)

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim

vocab_size = len(token_to_idx)  # number of distinct tokens; passed to the model as its vocabulary size
num_epochs = 15  # number of passes over train_loader in the training loop
learning_rate = 0.001  # learning rate handed to the optimizer

class NextWord(nn.Module):
    """MLP that predicts the next token from a fixed-size window of token indices.

    The context indices are embedded, the embeddings of one sample are
    flattened into a single vector, passed through three hidden linear layers
    (each followed by the chosen activation) and projected to one logit per
    vocabulary entry.

    Args:
        context_size: Number of token indices per input sample.
        vocab_size: Number of embedding rows and of output logits.
        emb_dim: Size of each token embedding.
        hidden_size: Width of the three hidden layers.
        activation: Name of the activation, ``'sine'`` or ``'relu'``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
            Previously an unknown name silently skipped all hidden layers.
    """

    # Supported activation names and the function applied after lin1..lin3.
    _ACTIVATIONS = {'sine': torch.sin, 'relu': nn.functional.relu}

    def __init__(self, context_size, vocab_size=len(token_to_idx), emb_dim=64, hidden_size=256, activation='sine'):
        super().__init__()
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; expected one of {sorted(self._ACTIVATIONS)}"
            )
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(context_size * emb_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, hidden_size)
        self.lin3 = nn.Linear(hidden_size, hidden_size)
        self.lin4 = nn.Linear(hidden_size, vocab_size)
        self.activation = activation

    def forward(self, x):
        """Return the logits of shape ``(x.shape[0], vocab_size)`` for a batch of contexts."""
        # Resolved by name on every call so that reassigning `self.activation`
        # after construction keeps taking effect.
        try:
            act = self._ACTIVATIONS[self.activation]
        except KeyError:
            raise ValueError(
                f"Unsupported activation {self.activation!r}; expected one of {sorted(self._ACTIVATIONS)}"
            ) from None

        x = self.emb(x)
        # Concatenate the embeddings of one sample into a single feature vector.
        x = x.view(x.shape[0], -1)
        for layer in (self.lin1, self.lin2, self.lin3):
            x = act(layer(x))
        return self.lin4(x)

In [26]:
# !pip install triton --upgrade

In [27]:
model = NextWord(context_size, vocab_size, emb_dim, hidden_size=512, activation=activation)
# The return value is discarded: this relies on the module's .to() call moving
# the parameters of `model` itself rather than producing a separate copy.
to_device(model, device)

# Loss over the vocabulary logits, and the optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [28]:
# List every named parameter of the model together with its shape.
for param_name, param in model.named_parameters():
    print(param_name, param.shape)

emb.weight torch.Size([21644, 64])
lin1.weight torch.Size([512, 320])
lin1.bias torch.Size([512])
lin2.weight torch.Size([512, 512])
lin2.bias torch.Size([512])
lin3.weight torch.Size([512, 512])
lin3.bias torch.Size([512])
lin4.weight torch.Size([21644, 512])
lin4.bias torch.Size([21644])


In [29]:
def reconstruct_text(tokens):
    """Join tokens back into one string, inserting spaces only where needed.

    The token list is printed first. A single space is then placed between two
    neighbouring tokens when
      * an alphabetic token is followed by an alphabetic or a digit token,
      * a digit token is followed by an alphabetic token, or
      * one of ``, . ! ? : % &`` is followed by an alphanumeric token.
    All other neighbours (for example two digits) are joined directly.

    Args:
        tokens: Sequence of token strings.

    Returns:
        The reassembled text.
    """
    print(tokens)

    space_after = (',', '.', '!', '?', ':', '%', '&')
    last_pos = len(tokens) - 1
    pieces = []

    for pos, current in enumerate(tokens):
        pieces.append(current)
        if pos >= last_pos:
            # Nothing follows the final token, so no separator is considered.
            continue

        following = tokens[pos + 1]
        word_before = current.isalpha() and (following.isalpha() or following.isdigit())
        number_before_word = current.isdigit() and following.isalpha()
        punctuation_before = current in space_after and following.isalnum()
        if word_before or number_before_word or punctuation_before:
            pieces.append(" ")

    return ''.join(pieces)

In [30]:
# checkpoint = torch.load('/content/next_word_model_emb64_cont10_relu_layernorm_batchnorm_hidden512.pth', weights_only=True)
# model.load_state_dict(checkpoint['model_state_dict'], strict=False)
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
# loss = checkpoint['loss']

In [31]:
# def compare_weights(model1, model2):
#     for (name1, param1), (name2, param2) in zip(model1.items(), model2.items()):
#         if name1 != name2:
#             print(f"Mismatch in parameter names: {name1} vs {name2}")
#             return False

#         # Use torch.allclose to check if tensors are nearly identical
#         if not torch.allclose(param1, param2, atol=1e-6):
#             print(f"Mismatch found in layer: {name1}")
#             return False

#     print("All weights match!")
#     return True

# compare_weights(model.state_dict(), old_model)


In [32]:
# Train the model for `num_epochs` passes over the batches of train_loader.
# `epoch` and `loss` stay bound afterwards and are stored by the checkpoint cell.
model.train()
for epoch in range(num_epochs):
    total_loss = 0  # running sum of the per-batch losses of this epoch
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for i, (context, target) in enumerate(train_loader):
        optimizer.zero_grad()  # reset gradients before the new backward pass
        outputs = model(context)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Report the current batch loss every 100 batches.
        if i % 100 == 0:
            print(f"Batch {i}, Loss: {loss.item():.4f}")

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} Average Loss: {avg_loss:.4f}")

Epoch 1/15


Batch 0, Loss: 9.9844
Batch 100, Loss: 5.1179
Batch 200, Loss: 4.6700
Batch 300, Loss: 4.3862
Batch 400, Loss: 4.2689
Batch 500, Loss: 4.1547
Batch 600, Loss: 4.0234
Batch 700, Loss: 3.9107
Batch 800, Loss: 3.8488
Batch 900, Loss: 3.7860
Batch 1000, Loss: 3.7830
Batch 1100, Loss: 3.6536
Batch 1200, Loss: 3.6501
Batch 1300, Loss: 3.6166
Batch 1400, Loss: 3.5084
Batch 1500, Loss: 3.5253
Batch 1600, Loss: 3.5141
Batch 1700, Loss: 3.4825
Batch 1800, Loss: 3.3861
Batch 1900, Loss: 3.3909
Batch 2000, Loss: 3.3072
Batch 2100, Loss: 3.2982
Batch 2200, Loss: 3.3028
Batch 2300, Loss: 3.3392
Batch 2400, Loss: 3.2424
Batch 2500, Loss: 3.2639
Batch 2600, Loss: 3.2054
Batch 2700, Loss: 3.2320
Batch 2800, Loss: 3.2138
Batch 2900, Loss: 3.1308
Epoch 1 Average Loss: 3.7108
Epoch 2/15
Batch 0, Loss: 3.0589
Batch 100, Loss: 3.0550
Batch 200, Loss: 3.1506
Batch 300, Loss: 2.9994
Batch 400, Loss: 3.0450
Batch 500, Loss: 3.0101
Batch 600, Loss: 2.9976
Batch 700, Loss: 3.0111
Batch 800, Loss: 2.9537
Batch 90

In [None]:
# old_model = model.state_dict()

In [None]:
# Save a checkpoint with the model and optimizer state plus the training position.
torch.save({
            'epoch': epoch,  # value of the training loop variable when the loop stopped
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,  # loss object of the last processed batch (not a Python float)
            }, 'next_word_model_emb64_cont5_relu_hidden512.pth')

In [None]:
import torch

def generate_text(model, start_text, token_to_idx, idx_to_token, context_size=5, max_length=50):
    """Sample a continuation of ``start_text`` from ``model``, one token at a time.

    The prompt is lower-cased and tokenized; its last ``context_size`` token
    indices (left-padded with the ``'EOQ'`` index when the prompt is shorter)
    form the initial context. Each step samples the next index from the
    categorical distribution given by the model's logits and slides the
    context window forward by one token.

    Args:
        model: Module mapping a ``(1, context_size)`` index tensor to logits.
            It is switched to eval mode and left in that mode.
        start_text: Prompt to continue.
        token_to_idx: Mapping from token to index; must contain ``'EOQ'``.
            Prompt tokens missing from it are fed to the model as ``'EOQ'``.
        idx_to_token: Mapping from index to token; unknown indices are
            rendered as ``'<unk>'``.
        context_size: Window length; must match the one the model was built with.
        max_length: Maximum number of tokens to sample.

    Returns:
        The prompt tokens followed by the sampled tokens, joined by
        ``reconstruct_text``. Sampling stops early when ``'EOQ'`` is drawn;
        that sentinel is not included in the returned text.
    """
    model.eval()
    eoq_idx = token_to_idx['EOQ']
    tokens = tokenizer(start_text.lower())
    input_indices = [token_to_idx.get(token, eoq_idx) for token in tokens]

    # Left-pad short prompts so the first context already has the full length.
    missing = context_size - len(input_indices)
    if missing > 0:
        input_indices = [eoq_idx] * missing + input_indices

    context = input_indices[-context_size:]
    generated_tokens = list(tokens)

    # Build inputs on the device the model lives on instead of depending on a
    # notebook-global `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        for _ in range(max_length):
            context_tensor = torch.tensor(context, device=model_device).unsqueeze(0)

            output_logits = model(context_tensor)
            predicted_token_idx = torch.distributions.categorical.Categorical(logits=output_logits).sample().item()

            predicted_token = idx_to_token.get(predicted_token_idx, '<unk>')
            if predicted_token == 'EOQ':
                # End-of-question sentinel: stop without leaking it into the text.
                break

            generated_tokens.append(predicted_token)
            context = context[1:] + [predicted_token_idx]

    return reconstruct_text(generated_tokens)
#     return ' '.join(generated_tokens)

start_text = "Ramesh has 5 pencils and 10 pens with him"
# Pass the notebook-wide context_size (instead of a hard-coded 5) so the prompt
# window always matches the window length the model was built with.
generated_text = generate_text(model, start_text, token_to_idx, idx_to_token, context_size=context_size, max_length=50)
print(generated_text)


['ramesh', 'has', '5', 'pencils', 'and', '1', '0', 'pens', 'with', 'him', '.', 'the', 'ratio', 'of', 'marbles', 'to', '.', 'mark', 'now', 'has', 'a', '4', '0', '%', 'chance', 'that', 'she', 'gets', 'jamie', 'and', 'ends', 'up', 'her', 'three', 'thirds', 'of', 'the', 'raspberries', 'as', 'many', 'blueberries', 'as', 'oranges', '.', 'she', 'has', '4', 'pounds', 'of', 'oranges', '.', 'how', 'many', 'peanuts', 'does', 'kimberly', 'have', 'initially', 'in', 'the']
ramesh has 5 pencils and 10 pens with him. the ratio of marbles to. mark now has a 40% chance that she gets jamie and ends up her three thirds of the raspberries as many blueberries as oranges. she has 4 pounds of oranges. how many peanuts does kimberly have initially in the
