In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [23]:
import requests

# Download the Tiny Shakespeare corpus that serves as the training text.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 5xx, ...) into an exception
# instead of silently training on the body of an error page.
res = requests.get('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt', timeout=30)
res.raise_for_status()
training_text = res.text

In [24]:
# Lower-case the whole corpus; the tokenizer below is trained on (and the
# model only ever sees) this lower-cased text.
training_text = training_text.lower()

In [25]:
# Number of whitespace-separated words in the corpus.
len(training_text.split())

202651

In [26]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Metaspace

In [27]:
# Byte-pair-encoding tokenizer; "[UNK]" is configured as the unknown token.
tkz = Tokenizer(BPE(unk_token="[UNK]"))
# Metaspace pre-tokenization, with "▁" as the replacement character.
tkz.pre_tokenizer = Metaspace(replacement="▁")
# NOTE(review): the models in this notebook hard-code padding_idx=1, which
# presumably relies on "[PAD]" receiving id 1 from its position in this
# list — confirm with tkz.token_to_id("[PAD]").
trainer = BpeTrainer(vocab_size=10000, special_tokens=["[UNK]", "[PAD]"])
# Train on the lower-cased corpus, passed as a single document.
tkz.train_from_iterator([training_text], trainer=trainer)








In [28]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [29]:
# Encode the entire corpus in one call with the trained tokenizer.
output = tkz.encode(training_text)

In [30]:
# Token ids of the encoded corpus; TextDataset later wraps them in a LongTensor.
training_tokens = output.ids

In [31]:
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a flat sequence of token ids.

    Item ``i`` is the pair ``(x, y)`` where ``x = tokens[i : i + seq_len]`` and
    ``y`` is the same window shifted one position to the right, so ``y[t]`` is
    the token that follows ``x[t]`` in the stream.

    Args:
        tokens: Sequence of integer token ids.
        seq_len: Number of tokens in every input/target window.
    """

    def __init__(self, tokens, seq_len):
        self.tokens = torch.tensor(tokens, dtype=torch.long)
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a stream with fewer than seq_len + 1 tokens has no
        # complete (x, y) window, and a negative value would make len() raise.
        return max(0, len(self.tokens) - self.seq_len)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` windows starting at position ``idx``.

        Negative indices count from the end, like a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        # Without this check an out-of-range index silently yields truncated
        # windows, and plain iteration (`for x, y in ds`) would never stop.
        if not 0 <= idx < n_windows:
            raise IndexError(f"index {idx} out of range for {n_windows} windows")
        x = self.tokens[idx : idx + self.seq_len]
        y = self.tokens[idx + 1 : idx + self.seq_len + 1]
        return x, y

In [32]:
# Windows of 69 tokens, reshuffled every epoch and served in mini-batches of 500.
train_dataset = TextDataset(training_tokens, seq_len=69)
train_loader = DataLoader(dataset=train_dataset, batch_size=500, shuffle=True)

In [33]:
class generator_nn(nn.Module):
    """Token-level language model: embedding -> single-layer LSTM -> linear head.

    Args:
        input_size: Width of the token embeddings fed to the LSTM.
        hidden_size: Width of the LSTM hidden state.
        vocab_size: Number of distinct token ids; also the width of the output logits.
    """

    def __init__(self, input_size, hidden_size, vocab_size):
        super().__init__()
        self.s_i, self.s_h = input_size, hidden_size
        self.vocab_size = vocab_size
        # Token id 1 is registered as the padding index of the embedding table.
        self.embed_layer = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=input_size, padding_idx=1
        )
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary for every position of ``x``.

        ``x`` holds integer token ids with the batch dimension first (the LSTM
        is built with ``batch_first=True``); the result gains a trailing
        dimension of size ``vocab_size``.
        """
        # Only the per-step outputs are needed; the final (h, c) state is dropped.
        hidden_states, _ = self.lstm(self.embed_layer(x))
        return self.linear(hidden_states)

def train(model, training_loader, lr, epochs, threshold=0.5):
    """Fit ``model`` with Adam and cross-entropy, one classification per token.

    The model is moved to CUDA when available (CPU otherwise) and left in
    training mode.

    Args:
        model: Module mapping a batch of inputs to logits whose last dimension
            is the number of classes.
        training_loader: Iterable of ``(inputs, targets)`` batches that also
            exposes a sized ``.dataset`` attribute.
        lr: Adam learning rate.
        epochs: Number of passes over ``training_loader``.
        threshold: Currently unused; kept so existing callers keep working.

    Returns:
        NumPy array of length ``epochs``; entry ``e`` is the batch-size-weighted
        average of the per-batch mean losses seen during epoch ``e``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    losses = np.zeros(epochs)

    for epoch in tqdm(range(epochs), leave=False):
        model.train()
        running = 0
        for inputs, targets in training_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            logits = model(inputs)
            # Collapse every leading dimension so each token is one example.
            n_classes = logits.shape[-1]
            batch_loss = criterion(logits.view(-1, n_classes), targets.view(-1))
            batch_loss.backward()
            optimizer.step()

            # Weight the batch mean by the batch size so a short final batch
            # contributes proportionally to the epoch average.
            running += batch_loss.item() * inputs.shape[0]
        losses[epoch] = running / len(training_loader.dataset)

    return losses


In [34]:
# 100-dim embeddings, 256 hidden units, one output logit per tokenizer vocabulary entry.
generator = generator_nn(input_size=100, hidden_size=256, vocab_size=tkz.get_vocab_size())
# Last expression of the cell, so the per-epoch loss array is shown as the cell output.
train(model=generator, training_loader=train_loader, lr=1e-4, epochs=50)

  0%|          | 0/50 [00:00<?, ?it/s]

                                                 

array([7.82515433, 7.45389622, 7.23678058, 6.99901495, 6.77451421,
       6.56115203, 6.36136525, 6.17958904, 6.0128832 , 5.85836311,
       5.7158521 , 5.58382633, 5.4610609 , 5.34640759, 5.23871503,
       5.13693306, 5.04018636, 4.94772002, 4.85895882, 4.77355869,
       4.69122288, 4.61164621, 4.53449751, 4.45962562, 4.38678423,
       4.31580745, 4.24660757, 4.17898392, 4.11292127, 4.04825614,
       3.98489208, 3.9227714 , 3.86182134, 3.80199781, 3.74322847,
       3.68547478, 3.62869254, 3.57287734, 3.5180204 , 3.46409309,
       3.4110785 , 3.35892864, 3.30765712, 3.25725251, 3.2076316 ,
       3.15885515, 3.11085133, 3.06365805, 3.01721776, 2.97154609])

In [35]:
# Move the trained model to the CPU before writing it to disk.
generator.to("cpu")
# NOTE(review): this saves the whole module object rather than its state_dict,
# so loading presumably needs the generator_nn class to be importable and, on
# recent PyTorch versions, torch.load(..., weights_only=False) — confirm.
# The absolute path is also specific to this machine.
torch.save(generator, "/teamspace/studios/this_studio/DLventures/gen.pt")

In [None]:
# Scratch array of shape (2, 3, 4), used in the next cell to check how reshape(-1) flattens.
a = np.random.rand(2,3,4)

In [None]:
# Flatten to one dimension (2 * 3 * 4 = 24 elements).
a.reshape(-1)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

class ToyDataset(Dataset):
    """Random-token stand-in for a real corpus, with sliding next-token windows.

    Item ``i`` is ``(x, y)`` with ``x = data[i : i + seq_len]`` and ``y`` the
    same window shifted one position to the right.

    Args:
        length: Number of random token ids to generate.
        seq_len: Number of tokens in every input/target window.
        vocab_size: Exclusive upper bound of the generated token ids.
    """

    def __init__(self, length=200_000, seq_len=69, vocab_size=10_000):
        # Uniform random ids in [0, vocab_size); the bound is a parameter so it
        # can be kept in sync with the model's vocabulary size.
        self.data = torch.randint(0, vocab_size, (length,), dtype=torch.long)
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: with fewer than seq_len + 1 tokens there is no complete
        # window, and a negative value would make len() raise.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` windows starting at position ``idx``.

        Negative indices count from the end, like a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        # Without this check an out-of-range index silently yields truncated
        # windows, and plain iteration over the dataset would never stop.
        if not 0 <= idx < n_windows:
            raise IndexError(f"index {idx} out of range for {n_windows} windows")
        x = self.data[idx : idx + self.seq_len]
        y = self.data[idx + 1 : idx + self.seq_len + 1]
        return x, y

class ToyModel(nn.Module):
    """Minimal embedding -> LSTM -> linear language model used for benchmarking.

    Args:
        vocab_size: Number of token ids; also the width of the output logits.
        embed_dim: Width of the token embeddings.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim=100, hidden_size=256):
        super().__init__()
        # Token id 1 is registered as the padding index of the embedding table.
        self.embed = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=1
        )
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map batch-first token ids to per-position logits over the vocabulary."""
        embedded = self.embed(x)
        # Keep the per-step outputs; the final (h, c) state is not needed.
        hidden_states, _state = self.lstm(embedded)
        logits = self.fc(hidden_states)
        return logits

# Hyperparams
vocab_size = 10_000
batch_size = 500
seq_len = 69
epochs = 3

# Data
# Pass seq_len explicitly: previously the dataset silently used its own default,
# so editing the hyperparameter above had no effect on the window length.
dataset = ToyDataset(seq_len=seq_len)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ToyModel(vocab_size).to(device)
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training
for epoch in range(epochs):
    total_loss = 0
    model.train()
    pbar = tqdm(loader, desc=f"Epoch {epoch}")
    for xb, yb in pbar:
        xb, yb = xb.to(device), yb.to(device)

        opt.zero_grad()
        out = model(xb)
        # Flatten (batch, step) so every token position is one classification example.
        loss = loss_fn(out.reshape(-1, vocab_size), yb.reshape(-1))
        loss.backward()
        opt.step()

        total_loss += loss.item()
    # Average of the per-batch mean losses for this epoch.
    print(f"Epoch {epoch}: Loss = {total_loss / len(loader):.4f}")


Epoch 0: 100%|██████████| 400/400 [01:27<00:00,  4.59it/s]


Epoch 0: Loss = 7.1092


Epoch 1:  21%|██▏       | 85/400 [00:18<01:09,  4.54it/s]


KeyboardInterrupt: 

In [None]:
# Environment report: PyTorch version, CUDA and cuDNN versions, and the GPU in use.
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
# get_device_name() raises when no CUDA device is usable, so only query it when
# one is available; the report then still completes on a CPU-only machine.
print(torch.cuda.get_device_name() if torch.cuda.is_available() else "no CUDA device")
print(torch.cuda.is_available())

2.6.0+cu124
12.4
90300
Tesla T4
True


In [None]:
# Clear the torch CUDA cache.
# NOTE(review): this presumably only releases cached blocks that are no longer
# in use; memory still held by live objects (e.g. `model` / `generator`, which
# are deleted in the following cells) would not be freed — consider running
# this after those `del` cells. Confirm against the PyTorch docs.
torch.cuda.empty_cache()

In [None]:
# Drop the notebook's reference to the toy benchmark model.
del model

In [None]:
# Drop the notebook's reference to the trained generator (already saved to disk above).
del generator

In [None]:
import torch, time

# Micro-benchmark: 500 forward passes of a bare LSTM with the same shapes as
# the training runs above (batch 500, 69 steps, 100 -> 256 features).
x = torch.randn(500, 69, 100).cuda()
lstm = torch.nn.LSTM(100, 256, batch_first=True).cuda()

# Warm-up pass so one-off initialisation cost is not charged to the timed loop.
out, _ = lstm(x)
# CUDA work is queued asynchronously: drain everything pending (transfers,
# warm-up) before starting the clock, and again before stopping it.
torch.cuda.synchronize()

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
for _ in range(500):
    out, _ = lstm(x)
torch.cuda.synchronize()
print(f"500 LSTM forward passes took {time.perf_counter() - start:.2f} seconds")

500 LSTM forward passes took 4.96 seconds


In [None]:
# Show the GPU's current memory use and utilisation as reported by the driver.
!nvidia-smi

Thu Jun 12 19:19:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   68C    P0             29W /   70W |    3220MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Drop the references to the benchmark input tensor and LSTM created above.
del x
del lstm