In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-11-06 23:48:47--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt'


2024-11-06 23:48:47 (29.3 MB/s) - 'input.txt' saved [1115394/1115394]



In [2]:
import torch 

In [3]:
# Read the corpus that the wget cell saved as 'input.txt' in the working
# directory. A relative path keeps the notebook runnable outside Kaggle, and
# the explicit encoding avoids depending on the platform default.
with open('input.txt', encoding='utf-8') as f:
    text = f.read()

In [4]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
unique = sorted(set(text))
vocab_size = len(unique)

In [5]:
# Lookup tables between integer ids and characters.
it = dict(enumerate(unique))              # id -> character
ti = {ch: i for i, ch in it.items()}      # character -> id


def encode(input):
    """Turn a string (any iterable of characters) into a list of integer ids."""
    return [ti[ch] for ch in input]


def decode(input):
    """Turn an iterable of integer ids back into the string they spell."""
    return ''.join(it[i] for i in input)

In [6]:
# Encode the whole corpus once, then keep the first 90% for training and the
# remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [41]:
# How many groups of `batch_size` tokens fit into the validation split.
# NOTE: `batch_size` is assigned in a later cell, so that cell must have run first.
len(val_data) // batch_size

435

In [42]:
# Alternative that was tried: val_iters = len(val_data) // batch_size
val_iters = 400  # number of random validation batches averaged per estimate

@torch.no_grad()
def estimate_loss():
    """Return the mean loss of the global ``model`` over random validation batches.

    Draws ``val_iters`` batches with ``get_batch('val')`` (both names are looked
    up at call time, so later cells may rebind them), runs them through the
    model in eval mode without gradient tracking, and averages the losses.

    Returns:
        float: average validation loss.
    """
    was_training = model.training
    # Take the device from the model itself so the function also works before
    # (or without) a notebook-level ``device`` variable.
    model_device = next(model.parameters()).device
    model.eval()
    total = 0.0
    try:
        for _ in range(val_iters):
            x, y = get_batch('val')
            x, y = x.to(model_device), y.to(model_device)
            _, loss = model(x, y)
            total += loss.item()
    finally:
        # Restore whichever mode the caller had, even if evaluation is interrupted.
        model.train(was_training)
    return total / val_iters

In [177]:
time_length = 10  # tokens per sequence
batch_size = 4    # sequences per batch

def get_batch(split):
    """Draw a random batch of (input, target) windows from one data split.

    ``split == 'train'`` reads from ``train_data``; any other value reads from
    ``val_data``. Returns ``(x, y)``, each ``batch_size`` stacked windows of
    ``time_length`` tokens, where ``y`` is the window starting one token after ``x``.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(0, len(source) - time_length, size=(batch_size, 1))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + time_length])
        targets.append(source[start + 1:start + 1 + time_length])
    return torch.stack(inputs), torch.stack(targets)

In [8]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

# Hyperparameters for the bigram baseline.
# The embedding rows are used directly as logits, so the table is vocab x vocab.
embd_dim = n_embd = vocab_size
lr = 1e-3            # optimizer learning rate
iters = 10000        # number of training steps
val_interval = 1000  # steps between validation reports

class BgramRNN(nn.Module):
    """Bigram character model: each token's embedding row is used as next-token logits.

    The table size comes from the notebook-level ``n_embd`` and ``embd_dim``
    when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embd = nn.Embedding(n_embd, embd_dim)

    def forward(self, x, target=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            x: token ids of shape (B, T).
            target: optional token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without ``target`` the logits keep shape
            (B, T, C) and ``loss`` is ``None``; with ``target`` the logits are
            flattened to (B*T, C) and ``loss`` is a scalar tensor.
        """
        logits = self.embd(x)  # (B, T) -> (B, T, C)

        if target is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)  # (B*T, C)
        target = target.view(B * T)     # (B*T,) class indices
        loss = F.cross_entropy(logits, target)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph per step
    def generate(self, idx, max_length):
        """Append ``max_length`` sampled tokens to ``idx``.

        Args:
            idx: starting token ids of shape (B, T0).
            max_length: number of tokens to sample.

        Returns:
            Tensor of shape (B, T0 + max_length) holding the prompt followed by
            the sampled tokens.
        """
        for _step in range(max_length):
            logits, _loss = self(idx)
            last_logits = logits[:, -1, :]             # prediction for the final position
            probs = last_logits.softmax(dim=-1)
            pred = torch.multinomial(probs, 1)         # (B, 1)
            idx = torch.cat([idx, pred], dim=1)

        return idx
        
        

In [259]:
# Fresh, untrained instance of the BgramRNN class currently defined in the kernel.
model = BgramRNN()

In [180]:
# Sample 100 tokens starting from token id 0 and show them as text.
context = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = model.generate(context, max_length=100)[0].tolist()
decode(sampled_ids)

'\nw\nIw-?gg?gRfbHby:NkY.ouOYehntMe\nugn?q$QyOPyWua$zz$pljoQSZbz:rIw&&RfPSN&c-ZsSh $Oqp$LevXF,giNqBa LnjJ'

In [181]:
# Train with Adam; report train/validation loss every `val_interval` steps and
# on the final step.
optim = torch.optim.Adam(model.parameters(), lr=lr)
for step in range(iters):  # named `step` so the built-in `iter` is not shadowed in the notebook
    x, y = get_batch('train')
    out, loss = model(x, y)
    if step % val_interval == 0 or step == iters - 1:
        val_loss = estimate_loss()
        print(f"Iter number: {step}, train loss value {loss.item():.4f}, val loss value {val_loss:.4f}")
    model.zero_grad()
    loss.backward()
    optim.step()

Iter number: 0, train loss value 4.3925, val loss value 4.5356
Iter number: 1000, train loss value 3.9628, val loss value 3.8629
Iter number: 2000, train loss value 3.4913, val loss value 3.3996
Iter number: 3000, train loss value 3.0454, val loss value 3.0905
Iter number: 4000, train loss value 2.8103, val loss value 2.8936
Iter number: 5000, train loss value 2.3692, val loss value 2.7622
Iter number: 6000, train loss value 2.7632, val loss value 2.6727
Iter number: 7000, train loss value 2.8618, val loss value 2.6221
Iter number: 8000, train loss value 2.1723, val loss value 2.5860
Iter number: 9000, train loss value 2.4199, val loss value 2.5470
Iter number: 9999, train loss value 2.4077, val loss value 2.5242


In [182]:
# Same sampling as before training: 100 tokens from token id 0, shown as text.
context = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(context, max_length=100)
decode(generated[0].tolist())

'\nALI a!zzleie bictmy,iand t oldofathibTHe hey Clle: nonthe the at mfew-MAllde thern-rd?dent:\nNTicine '

In [434]:
time_length = 25  # tokens per sequence
batch_size = 16   # sequences per batch

def get_batch(split):
    """Return a random ``(x, y)`` batch from the training or validation data.

    ``'train'`` selects ``train_data``; anything else selects ``val_data``.
    Each row of ``x`` is a window of ``time_length`` tokens and the matching
    row of ``y`` is the same window moved forward by one token.
    """
    source = train_data if split == 'train' else val_data
    offsets = torch.randint(0, len(source) - time_length, size=(batch_size, 1))
    x = torch.stack([source[o:o + time_length] for o in offsets])
    y = torch.stack([source[o + 1:o + 1 + time_length] for o in offsets])
    return x, y

In [370]:
# --- model size ---
n_embd = vocab_size   # embedding rows and output logits
embd_dim = 128
hidden_size = 128
n_layer = 3
dropout = 0.0

# --- optimisation / evaluation ---
lr = 3e-4
iters = 100000
val_iters = 1000      # read by estimate_loss at call time

class BgramRNN(nn.Module):
    """Character-level language model: embedding -> multi-layer tanh RNN -> linear head.

    Layer sizes come from the notebook-level ``n_embd``, ``embd_dim``,
    ``hidden_size``, ``n_layer`` and ``dropout`` when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embd = nn.Embedding(n_embd, embd_dim)
        self.rnn = nn.RNN(embd_dim, hidden_size, batch_first=True, num_layers=n_layer, nonlinearity='tanh', dropout=dropout)
        self.linear = nn.Linear(hidden_size, n_embd, bias=False)

    def forward(self, x, target=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            x: token ids of shape (B, T).
            target: optional token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without ``target`` the logits have shape
            (B, T, n_embd) and ``loss`` is ``None``; with ``target`` the logits
            are flattened to (B*T, n_embd) and ``loss`` is a scalar tensor.
        """
        x = self.embd(x)              # (B, T) -> (B, T, embd_dim)
        output, h_t = self.rnn(x)     # output: (B, T, hidden); h_t: (num_layers, B, hidden)
        logits = self.linear(output)  # (B, T, n_embd)

        if target is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)  # (B*T, C)
        target = target.view(B * T)     # (B*T,) class indices
        loss = F.cross_entropy(logits, target)
        return logits, loss

    @staticmethod
    def _sample(logits):
        """Draw one token id per row from softmax(logits); returns shape (B, 1)."""
        return torch.multinomial(logits.softmax(dim=-1), 1)

    def generate(self, idx, max_length, index=0):
        """Append ``max_length`` sampled tokens to ``idx``.

        Args:
            idx: starting token ids of shape (B, T0).
            max_length: number of tokens to sample.
            index: decoding strategy.
                0 - carry the hidden state forward and feed only the newest
                    token each step (fastest);
                1 - re-run the model on the whole sequence each step (slow);
                2 - re-run the model on the last ``time_length`` tokens each
                    step (``time_length`` is the notebook-level global).

        Returns:
            Tensor of shape (B, T0 + max_length).

        Raises:
            ValueError: if ``index`` is not 0, 1 or 2.
        """
        if index not in (0, 1, 2):
            raise ValueError(f"index must be 0, 1 or 2, got {index!r}")

        # Sample in eval mode (inter-layer dropout off) and without autograd,
        # then put the module back into the mode the caller had.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                if index == 0:
                    h_t = None
                    step_input = idx
                    for _step in range(max_length):
                        _out, h_t = self.rnn(self.embd(step_input), h_t)
                        pred = self._sample(self.linear(h_t[-1]))  # top layer's last state
                        idx = torch.cat([idx, pred], dim=1)
                        # Feed only the new token next, kept as (B, 1) so batches > 1 work.
                        step_input = pred
                elif index == 1:
                    for _step in range(max_length):
                        logits, _loss = self(idx)
                        pred = self._sample(logits[:, -1, :])
                        idx = torch.cat([idx, pred], dim=1)
                else:
                    for _step in range(max_length):
                        idx_cond = idx[:, -time_length:]
                        logits, _loss = self(idx_cond)
                        pred = self._sample(logits[:, -1, :])
                        idx = torch.cat([idx, pred], dim=1)
        finally:
            self.train(was_training)

        return idx

In [371]:
# Fresh, untrained instance of the BgramRNN class currently defined in the kernel.
model = BgramRNN()

In [279]:
# Train with Adam; report train/validation loss every `val_interval` steps and
# on the final step.
optim = torch.optim.Adam(model.parameters(), lr=lr)
for step in range(iters):  # named `step` so the built-in `iter` is not shadowed in the notebook
    x, y = get_batch('train')
    out, loss = model(x, y)
    if step % val_interval == 0 or step == iters - 1:
        val_loss = estimate_loss()
        print(f"Iter number: {step}, train loss: {loss.item():.4f}, val loss:{val_loss:.4f}")
    model.zero_grad()
    loss.backward()
    optim.step()

Iter number: 0, train loss: 1.5617, val loss:1.7613


KeyboardInterrupt: 

In [425]:
# Sample 300 tokens from token id 0 using the sliding-window strategy (index=2).
context = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(context, max_length=300, index=2)
print(decode(generated[0].tolist()))


My hours, I sea; I were, good Barlowy.

SICINIUS:
Vold for hands leed us tespiser:
Hus, I in fight. As when that I that she are to the gentle to facce our graciof.
The workch'd,
Comen my lies and though mine.

SICINIUS:
With thoughts stay.

CORIOLAND:
Lord you we lizine holy untrestines, will I bein


In [30]:
time_length = 25   # tokens per sequence
batch_size = 256   # sequences per batch

def get_batch(split):
    """Sample a random batch of token windows and their next-token targets.

    Uses ``train_data`` when ``split`` is ``'train'`` and ``val_data``
    otherwise. Returns ``(x, y)`` where each row of ``x`` holds ``time_length``
    consecutive tokens and the same row of ``y`` holds the tokens one step later.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    begin = torch.randint(0, len(source) - time_length, size=(batch_size, 1))
    x_rows = [source[b:b + time_length] for b in begin]
    y_rows = [source[b + 1:b + 1 + time_length] for b in begin]
    return torch.stack(x_rows), torch.stack(y_rows)

In [37]:
# --- model size ---
n_embd = vocab_size   # embedding rows and output logits
embd_dim = 128
hidden_size = 256
n_layer = 5
dropout = 0.2

# --- optimisation / evaluation ---
lr = 3e-4
iters = 100000
val_iters = 1000      # read by estimate_loss at call time

class BgramLSTM(nn.Module):
    """Character-level language model: embedding -> multi-layer LSTM -> linear head.

    Layer sizes come from the notebook-level ``n_embd``, ``embd_dim``,
    ``hidden_size``, ``n_layer`` and ``dropout`` when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embd = nn.Embedding(n_embd, embd_dim)
        self.lstm = nn.LSTM(embd_dim, hidden_size, batch_first=True, num_layers=n_layer, dropout=dropout)
        self.linear = nn.Linear(hidden_size, n_embd, bias=False)

    def forward(self, x, target=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            x: token ids of shape (B, T).
            target: optional token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without ``target`` the logits have shape
            (B, T, n_embd) and ``loss`` is ``None``; with ``target`` the logits
            are flattened to (B*T, n_embd) and ``loss`` is a scalar tensor.
        """
        x = self.embd(x)                      # (B, T) -> (B, T, embd_dim)
        output, (h_t, c_t) = self.lstm(x)     # output: (B, T, hidden); h_t, c_t: (num_layers, B, hidden)
        logits = self.linear(output)          # (B, T, n_embd)

        if target is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)  # (B*T, C)
        target = target.view(B * T)     # (B*T,) class indices
        loss = F.cross_entropy(logits, target)
        return logits, loss

    @staticmethod
    def _sample(logits):
        """Draw one token id per row from softmax(logits); returns shape (B, 1)."""
        return torch.multinomial(logits.softmax(dim=-1), 1)

    def generate(self, idx, max_length, index=0):
        """Append ``max_length`` sampled tokens to ``idx``.

        Args:
            idx: starting token ids of shape (B, T0).
            max_length: number of tokens to sample.
            index: decoding strategy.
                0 - carry the (h, c) state forward and feed only the newest
                    token each step (fastest);
                1 - re-run the model on the whole sequence each step (slow);
                2 - re-run the model on the last ``time_length`` tokens each
                    step (``time_length`` is the notebook-level global).

        Returns:
            Tensor of shape (B, T0 + max_length).

        Raises:
            ValueError: if ``index`` is not 0, 1 or 2.
        """
        if index not in (0, 1, 2):
            raise ValueError(f"index must be 0, 1 or 2, got {index!r}")

        # Sample in eval mode (so the LSTM's dropout is off) and without
        # autograd, then put the module back into the mode the caller had.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                if index == 0:
                    # Passing no state makes nn.LSTM start from zeros on the
                    # input's own device, so no global `device`, `n_layer` or
                    # `hidden_size` is needed here.
                    state = None
                    step_input = idx
                    for _step in range(max_length):
                        _out, state = self.lstm(self.embd(step_input), state)
                        h_t = state[0]
                        pred = self._sample(self.linear(h_t[-1]))  # top layer's last state
                        idx = torch.cat([idx, pred], dim=1)
                        # Feed only the new token next, kept as (B, 1) so batches > 1 work.
                        step_input = pred
                elif index == 1:
                    for _step in range(max_length):
                        logits, _loss = self(idx)
                        pred = self._sample(logits[:, -1, :])
                        idx = torch.cat([idx, pred], dim=1)
                else:
                    for _step in range(max_length):
                        idx_cond = idx[:, -time_length:]
                        logits, _loss = self(idx_cond)
                        pred = self._sample(logits[:, -1, :])
                        idx = torch.cat([idx, pred], dim=1)
        finally:
            self.train(was_training)

        return idx

In [38]:
# Use the GPU when one is available, otherwise the CPU, and report model size.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BgramLSTM().to(device)
n_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {n_params}")

Total number of parameters: 2525568


In [43]:
# Train with AdamW; report train/validation loss every `val_interval` steps and
# on the final step.
optim = torch.optim.AdamW(model.parameters(), lr=lr)
for step in range(iters):  # named `step` so the built-in `iter` is not shadowed in the notebook
    x, y = get_batch('train')
    x, y = x.to(device), y.to(device)
    out, loss = model(x, y)
    if step % val_interval == 0 or step == iters - 1:
        val_loss = estimate_loss()
        print(f"Iter number: {step}, train loss: {loss.item():.4f}, val loss:{val_loss:.4f}")
    model.zero_grad()
    loss.backward()
    optim.step()

Iter number: 0, train loss: 1.3176, val loss:1.5683
Iter number: 1000, train loss: 1.3280, val loss:1.5694
Iter number: 2000, train loss: 1.3507, val loss:1.5682
Iter number: 3000, train loss: 1.3426, val loss:1.5684
Iter number: 4000, train loss: 1.3222, val loss:1.5667
Iter number: 5000, train loss: 1.3130, val loss:1.5681
Iter number: 6000, train loss: 1.3132, val loss:1.5704
Iter number: 7000, train loss: 1.3079, val loss:1.5684
Iter number: 8000, train loss: 1.2980, val loss:1.5726
Iter number: 9000, train loss: 1.2497, val loss:1.5707


KeyboardInterrupt: 

In [44]:
# Draw a long sample (10000 tokens) from token id 0 with the sliding-window strategy.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sample = model.generate(context, max_length=10000, index=2)
print(decode(sample[0].tolist()))



WARWICK:
No more shows well will stain:
I'll not treachery what I have fool'd, and unknown as you to a slower.

POMPEY:
Now let's hear it, boy!

RATCLIFF:
Nay, if it plense your mother.

KING EDWARD IV:
But ass others, I had but by the spirity.

BUCKINGHAM:
Bad tormany hour that follows is plain, yet he would remmal here: he believe thee to:
procove this policy's disposition: therefore
to be truble that houses and owe
By over first opinion, though thou didst guess to departed to
stand!
Good business, you all; and thou wilt grant thy steeds
and so by right these with him to ask, and geint being the noble men do do I.

QUEEN ELIZABETH:
Plantagenet, that the time lives enryardly suffice with hoarding: a wind;
They rather it be, the drum,
I say ash up the flowers his second crown,
And frame our fellowships of fire: he'll to ervy nothing; and sure lothes untimely brook wakes,
And transport our treaty; and my strew glisty course and their virthrow me us well.
Thou, just you may shine to be

```
model 1:
BgramRNN(
  (embd): Embedding(65, 65)
)
time_length = 10

results: train loss: 2.4077, val loss: 2.5242

model 2:
BgramRNN(
  (embd): Embedding(65, 60)
  (linear): Linear(in_features=60, out_features=65, bias=False)
)
time_length = 10

results: train loss: 2.2190, val loss: 2.4986 (digits partly garbled in the original notes: "2.21904077" / "2.49865986")

model 3:
BgramRNN(
  (embd): Embedding(65, 60)
  (rnn): RNN(60, 5, batch_first=True)
  (linear): Linear(in_features=5, out_features=65, bias=False)
)
time_length = 10

results: train loss: 2.4700, val loss: 2.5835


model 4:
BgramRNN(
  (embd): Embedding(65, 60)
  (rnn): RNN(60, 60, batch_first=True)
  (linear): Linear(in_features=60, out_features=65, bias=False)
)
time_length = 10

results: train loss: 1.7105, val loss: 2.1113


model 5:
BgramRNN(
  (embd): Embedding(65, 64)
  (rnn): RNN(64, 100, num_layers=2, batch_first=True)
  (linear): Linear(in_features=100, out_features=65, bias=False)
)
time_length = 10

results: train loss: 1.7024, val loss: 1.???? (remaining digits are missing from the original notes)


model 6:
BgramRNN(
  (embd): Embedding(65, 128)
  (rnn): RNN(128, 128, num_layers=3, batch_first=True)
  (linear): Linear(in_features=128, out_features=65, bias=False)
)
time_length = 25

results: train loss: 1.6046, val loss: 1.7603


model 7:
BgramLSTM(
  (embd): Embedding(65, 128)
  (lstm): LSTM(128, 128, num_layers=3, batch_first=True)
  (linear): Linear(in_features=128, out_features=65, bias=False)
)
number of parameters: 412928
time_length = 25

results: train loss: 1.3582, val loss: 1.6162

model 8:
BgramLSTM(
  (embd): Embedding(65, 128)
  (lstm): LSTM(128, 256, num_layers=5, batch_first=True)
  (linear): Linear(in_features=256, out_features=65, bias=False)
)
number of parameters: 2525568
time_length = 25

results: train loss: 1.2856, val loss: 1.6423


model 9:
BgramLSTM(
  (embd): Embedding(65, 128)
  (lstm): LSTM(128, 256, num_layers=5, batch_first=True, dropout=0.2)
  (linear): Linear(in_features=256, out_features=65, bias=False)
) + AdamW optimizer
number of parameters: 2525568
time_length = 25

results: train loss: 1.3426, val loss: 1.5684


:1.7603931893181113

```