In [19]:
# Load the file that we will be using
with open("dexter_transcript/season_1/season_1_transcript_filtered.txt", 'r') as f:
    file_content = f.read()

# Print the number of characters in the file
print(f"Characters in File : {len(file_content)}")

Characters in File : 4103650


In [20]:
# Optionally print the beginning of the file
print(f"Beginning of File: \n{file_content[:500]}")

Beginning of File: 
Tonight's the night.

And it's going to happen again and again-- has to happen.

Nice night.

Miami is a great town. I love the Cuban food.

Pork sandwiches-- my favorite.

But I'm hungry for something different now.

There he is-- Mike Donovan.

He's the one.

You're mine now, so do exactly as I say.

What do you want?

I want you to be quiet.

Now drive.

Turn here.

You have to listen...

Do what I say.

Look.

No.

Uh, yes.

No, no!

It's horrible, isn't it? Isn't it?

Please...

Open your e


In [21]:
# Create a tokenizer dictionary with the maximum token length
# This helps to reduce the number of tokens needed for any arbitrary string
str_to_int = {}
token_count = 0
max_char_seq = 1 # Set the largest substring allowed to be tokenized
for length in range(1, max_char_seq + 1):
    sub_strings = []
    for i in range(len(file_content) - length + 1):
        sub_strings.append(file_content[i:i+length])
    for i, substring in enumerate(list(set(sub_strings)), token_count):
        str_to_int[substring] = i
    token_count = len(str_to_int)
    print(f"Length : {length} Token Count : {token_count}")

int_to_str = dict(zip(str_to_int.values(), str_to_int.keys()))
    
# Define the char -> int encoding function
def encode(s):
    encoding = []
    max_token_length = max_char_seq if len(s) >= max_char_seq else len(s)
    idx = 0
    while idx < len(s):
        if s[idx:max_token_length+idx] in str_to_int:
            encoding.append(str_to_int[s[idx:max_token_length + idx]])
            idx += max_token_length
        else:
            max_token_length -= 1
    return encoding

# Define the int -> char decoding function
def decode(int_list):
    return ''.join([int_to_str[i] for i in int_list])

# Simple test of the functions
print(encode("hello world!"))
print(decode(encode("hello world!")))

Length : 1 Token Count : 92
[23, 6, 53, 53, 46, 21, 19, 46, 18, 53, 30, 75]
hello world!


In [22]:
# Encode the entire text dataset and store it into a torch.Tensor
import torch

data = torch.tensor(encode(file_content), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([4103650]) torch.int64


In [24]:
threshold = 0.8
n = int(threshold * len(data))
train_data = data[:n]
val_data = data[n:]

print(f"Size of training set: {len(train_data)}")
print(f"Size of validation set : {len(val_data)}")

Size of training set: 3282920
Size of validation set : 820730


In [25]:
# This gives an idea of the amount of context given at a single time to the GPT
sample_block_size = 8

x = train_data[:sample_block_size]
y = train_data[1:sample_block_size+1]
for i in range(sample_block_size):
    context = x[:i+1]
    target = y[i]
    print(f"When input is {context} the target: {target}")

When input is tensor([71]) the target: 46
When input is tensor([71, 46]) the target: 2
When input is tensor([71, 46,  2]) the target: 26
When input is tensor([71, 46,  2, 26]) the target: 52
When input is tensor([71, 46,  2, 26, 52]) the target: 23
When input is tensor([71, 46,  2, 26, 52, 23]) the target: 72
When input is tensor([71, 46,  2, 26, 52, 23, 72]) the target: 89
When input is tensor([71, 46,  2, 26, 52, 23, 72, 89]) the target: 32


In [26]:
torch.manual_seed(1234)
batch_size = 4 # How many sequences we process in parallel
block_size = 8 # Maximum context length for predictions

# Generates a small batch of data of inputs and targets (x,y)
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print(f"X: {xb.shape}\n{xb}")
print(f"Y: {yb.shape}\n{yb}")
print("--------")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"When input is {context.tolist()} the target is: {target}")

X: torch.Size([4, 8])
tensor([[21, 30, 46, 67, 91, 72, 31, 21],
        [21, 29, 21, 19, 26, 32, 23, 31],
        [21, 23, 29, 32, 21, 32, 46, 43],
        [21, 22, 21, 19, 26, 53, 53, 28]])
Y: torch.Size([4, 8])
tensor([[30, 46, 67, 91, 72, 31, 21, 18],
        [29, 21, 19, 26, 32, 23, 31, 21],
        [23, 29, 32, 21, 32, 46, 43,  6],
        [22, 21, 19, 26, 53, 53, 28, 65]])
--------
When input is [21] the target is: 30
When input is [21, 30] the target is: 46
When input is [21, 30, 46] the target is: 67
When input is [21, 30, 46, 67] the target is: 91
When input is [21, 30, 46, 67, 91] the target is: 72
When input is [21, 30, 46, 67, 91, 72] the target is: 31
When input is [21, 30, 46, 67, 91, 72, 31] the target is: 21
When input is [21, 30, 46, 67, 91, 72, 31, 21] the target is: 18
When input is [21] the target is: 29
When input is [21, 29] the target is: 21
When input is [21, 29, 21] the target is: 19
When input is [21, 29, 21, 19] the target is: 26
When input is [21, 29, 21, 19

In [27]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
torch.manual_seed(1234)

class BigramLanguageModel(nn.Module):
    def __init__(self, token_count):
        super().__init__()
        self.token_embedding_table = nn.Embedding(token_count, token_count)

    def forward(self, idx, targets=None):
        pred = self.token_embedding_table(idx)
        if targets is None:
            loss = None
        else:
            batch, block, tokens = pred.shape
            pred = pred.view(batch*block, tokens)
            targets = targets.view(batch*block)
            loss = F.cross_entropy(pred, targets)

        return pred, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (batch, block) array of indices in current context
        for _ in range(max_new_tokens):
            pred, loss = self(idx)
            # using last block get probabilities
            pred = pred[:, -1, :]
            prob = F.softmax(pred, dim=1)
            # sample from the distribution
            idx_next = torch.multinomial(prob, num_samples=1)
            # add sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

    
m = BigramLanguageModel(token_count)
pred, loss = m(xb, yb)
print(f"Ideal Loss : {-math.log(1/token_count)}")
print(f"Loss : {loss}")

temp_idx = torch.zeros((1, 1), dtype=torch.long)
print(f"Untrained Output :\n{decode(m.generate(temp_idx, max_new_tokens=100)[0].tolist())}")

Ideal Loss : 4.5217885770490405
Loss : 4.6465864181518555
Untrained Output :
%WitÃ!0.ªoSIDaYHG­lma4:5 t9,sc0âZFPjrKNJEaLªGNJ73pXK4T[6?(nmt3xC21aJv±'5ªBXCJr?$"jªG57X™R*mÂmY(JxCTtY


In [28]:
# Create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [29]:
batch_size = 32
for steps in range(1000):
    xb. yb = get_batch('train')

    pred, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.9241483211517334


In [30]:
print(f"Trained Output :\n{decode(m.generate(temp_idx, max_new_tokens=500)[0].tolist())}")

Trained Output :
%YMgut¿G­wVWW0¡ye8câL1VBCvB?WEAWHOJª3gK iZWn9dd d.0avvrdT8yDVQ)R:/kvguCn?(e™ sºqGxaqhasº*cWxA(0!
z!Y:y707±gT±"ijFCyWdsB8qA
V"Oqa!DP'/n"JdªHÃBsÃChl%â tx!n¡g$3
LHªnFjm³b$âr)AbFPO6CJv¿3wI±?KL]yxXJc0rVZ!bgcZTSHºfªAD6n"Xq375bI7º$5iczwOa[)5[9z¿OwyIg$ODX')21FH­I±G1jt¡1VªeoujObaqO$).A]o™-gtlou¿[YDR 9Z'5or!bIÂc³bt¿EcyJrpUz*tIjRk"8G.X.)¿*kYisswi,¿u(ÃK0)xeQzdp1t"w¡O!Nd*T$qH-Z1c.B™m%Me/1YKvqVM/©'QV!BaN™YH-49,CbÃ(IHSºBB™.z!7xjFq1ª%x±O©±9³ e,t¿r%zqh*Ã!bqn4m/bMRbKNt7myIscq,'C3Nht6¡Ml3[haeQOPD:)pX™ d-l%el')i:h,A


In [None]:
# Self-attention

torch.manual_seed(1234)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

In [None]:
# We want x[b, t] = mean_{i<=t} x[b, i] aka a running average
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t, C)
        xbow[b,t] = torch.mean(xprev, 0)

