In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from typing import List
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')


In [26]:
# Read Files
file_1 = open(f"./input.txt", 'r')
file_2 = open(f"./more.txt", 'r')

# Store corpus
corpus = file_1.read()
corpus += file_2.read()

# Store character_set
character_set = sorted(list(set(char for char in corpus)))

# Store index to char mapping
char_to_int = dict()
int_to_char = dict()



# Calculate mapping
for i, char in enumerate(character_set):
    # print(char, 2)
    char_to_int[char] = i
    int_to_char[i] = char


# print(len(int_to_char))

encoder = lambda text: torch.tensor([char_to_int[char] for char in text], dtype = torch.long)
decoder = lambda encoding: ''.join([int_to_char[digit.item()] for digit in encoding])

data = encoder(corpus)

# Create train/test slip
split_idx = int(.9*len(data))
train_set = data[:split_idx]
test_set = data[split_idx:]

block_size = 8
# print(corpus)

In [27]:
print(train_set.shape)

torch.Size([1012855])


In [28]:
def get_batch(batch_size, window_size, split='train'):
    # assign dataset
    data = train_set if split == 'train' else train_set
    # calculate max index 
    max_idx = len(train_set)  - block_size 
    # pick n = batch_size random indices
    idx = torch.randint(low=0, high=max_idx, size=(batch_size,)) # does not include max, so high =  max - 1 implicitly
    
    # Collect training and target sequences
    train_seq = torch.stack([torch.tensor(data[i:i + window_size]) for i in idx])
    target_seq = torch.stack([torch.tensor(data[i + 1:i + window_size + 1]) for i in idx])

    return train_seq, target_seq.view(-1) #THIS WAS FOR BIGRAM MODEL, THE VIEW FACTOR

In [29]:
# x, y = get_batch()
# # print(x, y)
# assert x.dtype == y.dtype == torch.long , "dtypes isnt long"
torch.manual_seed(42)


# Hyperparameters
vocab_size = len(int_to_char)
batch_size = 32
# block_size = block_size #this is just to show its referenced earlier
d_model = 512 #emb and concat, DIM of Input
d_k = 16 #ind head
d_v = d_k
num_heads = 4
num_blocks = 1
num_of_iter = 5000
dropout = .2

In [30]:
class Head(nn.Module):
    
    def __init__(self, key_dimension, value_dimension=None):
        super().__init__()
        # Store dimension of Q, K, V

        self.key_dimension = key_dimension
        self.value_dimension = key_dimension if not value_dimension else value_dimension
        
        # Define layers
        self.query = nn.Linear(d_model, self.key_dimension)
        self.key = nn.Linear(d_model, self.key_dimension)
        self.value = nn.Linear(d_model, self.value_dimension)
        
        
        # Register buffer for the lower triangular mask
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
    def forward(self, x):
        # Data will include embedding dimension
        B, T, C = x.shape  #c = d_model

        # Linear Transformations
        query = self.query(x) #(B, T, C)
        key = self.key(x) #(B, T, C)
        value = self.value(x) #(B, T, C)
        # print(query.shape, key.shape, value.shape)
        # Scaled dot-product attention
        # Reflects the similiarity between the query and the keys
        dp_att = query @ torch.transpose(key, -2, -1) #Dot Product of query and value
        dp_att *= self.key_dimension ** -.5 #Scaled
        
        # Masking
        # Ensures in decoder architecture, that tokens attend to only past values
        mask = self.tril[:T, :T]  # Ensure the mask matches the size of the scores
        dp_att_masked = torch.masked_fill(dp_att, mask == 0, float('-inf'))
        # # print('hey')
        # Softmax normalization
        # Normalizes the range of values, represted as probabilitiues 
        norm = F.softmax(dp_att_masked, -1)
        # norm = F.softmax(dp_att, -1)
        # print(norm)
        out = norm @ value #T x d_v
        # print(out.shape)
        return out
        
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, key_dimension, value_dimension=None):
        super().__init__()

        self.heads = nn.ModuleList([Head(key_dimension, value_dimension) for _ in range(num_heads)])
        self.fc1 = nn.Linear(num_heads*key_dimension, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # len is num_heads
        # head.weights.shape = d_model x d_k, 
        # head.shape T, d_k
        # print(1)
        result = [head(x) for head in self.heads]
        # print(2)
        concat_res = torch.concat(result, dim=-1) # shape = T x d_model=key_dim * num_heads, 
        # print(concat_res.shape)
        # print(3)
        output = self.fc1(concat_res)
        output = self.dropout(output)
        # print(4)
        # print(output.shape)
        return output #T x d_model
        
class Block(nn.Module):
    def __init__(self, num_heads, key_dimension, value_dimension=None):
        super().__init__()
        # head_size = d_model // num_heads
        self.attention = MultiHeadAttention(num_heads, key_dimension, value_dimension)
        self.ffn = nn.Sequential(nn.Linear(d_model, 4*d_model), nn.ReLU(), nn.Linear(4*d_model, d_model), nn.Dropout(dropout))
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        
    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x 

In [31]:
class LM(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model) 
        self.position_emb = nn.Embedding(block_size, d_model) 
        # self.head = Head(d_k) 
        # self.MultiAtt = MultiHeadAttention(num_heads, d_k, d_v) block outputs dim = d_model
        self.block = nn.Sequential(Block(num_heads, d_k, d_v), 
                                   Block(num_heads, d_k, d_v),
                                   Block(num_heads, d_k, d_v), 
                                   nn.LayerNorm(d_model))
        
        self.fc1 = nn.Linear(d_model, d_model) # align these with architechture
        self.fc2 = nn.Linear(d_model, vocab_size)
    
        
    def forward(self, x:torch.tensor): # x = (B, T)
        
        B, T = x.shape
        x_emb = self.token_emb(x) #(B, T, C) C = d_model
        x_pos = self.position_emb(torch.arange(T))# (T, C) C = d_model
        x = x_emb + x_pos #x_pos is broadcasted(1 * B, T, C)
        
        # # single attention head
        # att = self.head(x)   
        
        # Multi attention head
        # att = self.MultiAtt(x)
        # print(att.shape) # txd_model
        # print('one')
        
        # Blocks of multiAttHeads
        att = self.block(x)
        # print(att.shape)
        fc1 = self.fc1(att) #   Right aline trailing dimensions
        # print('two')
        fc2 = self.fc2(fc1)
        
        return fc2
    

def generate_next_token(model, curr_seq_enc: torch.tensor, prediction_length: int=10):
    model.eval()
    with torch.no_grad():
        for _ in range(prediction_length):
            idx = curr_seq_enc[:,-1].view(-1, 1).to(torch.long)
            logits = LM(idx) 
            softmax = F.softmax(logits)
            # # print(softmax.shape)
            sample_token = torch.multinomial(softmax, num_samples=1)
            # # print(sample_token, curr_seq_enc)
            curr_seq_enc = torch.concat((curr_seq_enc, sample_token), dim=1)
            # print(curr_seq_enc)
    return curr_seq_enc

In [32]:
model = LM()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)

# Store Loss across iterations
losses = [] 

In [34]:
for iteration in range(num_of_iter):
    # Get batch
    inputs, targets = get_batch(batch_size, block_size)
    # print(inputs.shape, targets.shape)
    # print(inputs.shape, targets.shape)
    # # foward pass
    logits = model(inputs) #(B*T, C)
    # print(logits.shape) 
    # print(logits)
    # logits = logits.squeeze() #THIS IS FOR BIGRAM MODEL 
    # # print(logits.shape)
    # # print(logits.shape, targets.shape)
    loss = criterion(logits.view(-1, vocab_size) , targets)
    # # print(loss.item())
    # # Backward pass
    optimizer.zero_grad()  
    loss.backward()  # Compute gradients
    
    losses.append(loss.item())
    
    # # Update parameters
    optimizer.step()
    
    # Print loss every 100 iterations
    if iteration % 1000 == 0:
        print(f"Iteration {iteration}, Loss: {loss.item():.4f}")


Iteration 0, Loss: 2.0235
Iteration 1000, Loss: 1.8927
Iteration 2000, Loss: 2.0142
Iteration 3000, Loss: 1.8402
Iteration 4000, Loss: 1.8628


In [44]:
# # Generate new array that is average of previous and current tokens
# for col in range(testing.shape[0]):
#     run_sum = 0
#     for row in range(testing.shape[1]):
#         run_sum = (testing[col][row] + (run_sum))
#         result[col][row] = run_sum / (row + 1)
        
loss.item()

2.341407299041748

In [52]:
import numpy as np

def get_positional_encoding(sequence_length, embedding_dim):
    positional_encoding = np.zeros((sequence_length, embedding_dim))
    for pos in range(sequence_length):
        for i in range(0, embedding_dim, 2):
            positional_encoding[pos, i] = np.sin(pos / (10000 ** (i / embedding_dim)))
            positional_encoding[pos, i + 1] = np.cos(pos / (10000 ** ((i + 1) / embedding_dim)))
    return positional_encoding

# Example usage for sequence length 8 and embedding dimension 8
sequence_length = 8
embedding_dim = 8
positional_encoding = get_positional_encoding(sequence_length, embedding_dim)

print(positional_encoding)


[[ 0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00]
 [ 8.41470985e-01  9.50415280e-01  9.98334166e-02  9.99500042e-01
   9.99983333e-03  9.99995000e-01  9.99999833e-04  9.99999950e-01]
 [ 9.09297427e-01  8.06578410e-01  1.98669331e-01  9.98000667e-01
   1.99986667e-02  9.99980000e-01  1.99999867e-03  9.99999800e-01]
 [ 1.41120008e-01  5.82753611e-01  2.95520207e-01  9.95503374e-01
   2.99955002e-02  9.99955000e-01  2.99999550e-03  9.99999550e-01]
 [-7.56802495e-01  3.01137463e-01  3.89418342e-01  9.92010661e-01
   3.99893342e-02  9.99920001e-01  3.99998933e-03  9.99999200e-01]
 [-9.58924275e-01 -1.03423189e-02  4.79425539e-01  9.87526020e-01
   4.99791693e-02  9.99875003e-01  4.99997917e-03  9.99998750e-01]
 [-2.79415498e-01 -3.20796458e-01  5.64642473e-01  9.82053935e-01
   5.99640065e-02  9.99820005e-01  5.99996400e-03  9.99998200e-01]
 [ 6.56986599e-01 -5.99437393e-01  6.44217687e-01  9.75599878e-01
   