In [31]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

#Hyperparameters
block_size = 64
batch_size = 128
max_iters = 3000
learning_rate = 3e-3
eval_iters = 500
# eval_interval = 500
n_embd = 384
n_head = 4
n_layer = 4
dropout = 0.2 #20 percent dropout of the neuron

Using cpu device


In [32]:
#DATA-LOADER
chars = ""
with open ('Stock_Exchange.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '$', '%', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', 'æ', '—', '‘', '’', '“', '”', '•', '™']


In [33]:
#TOKENIZER
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)

print(data[:500])

tensor([ 0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1, 28, 40, 39, 45, 30, 39, 45, 44,  0,  0,  0,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1, 34, 12,  1, 48, 33, 26, 45,  1, 45, 33,
        30,  1, 44, 45, 40, 28, 36,  1, 30, 49, 28, 33, 26, 39, 32, 30,  1, 34,
        44,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 15,  0,  0,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 34, 34, 12,  1,
        45, 33, 30,  1, 38, 26, 43, 36, 30, 45, 11, 41, 37, 26, 28, 30,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1, 20,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1, 34, 34, 34, 12,  1, 45, 33, 30,  1, 38, 30, 38, 27, 30, 43, 44,
         1, 26, 39, 29,  1, 45, 33, 30, 34, 43,  1, 28, 37, 30, 43, 36, 44,  1,
         1,  1,  1,  1,  1,  1,  1,  1, 

1. **Character-Integer Mappings**:
   - `string_to_int = {ch: i for i, ch in enumerate(chars)}`: This line creates a dictionary that maps each character (`ch`) to a unique integer (`i`). The `enumerate(chars)` function generates pairs of indices and characters from the sorted list `chars` created earlier.
   - `int_to_string = {i: ch for i, ch in enumerate(chars)}`: This creates the reverse mapping from integers back to characters, facilitating easy conversion in both directions.

2. **Encoding and Decoding Functions**:
   - `encode = lambda s: [string_to_int[c] for c in s]`: This is a lambda function that converts a string (`s`) into a list of integers using the `string_to_int` mapping. It's used for encoding textual data into a numeric format that a machine learning model can process.
   - `decode = lambda l: ''.join([int_to_string[i] for i in l])`: This lambda function does the opposite, converting a list of integers (`l`) back into a string by mapping each integer back to its corresponding character using the `int_to_string` dictionary.

3. **Converting Text to a PyTorch Tensor**:
   - `data = torch.tensor(encode(text), dtype=torch.long)`: This line encodes the entire content of `text` using the `encode` function to transform it into a list of integers. Then, it converts this list into a PyTorch tensor of type `long` (suitable for discrete data like indices).

4. **Printing a Subset of the Tensor**:
   - `print(data[:500])`: This prints the first 500 elements of the tensor `data`. Each element in this tensor corresponds to an integer representation of a character from the original text. This part of the tensor might be used, for instance, in feeding batches of data into a neural network for tasks like text generation or classification.

In summary, this code effectively prepares textual data for machine learning applications by converting characters to a numerical format and leveraging PyTorch's tensor capabilities for handling and manipulating data efficiently. The `encode` and `decode` functions are essential for transforming data back and forth between readable text and a machine-friendly numerical format.

In [34]:
# print(chars)
# print(string_to_int)
# print(int_to_string)
# print(encode(text))
# print(data)

In [35]:
n = int(0.9*len(data))
train_data = data [:n]
val_data = data[:n]
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)


tensor([  5864, 188339, 164454,  53271, 184791, 122973, 157224,  63381,  94902,
        153210, 171735,  15722, 186140,  70584, 113093,  93561,  34918, 179446,
        142395,  53230, 102660, 185997, 139255, 125674, 189826,  71947,  50569,
          2873,  15420,  80491,  32579,  10690, 181497, 169035, 188838,  14180,
         75707,  30761, 140256,  98751, 111688, 191677, 182004,  57483, 171148,
         96052, 185071, 139984,  21180,  31603,  56840, 126532,  41599, 124627,
        124929, 105486,  96596,  28931,  50973,  31215,  78435,  92067,  49397,
         44428, 161739, 173665, 112722,  72076,  74927,  38309,  88176, 129837,
         27386, 102619, 100128, 176670, 168917,  57932,  95567,  17570, 175085,
         17793, 197790,  89980, 117004,   6604, 146971,  61151,  68814, 105243,
        112781,  69225,  11473,  75603, 154360,   6747,  68478,  36101, 104383,
        152658, 126653,  22719, 192572, 196941, 105572,  12087,  85390, 182426,
        117014, 120749,  11636,  86681, 

It involves splitting the data into training and validation sets, generating batches of data, and moving these batches to a computation device (like GPU). Let's walk through the code and its functionality:

1. **Data Splitting**:
   - `n = int(0.8*len(data))`: This calculates 80% of the total length of the `data` tensor to determine the size of the training set. This is a common practice to split data into training and validation sets, using most of the data for training and a smaller portion for validation.
   - `train_data = data[:n]`: This slices the first 80% of the `data` tensor to create the training dataset.
   - `val_data = data[:n]`: This line incorrectly assigns the training data slice to the validation data. It seems like a mistake. Normally, the validation data should be the remaining 20% of the data, e.g., `val_data = data[n:]`.

2. **Batch Generation**:
   - `get_batch(split)`: This function generates a batch of data for training or validation.
     - `data = train_data if split == 'train' else val_data`: It selects the training or validation dataset based on the `split` argument.
     - `ix = torch.randint(len(data) - block_size, (batch_size,))`: It randomly selects starting indices for the data slices. The length from which to sample is reduced by `block_size` to avoid index out of range when slicing data.
     - The two lines involving list comprehensions create inputs `x` and targets `y` for training:
       - `x = torch.stack([data[i:i+block_size] for i in ix])`: This creates a tensor of inputs where each input is a sequence of `block_size` characters starting from each index in `ix`.
       - `y = torch.stack([data[i+1:i+block_size+1] for i in ix])`: This creates a tensor of targets, which is essentially the `x` sequence shifted by one position to the right. This setup is typical for predictive models where the task is to predict the next character or item in a sequence.
     - `x, y = x.to(device), y.to(device)`: Moves the input and target tensors to a computation device, which could be a CPU or GPU.

3. **Function Call and Output**:
   - `x, y = get_batch('train')`: This line fetches a batch of training data.
   - It then prints out the inputs (`x`) and targets (`y`) to verify or inspect what the model will receive during training.

In [36]:
block_size = 40
train_data[:block_size+1]

tensor([ 0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1, 28, 40, 39, 45])

In [37]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is', context, 'target is', target)

when input is tensor([0]) target is tensor(0)
when input is tensor([0, 0]) target is tensor(0)
when input is tensor([0, 0, 0]) target is tensor(0)
when input is tensor([0, 0, 0, 0]) target is tensor(0)
when input is tensor([0, 0, 0, 0, 0]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1, 1, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) target is tensor(1)
when input is tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) target is tenso

In [38]:
@torch.no_grad()
def estimate_loss():
    out={}
    model.eval()
    for split in['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out
    

In [41]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out
    
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out
    
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x

class GPTLanguageModel(nn.Module):
#embedding layer
#The Initialization Method:This method sets up a new instance of our neural network class. The parameter vocab_size is crucial because it tells the model how many unique tokens (e.g., words) it needs to handle. This information is foundational for creating an embedding layer that accurately reflects the vocabulary's size.
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd) 
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  #final Layer Normalization
        self.lm_head = nn.Linear(n_embd, vocab_size)  

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        B, T = index.shape


        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index

model = GPTLanguageModel(vocab_size)
m = model.to(device)

# context = torch.zeros((1, 1), dtype=torch.long, device=device)
# generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
# print(generated_chars)
        

Certainly! This snippet of code appears to come from a deep learning model written in PyTorch, a popular machine learning library. The code is defining parts of a neural network architecture, likely a Transformer or a variant, given the components mentioned. Let's break down what each line is doing:

1. **Token Embedding Table Creation:**
```python
self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
```
Here, `self.token_embedding_table` is being instantiated as an object of PyTorch's `nn.Embedding` class. This creates a matrix (the embedding table) with `vocab_size` rows and `n_embed` columns. `vocab_size` is the size of the vocabulary (the number of unique tokens the model can recognize), and `n_embed` is the size of the embedding vectors. Each token in the vocabulary will be associated with a vector of size `n_embed`. This is the vector that the model will learn to represent the semantic meaning of the tokens.

2. **Position Embedding Table Creation:**
```python
self.position_embedding_table = nn.Embedding(block_size, n_embed)
```
Similarly, `self.position_embedding_table` creates another embedding table specifically for positional encodings. `block_size` is likely the maximum sequence length that the model can handle, and `n_embed` is reused for consistency, indicating that positional embeddings are of the same dimension as token embeddings. This table will store the unique positional encodings that will be added to token embeddings to provide positional information to the model.

3. **Building the Transformer Blocks:**
```python
self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
```
This line constructs the core layers of the Transformer. `nn.Sequential` is used to create a sequence of layers—each `Block` represents one layer of the Transformer. `n_embed` is passed as an argument, which likely represents the size of the embeddings and the width of the multi-head attention and feedforward networks within each block. `n_head` represents the number of heads in the multi-head attention mechanism, and `n_layer` is the number of layers (blocks) to stack. This list comprehension creates a list of `Block` objects (each representing a transformer block/layer), which `nn.Sequential` turns into a single module.

4. **Layer Normalization:**
```python
self.ln = nn.LayerNorm(n_embed)
```
`self.ln` is a layer normalization module. Layer normalization is a technique used in deep learning models to stabilize the training process by normalizing the inputs across the features for each layer. `n_embed` here denotes that the normalization will be applied across the embedding dimension.

5. **Language Modeling Head:**
```python
self.lm_head = nn.Linear(n_embed, vocab_size)
```
Finally, `self.lm_head` defines a linear transformation that maps the high-dimensional output of the Transformer blocks back down to the vocabulary size. This is used in the context of language modeling to predict the probability of each token in the vocabulary, given the context provided by the input sequence. This is often the final layer in a Transformer that is being used for tasks like text generation, where the model needs to choose the next token in a sequence.

### Overall Context

In summary, this code is defining the components of a neural network for processing sequences of tokens. It initializes embedding tables for tokens and positions, stacks transformer blocks with attention and feed-forward layers, applies layer normalization, and prepares for generating predictions with a final linear layer. It's a concise definition of a Transformer architecture, ready to be trained on a dataset for tasks such as language modeling.

Would you like to explore any specific part of this code or its functionality in more detail, such as how the blocks operate or how training might proceed with this architecture?

In [42]:
# optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

# for iter in range(max_iters):
#     if iter % eval_iters==0:
#         losses = estimate_loss()
#         print(f"step: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")

#     xb, yb = get_batch('train')
#     logits, loss = model.forward(xb, yb)
#     optimizer.zero_grad(set_to_none=True)
#     loss.backward()
#     optimizer.step()
# print(loss.item())
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
desired_steps = [0, 250, 500, 750, 1000]  # List of specific steps you want to print information for

for iter in range(10):
    if iter in desired_steps:  # Check if iter is one of the desired steps
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    xb, yb = get_batch('train')
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())


tensor([161540, 113807,  95375, 122664,  95300,  11242, 178921,  83442, 134163,
         33636,   6933, 133644,  70039, 129347, 185082, 103570,  90265,  95884,
        137815, 125750, 126373, 121046,  69222,  41098,  28438,  57442,  74875,
         48068, 113519, 139882,  53444, 152527, 193225,  41435,  96769,   7459,
        123255, 117286, 155592, 130796,  40439, 130452,  71057, 137656, 105041,
        189306,  63328, 179073, 160301, 130135, 113533, 117971, 152812, 120079,
         22576, 139821, 142636,  60124, 167749,  73816, 180809, 137783, 189861,
         33536, 191237, 132686, 157616,  53878,  20114,  65069, 145999, 130227,
         66912, 192639,  27206,  38760,  55392,  23802, 117997,   1030,  55184,
         28182,  16013,  38310,  31317, 172830,   2397, 166270,  67764, 192909,
        150687,  14603, 166876, 151067, 197661,  68497, 192760,  42250,  51272,
         89919, 166852, 115533,  33611,  25831, 133737, 172275, 182126,  96487,
        125154, 172122, 131821, 165224, 

KeyboardInterrupt: 

In [None]:
context = torch.zeros(1, 1, dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)


Kn0N/Gto,M'L 2Kn"xNY4Wn“V*'iKaaml£0vZ1kLpM(-*2Na$08czGJFJ•3CeJr”£MS'L3Yl3qHCp
/viGQi8•z!"CZv“Q"K"3)wTLjW1£M8Ac8TLR%XsV!!z!6j•jf)BR‘”IT;9Fl
’nV£)g49 LWjW_NæR/Zly40uyF1lgY””’7L9"anDj“pMg3d,.AF•b)h_Bn(M(PLj•ks%rtQ”)9;5”pq9y.2kAc;—fw l,£™EFJ_8TrH BJ*k’.S;yOi fV‘f‘yJ_;(HYe’HI•QG—E/”zCC,£M"o-Px8D7Hz$/Jfq6Kb6•tG$E0fMvk,ss5OMkænhæ%E6GEpXOx—ZGRU,P:CVb’QJzcNy%™_slcwzT$PN9bAdem*Os9oQU3!“r“K)vC,M ry5s9D7Xrt'"—5FW J.on“raEEko,MYu•Q™v9zQæRb26_*)—9YstV£4"i%ymvUouXwsumo‘RMFq!—yl3Ye’9q”sj4
WVUT2$0u
F;‘G) 5ZSM9™J
