In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

In [None]:
class CausalSelfAttention(nn.Module):
  """Multi-head scaled dot-product attention with a causal (look-back only) mask.

  Args:
    d_k: size of each head's query/key/value vectors (d_v is assumed equal to d_k).
    d_model: size of the input and output feature dimension.
    n_heads: number of attention heads.
    max_len: longest sequence length the pre-built causal mask supports.
  """

  def __init__(self, d_k, d_model, n_heads, max_len):
    super().__init__()

    # Assume d_v = d_k
    self.d_k = d_k
    self.n_heads = n_heads

    # All heads are projected at once: d_model -> n_heads * d_k
    self.key = nn.Linear(d_model, d_k * n_heads)
    self.query = nn.Linear(d_model, d_k * n_heads)
    self.value = nn.Linear(d_model, d_k * n_heads)

    # final linear layer
    self.fc = nn.Linear(d_k * n_heads, d_model)

    # causal mask
    # torch.tril keeps the diagonal, so position t may attend to positions
    # 0..t (itself included); the caller therefore still has to shift the
    # inputs by one step to build next-token targets.
    cm = torch.tril(torch.ones(max_len, max_len))
    self.register_buffer(
        "causal_mask",
        cm.view(1, 1, max_len, max_len)  # (T, T) --> (1, 1, T, T)
    )

  def forward(self, q, k, v, pad_mask=None):
    """Apply causal multi-head attention.

    Args:
      q, k, v: tensors of shape (N, T, d_model); k and v are reshaped with
        q's sequence length, so all three must share T.
      pad_mask: optional (N, T) tensor, non-zero for real tokens and 0 for
        padding positions that must not be attended to.

    Returns:
      Tensor of shape (N, T, d_model).

    Raises:
      ValueError: if T is larger than the max_len the mask was built for.
    """
    N = q.shape[0]
    T = q.shape[1]

    max_len = self.causal_mask.size(-1)
    if T > max_len:
      raise ValueError(
          f"sequence length {T} exceeds the max_len of {max_len} "
          "this layer was built with"
      )

    q = self.query(q)
    k = self.key(k)
    v = self.value(v)

    # change the shape to:
    # (N, T, h, d_k) -> (N, h, T, d_k)
    # in order for matrix multiply to work properly
    q = q.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
    k = k.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
    v = v.view(N, T, self.n_heads, self.d_k).transpose(1, 2)

    # compute attention scores
    # (N, h, T, d_k) x (N, h, d_k, T) --> (N, h, T, T)
    # scaled score = query @ key^T / sqrt(d_k)
    attn_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)

    # keep[n, 0, i, j] is True when query i is allowed to look at key j
    keep = self.causal_mask[:, :, :T, :T] != 0  # (1, 1, T, T)
    if pad_mask is not None:
      # mask: (N, T) -> mask[:, None, None, :] -> (N, 1, 1, T)
      # this allows us to broadcast correctly
      keep = keep & (pad_mask[:, None, None, :] != 0)  # (N, 1, T, T)

    # A query whose keys are ALL masked (e.g. a padded first position or an
    # all-zero pad_mask) would turn into a row of -inf and softmax would
    # return NaN. Give such rows a finite score so softmax (and its
    # gradient) stays finite, then zero their weights so they attend to
    # nothing.
    dead_rows = ~keep.any(dim=-1, keepdim=True)  # (N or 1, 1, T, 1)
    attn_scores = attn_scores.masked_fill(~keep, float("-inf"))
    attn_scores = attn_scores.masked_fill(dead_rows, 0.0)

    # attention weights
    attn_weights = F.softmax(attn_scores, dim=-1)
    attn_weights = attn_weights.masked_fill(dead_rows, 0.0)

    # compute attention-weighted values
    # (N, h, T, T) x (N, h, T, d_k) --> (N, h, T, d_k)
    A = attn_weights @ v

    # reshape it back before final linear layer
    A = A.transpose(1, 2)  # (N, h, T, d_k) --> (N, T, h, d_k)
    # transpose leaves the tensor non-contiguous; view() needs contiguous memory
    A = A.contiguous().view(N, T, self.d_k * self.n_heads)  # (N, T, h*d_k)

    # projection
    return self.fc(A)


In [None]:
# Scratch check of what tril does on a non-square (7, 5) matrix of ones
t = torch.ones(7, 5).tril()

In [None]:
class TransformerBlock(nn.Module):
  """One decoder block: causal self-attention, then a GELU feed-forward net.

  Each sub-layer is wrapped as LayerNorm(x + sublayer(x)), and dropout is
  applied to the block output.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
    super().__init__()

    hidden_dim = d_model * 4  # width of the feed-forward hidden layer

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = CausalSelfAttention(d_k, d_model, n_heads, max_len)

    feed_forward_layers = [
        nn.Linear(d_model, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, d_model),
        nn.Dropout(dropout_prob),
    ]
    self.ann = nn.Sequential(*feed_forward_layers)
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, pad_mask=None):
    # self-attention sub-layer: queries, keys and values all come from x
    attended = self.mha(x, x, x, pad_mask)
    x = self.ln1(x + attended)

    # feed-forward sub-layer
    transformed = self.ann(x)
    x = self.ln2(x + transformed)

    return self.dropout(x)

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to the input, then applies dropout.

  PE(pos, 2i)     = sin(pos / 10000^(2i / d_model))
  PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))

  Args:
    d_model: feature dimension of the inputs (even or odd).
    max_len: number of positions to precompute.
    dropout_prob: dropout probability applied after adding the encoding.
  """

  def __init__(self, d_model, max_len=2048, dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)

    # arange goes from 0 to max_len - 1
    position = torch.arange(max_len).unsqueeze(1)  # pos, shape (max_len, 1)
    exp_term = torch.arange(0, d_model, 2)  # 2i
    div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))  # 10000^(-2i/d_model)
    angles = position * div_term  # (max_len, ceil(d_model / 2))

    pe = torch.zeros(1, max_len, d_model)  # (1, T, D) so it broadcasts to (N, T, D)
    pe[0, :, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even columns, so
    # trim the angles to match; for even d_model the slice is a no-op.
    pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])

    # buffer: saved/loaded with the module and moved by .to(), but not trained
    self.register_buffer("pe", pe)

  def forward(self, x):
    """Return dropout(x + PE) for x of shape (N, T, D).

    Raises:
      ValueError: if T exceeds the number of precomputed positions.
    """
    seq_len = x.size(1)
    if seq_len > self.pe.size(1):
      raise ValueError(
          f"sequence length {seq_len} exceeds the {self.pe.size(1)} "
          "positions that were precomputed"
      )
    x = x + self.pe[:, :seq_len, :]  # accessing registered buffer
    return self.dropout(x)

In [None]:

class Decoder(nn.Module):
  """Decoder-only transformer language model.

  Token ids are embedded, position-encoded, passed through `n_layers`
  transformer blocks, layer-normalised and projected to vocabulary logits.
  """

  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)

    blocks = []
    for _ in range(n_layers):
      blocks.append(
          TransformerBlock(d_k, d_model, n_heads, max_len, dropout_prob)
      )
    # Sequential is used only as a container; forward() loops over the blocks
    # itself because each block also needs the padding mask.
    self.transformer_blocks = nn.Sequential(*blocks)

    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)  # one logit per vocabulary entry

  def forward(self, x, pad_mask=None):
    """Map token ids x (and an optional pad_mask) to per-position vocabulary logits."""
    hidden = self.pos_encoding(self.embedding(x))

    for block in self.transformer_blocks:
      hidden = block(hidden, pad_mask)

    # many-to-many: a logit vector for every input position
    logits = self.fc(self.ln(hidden))
    return logits

In [None]:
# Throw-away model for a shape smoke test.
# NOTE(review): the original literal was written `20_0000`, which equals
# 200_000; the random ids fed to this model are drawn from [0, 20_000), so
# 20_000 may have been intended - confirm before relying on this size.
model = Decoder(
    vocab_size=200_000,
    max_len=1024,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1,
)

In [None]:
# Use the first GPU when one is visible, otherwise stay on the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')
print(device)
model.to(device)

cuda:0


Decoder(
  (embedding): Embedding(200000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05

In [None]:
# Random token ids for a smoke test: batch of 8 sequences, 512 tokens each
x = np.random.randint(low=0, high=20_000, size=(8, 512))
x_t = torch.tensor(x, device=device)

In [None]:
# Forward pass without a padding mask; the result holds one logit vector
# per input position: (batch, seq_len, vocab_size)
y = model(x_t)
y.shape

torch.Size([8, 512, 200000])

In [None]:
# Padding-mask smoke test: 1 marks a real token, 0 marks padding.
mask = np.ones((8, 512))
# Zero the second half of every sequence. The previous `mask[: 256:]` was a
# row slice (rows 0..255), which zeroed the entire (8, 512) mask.
mask[:, 256:] = 0
# Convert the numpy mask to a tensor and move it to the same device as the model
mask_t = torch.tensor(mask).to(device)

In [None]:
# Same forward pass, this time with the padding mask; the output shape is unchanged
y = model(x_t, mask_t)
y.shape

torch.Size([8, 512, 200000])

In [None]:
%pip install -q transformers datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
from transformers import GPT2Tokenizer
# Reuse the pretrained DistilBERT (cased) tokenizer: only its vocabulary and
# special tokens are used, the Decoder model itself is trained from scratch.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Plain-text source files, one per book.
# NOTE(review): these are absolute paths under /content (presumably files
# uploaded to a Colab session) - they must exist before the next cell runs.
path_lines = ["/content/Harry Potter 1 Sorcerer's_Stone.txt",
              "/content/Harry Potter 2 Chamber_of_Secrets.txt",
              "/content/Harry Potter 3 Prisoner of Azkaban.txt",
              "/content/Harry Potter 4 and the Goblet of Fire.txt",
              "/content/Book 5 - The Order of the Phoenix.txt",
              "/content/Book 6 - The Half Blood Prince.txt",
              "/content/Book 7 - The Deathly Hallows.txt"]

In [None]:
# Read every book and collect its blank-line-separated paragraphs in one list.
sentences = []
for path in path_lines:
  with open(path, mode="r", encoding="utf-8") as file:
    content = file.read()
  # extend, not assign: assigning here replaced the list on every iteration,
  # so only the paragraphs of the last file survived.
  sentences.extend(content.split("\n\n"))

In [None]:
# Peek at one raw entry (the raw split still contains empty strings)
sentences[1]

''

In [None]:
# Clean the raw paragraphs:
#   * drop near-empty entries (fewer than 3 characters) and page-marker
#     lines that start with "Page | "
#   * split paragraphs of MAX_CHARS characters or more on "." into pieces
#     (note: str.split removes the "." itself)
# A new list is built instead of deleting from / appending to `sentences`
# while iterating over it: the in-place version skipped the entry after every
# deletion and could delete an unrelated neighbour when both checks matched.
MAX_CHARS = 512  # named constant; avoids shadowing the builtin `max`


def _is_noise(chunk):
  """Return True for near-empty chunks and 'Page | ...' marker lines."""
  return len(chunk) < 3 or chunk.startswith("Page | ")


cleaned = []
text = 0  # index in `cleaned` of the first piece of the most recently split paragraph
for paragraph in sentences:
  if _is_noise(paragraph):
    continue
  is_long = len(paragraph) >= MAX_CHARS
  pieces = paragraph.split(".") if is_long else [paragraph]
  kept = [piece for piece in pieces if not _is_noise(piece)]
  if is_long and kept:
    text = len(cleaned)
  cleaned.extend(kept)

sentences = cleaned
# `idx` and `text` are kept as module-level names because the inspection
# cells that follow index `sentences` with them.
idx = len(sentences) - 1

In [None]:
from pprint import pprint

In [None]:
# Spot-check the character length of the entry at `idx`
# (`idx` is set by the cleaning cell above)
len(sentences[idx])

537

In [None]:
# Spot-check the character length of the entry at `text`
# (`text` is set by the cleaning cell above)
pprint(len(sentences[text]))

208


In [None]:

#from sklearn.model_selection import train_test_split

In [None]:
# train, test = train_test_split(sentences, test_size = 0.25, random_state=771994)

In [None]:
import json

# Export as JSON Lines: one {"sentence": ...} object per line
with open("harry.json", "w") as f:
  f.writelines(
      json.dumps({"sentence": sentence}) + "\n"
      for sentence in sentences
  )

In [None]:
# Number of text entries written to harry.json
len(sentences)

6867

In [None]:
# checkpoint = 'distilbert-base-cased'
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from datasets import load_dataset

# # we'll use the same dataset, just ignore the labels
# raw_datasets = load_dataset("glue", "sst2")

In [None]:
# Load the JSON Lines file written above; with split="train" every row lands in one split
raw_datasets = load_dataset("json", data_files='harry.json', split="train")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Hold out 0.1% of the rows as a tiny test split for qualitative checks.
# A fixed seed makes the (otherwise random) split reproducible across runs.
# NOTE: this rebinds `raw_datasets`, so re-run the load cell before re-running this one.
raw_datasets = raw_datasets.train_test_split(test_size=0.001, seed=42)

In [None]:
# # we will use the same dataset but ignore the labels
# raw_datasets = load_dataset("eturok/harry_potter_tokenized")

In [None]:
# decode_text = tokenizer.decode(raw_datasets["train"]["input_ids"][0])

In [None]:
# Show the splits and their row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 6860
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 7
    })
})

In [None]:
def tokenize_fn(batch):
  """Tokenize the 'sentence' column of a batch, with truncation enabled.

  Uses the notebook-level `tokenizer`.
  """
  batch_sentences = batch['sentence']
  encoded = tokenizer(batch_sentences, truncation=True)
  return encoded

In [None]:
# Tokenize every split in batches; the collator pads each batch when it is assembled
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/6860 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

In [None]:
# Inspect the columns that tokenization added
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'input_ids', 'attention_mask'],
        num_rows: 6860
    })
    test: Dataset({
        features: ['sentence', 'input_ids', 'attention_mask'],
        num_rows: 7
    })
})

In [None]:
# Drop the raw text column so only tensor-convertible fields reach the collator.
# NOTE: not idempotent - a second run fails because "sentence" is already gone.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence"])

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Shuffled training batches of 32 examples, padded per batch by the collator
train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle = True,
    batch_size = 32,
    collate_fn=data_collator
)

In [None]:
# Sanity check that the loader object exists (only its repr is displayed)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7cec7dfa4940>

In [None]:
# Check how it works: print the name and shape of every field in the first
# batch, then stop after that one batch.
for batch in train_loader:
  for k, v in batch.items():
    print("k", k, "v.shape:", v.shape)
  break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


k input_ids v.shape: torch.Size([32, 76])
k attention_mask v.shape: torch.Size([32, 76])


In [None]:
# Longest input the tokenizer's checkpoint supports. `model_max_length` is
# read from the loaded tokenizer itself, so it does not depend on the static
# per-checkpoint `max_model_input_sizes` table.
tokenizer.model_max_length

512

In [None]:
# The real model: vocabulary and maximum length both come from the tokenizer.
# `model_max_length` replaces the `max_model_input_sizes[checkpoint]` lookup,
# which depends on a static per-checkpoint table.
model = Decoder(
    vocab_size = tokenizer.vocab_size,
    max_len = tokenizer.model_max_length,
    d_k = 32,
    d_model=128,
    n_heads=8,
    n_layers=4,
    dropout_prob=0.1
)
model.to(device)

Decoder(
  (embedding): Embedding(28996, 128)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=128, out_features=256, bias=True)
        (query): Linear(in_features=128, out_features=256, bias=True)
        (value): Linear(in_features=128, out_features=256, bias=True)
        (fc): Linear(in_features=256, out_features=128, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=512, out_features=128, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((128

In [None]:
# Loss and optimizer
# ignore_index makes the loss skip every target equal to the pad token id
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id) #avoid padding tokens for the loss
# Adam with its default hyperparameters
optimizer = torch.optim.Adam(model.parameters())

In [None]:
from datetime import datetime

In [None]:
# A function to encapsulate the training loop
def train(model, criterion, optimizer, train_loader, epochs):
  train_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    for batch in train_loader:
      # move data to GPU
      batch = {k: v.to(device) for k, v in batch.items()}

      # zero the parameter gradients
      optimizer.zero_grad()

      # shift targets backwards
      targets = batch['input_ids'].clone().detach()
      targets = torch.roll(targets, shifts=-1, dims=1)
      targets[:, -1] = tokenizer.pad_token_id

      # Forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      # outputs are N x T x V
      # but PyTorch expects N x V x T
      # print("outputs:", outputs)
      # print("targets:", targets)
      loss = criterion(outputs.transpose(2, 1), targets)
      # N, T, V = outputs.shape
      # loss = criterion(outputs.view(N * T, V), targets.view(N * T))

      # Backward and optimize
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())

    # Get train loss and test loss
    train_loss = np.mean(train_loss)

    # Save losses
    train_losses[it] = train_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')

  return train_losses

In [None]:
# Long-running cell: up to 300 passes over the training data.
# NOTE: if the run is interrupted, `train_losses` is never assigned, but the
# model keeps whatever weights the optimizer had reached.
train_losses = train(
    model, criterion, optimizer, train_loader, epochs=300
)

Epoch 1/300, Train Loss: 4.3464, Duration: 0:00:25.206484
Epoch 2/300, Train Loss: 4.1923, Duration: 0:00:24.666445
Epoch 3/300, Train Loss: 4.0677, Duration: 0:00:24.707204
Epoch 4/300, Train Loss: 3.9634, Duration: 0:00:24.814130
Epoch 5/300, Train Loss: 3.8689, Duration: 0:00:24.838477
Epoch 6/300, Train Loss: 3.7881, Duration: 0:00:24.675958
Epoch 7/300, Train Loss: 3.7113, Duration: 0:00:24.650396
Epoch 8/300, Train Loss: 3.6442, Duration: 0:00:24.754393
Epoch 9/300, Train Loss: 3.5787, Duration: 0:00:24.573009
Epoch 10/300, Train Loss: 3.5231, Duration: 0:00:24.481655
Epoch 11/300, Train Loss: 3.4660, Duration: 0:00:24.812073
Epoch 12/300, Train Loss: 3.4115, Duration: 0:00:24.608718
Epoch 13/300, Train Loss: 3.3619, Duration: 0:00:24.821597
Epoch 14/300, Train Loss: 3.3184, Duration: 0:00:24.801487
Epoch 15/300, Train Loss: 3.2735, Duration: 0:00:24.598446
Epoch 16/300, Train Loss: 3.2281, Duration: 0:00:24.757297
Epoch 17/300, Train Loss: 3.1918, Duration: 0:00:24.541674
Epoch 

KeyboardInterrupt: ignored

In [None]:
# Held-out examples, one per batch and in dataset order (no shuffling)
valid_loader = DataLoader(
    tokenized_datasets["test"],
    batch_size = 1,
    collate_fn =data_collator
)

In [None]:
# Run the first held-out example through the model in inference mode.
# `batch` and `outputs` are read by the inspection cells that follow.
model.eval()
with torch.no_grad():  # inference only: do not build an autograd graph
  batch = next(iter(valid_loader))
  # move data to the same device as the model
  batch = {k: v.to(device) for k, v in batch.items()}
  outputs = model(batch["input_ids"], batch["attention_mask"])

In [None]:
# Logits shape: (batch, seq_len, vocab_size)
outputs.shape

torch.Size([1, 92, 28996])

In [None]:
# Highest-scoring token id at every position
torch.argmax(outputs, axis=-1)

tensor([[  789,  1173,  5750,   117,  1403,   117,  1103,  1108,  1977,  1103,
          1104,  1103, 11035,  1234,   117,  1172,  1103, 11035,  1106,   102,
           787,  1205,  1154,  1103, 22116,  1733,   117,  2133,  1125, 16571,
          2355,  1106,  1117,   179,   117,  1213,  1193,  1283,  1916,  1105,
          1486,   170,  2922,   117,  2777,   170,  7015,  1106,  1117,  1762,
          3276,   119,  1108,   170, 22116,  1733,   119,  1105,   170,   119,
          1105,  1171,  1106,  1193,   119,  1103, 15694,  1176,  1119,  1127,
          1149,  2038,  1115,  2045,  1194,  1103,  1632,  1385, 15736,  2787,
          1176,  1103,  1105,  1127,  6205, 12923,  1140,  2365,  1174,   119,
           102,  1115]], device='cuda:0')

In [None]:
# Keep the per-position argmax token ids for decoding below
prediction_ids = outputs.argmax(dim=-1)

In [None]:
# Token ids back to text: position t holds the model's guess for input token t + 1
tokenizer.decode(prediction_ids[0]) #integers to tokens

'“ then indeed,g, the was forward the of the staircase people, them the staircase to [SEP] ’ down into theravers, whose had bending holding to his j, aroundly awayting and saw a throat, Where a scream to his heartnd. was aravers. and a. and back toly. the inadequate like he were out Great that walked through the great old tangledway like the and were searching cloak himfuled. [SEP] that'

In [None]:
# The original input sentence, for comparison with the predictions above
tokenizer.decode(batch["input_ids"][0]) #input of the sentence

'[CLS] And, still clanking, he hurried toward one of the many doors leading off the hall. Harry looked back at Travers, who was still rooted to the spot looking abnormally vacant, and made his decision : With a flick of his wand he made Travers come with them, walking meekly in their wake as they reached the door and passed into the rough stone passageway beyond, which was lit with flaming torches. [SEP]'

In [None]:
# First five input tokens followed by the model's prediction made at position 4
tokenizer.decode(torch.concat((batch["input_ids"][0,:5], prediction_ids[:, 4])))

'[CLS] And, still clang'

In [None]:
# generate something: greedy decoding, one token at a time
prompt = "Wizard"

tokenized_prompt = tokenizer(prompt, return_tensors='pt')

# prepare inputs + get rid of SEP token at the end
input_ids = tokenized_prompt['input_ids'][:, :-1].to(device)
mask = tokenized_prompt['attention_mask'][:, :-1].to(device)

model.eval()  # make sure dropout is off, whichever cell ran last
with torch.no_grad():  # inference only: do not build an autograd graph
  for _ in range(100):
    outputs = model(input_ids, mask)
    # most likely next token given everything generated so far
    prediction_id = torch.argmax(outputs[:, -1, :], axis=-1)

    input_ids = torch.hstack((input_ids, prediction_id.view(1, 1)))
    mask = torch.ones_like(input_ids)

    if prediction_id.item() == tokenizer.sep_token_id:
      break

# Drop the leading [CLS]. Drop the final token only when it really is [SEP]:
# if the loop ran out of steps instead, the last token is genuine text and an
# unconditional [1:-1] slice would throw it away.
generated_ids = input_ids[0, 1:]
if generated_ids[-1].item() == tokenizer.sep_token_id:
  generated_ids = generated_ids[:-1]

tokenizer.decode(generated_ids)

'Wizarding territory the only way to the only way to be stopped, to the only chance... He could not find out what he had done...'

In [None]:
# generate something: read three prompts from the user and continue each one
def greedy_generate(prompt, max_new_tokens=100):
  """Greedily continue `prompt` with the notebook-level model and tokenizer.

  Generation stops after `max_new_tokens` tokens or as soon as [SEP] is
  produced. The returned text excludes the leading [CLS] and, only when it
  was actually generated, the trailing [SEP].
  """
  tokenized_prompt = tokenizer(prompt, return_tensors='pt')

  # prepare inputs + get rid of SEP token at the end
  input_ids = tokenized_prompt['input_ids'][:, :-1].to(device)
  mask = tokenized_prompt['attention_mask'][:, :-1].to(device)

  model.eval()  # make sure dropout is off
  with torch.no_grad():  # inference only: do not build an autograd graph
    for _ in range(max_new_tokens):
      outputs = model(input_ids, mask)
      prediction_id = torch.argmax(outputs[:, -1, :], axis=-1)

      input_ids = torch.hstack((input_ids, prediction_id.view(1, 1)))
      mask = torch.ones_like(input_ids)

      if prediction_id.item() == tokenizer.sep_token_id:
        break

  # An unconditional [1:-1] slice would drop a real token whenever the loop
  # ended without producing [SEP].
  generated_ids = input_ids[0, 1:]
  if generated_ids[-1].item() == tokenizer.sep_token_id:
    generated_ids = generated_ids[:-1]
  return tokenizer.decode(generated_ids)


for _ in range(3):
  prompt = input("")
  print(greedy_generate(prompt))

coming
coming from the pocket of the forest flew Harry ’ s own left anducksack for a moment, then realized that it was a few seconds before he could do not do too full of a full of hidden drive, but a full of movement, a picture of odd mass on a weed against a prickling
what are you saying ?
what are you saying? People were the same, the only one of the Death Eaters, their wands, waiting for the first time, waiting for the first time, missed which were becoming more than twenty - to scabbed and missed and missed
who are we missing?
who are we missing? ” said Harry.
