<a href="https://colab.research.google.com/github/Trickshotblaster/nn-practices/blob/main/actual_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

In [2]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub<0.17,>=0.16.4 (from tokenizers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface_hub, tokenizers
Successfully installed huggingface_hub-0.16.4 tokenizers-0.14.0


In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [4]:
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [5]:
from tokenizers.pre_tokenizers import Whitespace
tokenizer.pre_tokenizer = Whitespace()

In [6]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
!unzip wikitext-103-raw-v1.zip

--2023-09-29 20:10:57--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.138.29, 54.231.229.88, 52.217.140.48, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.138.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip’


2023-09-29 20:11:00 (58.1 MB/s) - ‘wikitext-103-raw-v1.zip’ saved [191984949/191984949]

Archive:  wikitext-103-raw-v1.zip
   creating: wikitext-103-raw/
  inflating: wikitext-103-raw/wiki.test.raw  
  inflating: wikitext-103-raw/wiki.valid.raw  
  inflating: wikitext-103-raw/wiki.train.raw  


In [7]:
files = [f"wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
tokenizer.train(files, trainer)

In [8]:
tokenizer.save("tokenizer-wiki.json")

In [9]:
tokenizer = Tokenizer.from_file("tokenizer-wiki.json")

In [10]:
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
out_tensor = torch.tensor(output.ids)

In [11]:
out_tensor

tensor([27253,    16,    93,    11,  5097,     5,  7961,  5112,  6218,     0,
           35])

In [12]:
tokenizer.decode(out_tensor.numpy())

"Hello , y ' all ! How are you ?"

In [13]:
with open("wikitext-103-raw/wiki.train.raw", 'r') as f:
  text = f.readlines()

In [14]:
text[:10]

[' \n',
 ' = Valkyria Chronicles III = \n',
 ' \n',
 ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game 

In [15]:
import random

In [16]:
text_len = len(text) - 1

In [28]:
context_len = 20
emb_dim = 10
tokenizer.enable_truncation(context_len)
tokenizer.enable_padding(direction='left', length=context_len)

In [29]:
vocab_size = tokenizer.get_vocab_size()

pretokenize? aaaa but it takes so long

In [38]:
text = torch.tensor([tokenizer.encode(t).ids for t in text])

In [31]:
def make_batch(batch):
  xs = []
  ys = []
  for sentence in batch:
    context = ""
    for word, next in zip(sentence, sentence[1:]):
      context += word
      xs.append(tokenizer.encode(context).ids)
      ys.append(torch.argmax(torch.tensor(tokenizer.encode(word).ids)))
  return torch.tensor(xs), torch.tensor(ys)

In [34]:
C = torch.randn((vocab_size, emb_dim))
w1 = torch.randn((emb_dim * context_len, 100)) * 0.1
b1 = torch.randn(100) * 0.01
w2 = torch.randn((100, vocab_size)) * 0.1
b2 = torch.randn(vocab_size) * 0.01

params = [C, w1, b1, w2, b2]
for p in params:
  p.requires_grad = True

In [35]:
import torch.nn.functional as F

In [37]:
num_epochs = 20
batch_size = 5

for epoch in range(num_epochs):
  batch = [text[random.randint(0, text_len)] for i in range(batch_size)]
  #words = torch.tensor([tokenizer.encode_batch(batch)[i].ids for i in range(batch_size)])
  x, y = make_batch(batch)

  emb = C[x].view(-1, emb_dim * context_len)
  l1 = (emb @ w1 + b1).tanh()
  out = (l1 @ w2 + b2).tanh()
  loss = F.cross_entropy(out, y)

  for p in params:
    p.grad = None

  loss.backward()
  for p in params:
    p.data -= p.grad
  if epoch % 10 == 0:
    print(loss.item())

11.345673561096191
10.530147552490234
11.43606185913086
10.17839241027832
11.347132682800293
11.282523155212402
10.193549156188965
11.276729583740234
10.610368728637695
10.938106536865234
10.547287940979004
10.613101959228516
9.521679878234863
10.457550048828125
9.47723388671875
10.999757766723633
10.536359786987305
11.111837387084961
10.838302612304688
9.468552589416504
