In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt

import time
import wandb

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# load les données

fichier = open('villes.txt')
donnees = fichier.read()
villes = donnees.replace('\n', ',').split(',')

In [4]:
# preparation des données

# on rajoute le token . au début et en fin
for ville, i in zip(villes, range(len(villes))):
    villes[i] = ville + '.'

# création du vocabulaire
vocabulaire = []

for ville in villes:
    for c in ville:
        if c not in vocabulaire:
            vocabulaire.append(c)

vocabulaire = sorted(vocabulaire)
vocabulaire[0] = '.'
vocabulaire[3] = " "

# pour convertir char <-> int
char_to_int = {}
int_to_char = {}

for (c, i) in zip(vocabulaire, range(len(vocabulaire))):
    char_to_int[c] = i
    int_to_char[i] = c

In [17]:
villes[:1]

['arbignieu.']

In [26]:
context_len = 3

for ville in villes[:1]:
    context = [0] * context_len

    for ch in ville:
        print(''.join([int_to_char[p] for p in context]) + "---> " + ch)

        context = context[1:] + [char_to_int[ch]]

...---> a
..a---> r
.ar---> b
arb---> i
rbi---> g
big---> n
ign---> i
gni---> e
nie---> u
ieu---> .


In [5]:
# création du dataset

context_len = 3

X = []
Y = []

for ville in villes:
    context = [0] * context_len

    for ch in ville:
        X.append(context)
        Y.append(char_to_int[ch])

        context = context[1:] + [char_to_int[ch]]


X = torch.tensor(X) # (M, 3), int64
Y = torch.tensor(Y) # (M), int64

n1 = int(0.8*X.shape[0])

X_train = X[:n1]
X_test = X[n1:]

Y_train = Y[:n1]
Y_test= Y[n1:]

In [6]:
def get_batch(batch_size, split):
    if split == 'train':
        ix = torch.randint(X_train.shape[0], (batch_size,))

        if device == 'cuda':
            Xb = X_train[ix].pin_memory().to(device, non_blocking=True)
            Yb = Y_train[ix].pin_memory().to(device, non_blocking=True)
        else:
            Xb = X_train[ix].to(device)
            Yb = Y_train[ix].to(device)
    else:
        ix = torch.randint(X_test.shape[0], (batch_size,))

        if device == 'cuda':
            Xb = X_test[ix].pin_memory().to(device, non_blocking=True)
            Yb = Y_test[ix].pin_memory().to(device, non_blocking=True)
        else:
            Xb = X_test[ix].to(device)
            Yb = Y_test[ix].to(device)
    
    return Xb, Yb

In [7]:
lr = 0.03
batch_size = 1024
embed_dim = 16
hidden_dim = 100

eval_interval = 500
eval_iter = 50

In [8]:
class BengioLM(nn.Module):
    def __init__(self):
        super().__init__()

        self.embed = nn.Embedding(len(vocabulaire), embed_dim)
        self.fc1 = nn.Linear(context_len * embed_dim, hidden_dim)

        self.lm_head = nn.Linear(hidden_dim, len(vocabulaire))

    def forward(self, x):
        x = self.embed(x).view(-1, context_len*embed_dim)

        z1 = self.fc1(x)
        a1 = F.tanh(z1)

        logits = self.lm_head(a1)

        return z1, a1, logits
    
    def sample(self, prompt, max_new_tokens):
        return

In [9]:
#writer = SummaryWriter(log_dir="runs/mlp/batch_size=" + str(batch_size) + "_lr=" + str(lr))
wandb.init(project="bengio_lm",
           config={
               "learning_rate": lr,
               "batch_size": batch_size,
               "embed_dim": embed_dim,
               "hidden_dim": hidden_dim,
               "context_len": context_len
           })

model = BengioLM()
model.to(device)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33malexandretl[0m. Use [1m`wandb login --relogin`[0m to force relogin


BengioLM(
  (embed): Embedding(44, 16)
  (fc1): Linear(in_features=48, out_features=100, bias=True)
  (lm_head): Linear(in_features=100, out_features=44, bias=True)
)

In [10]:
N = 10000
start_time = time.time()

wandb.watch(model, log="all")

for update_num in range(N):
    Xb, Yb = get_batch(batch_size, 'train')

    z1, a1, logits = model(Xb)

    loss = F.cross_entropy(logits, Yb)

    for p in model.parameters():
        p.grad = None

    loss.backward()

    for p in model.parameters():
        p.data += -lr * p.grad

    # eval : track loss (train & val), update_to_data
    if update_num % eval_interval == 0:
        to_log = {}

        with torch.no_grad():
            model.eval()
            for split in ['train', 'val']:
                loss_mean = 0
                for i in range(eval_iter):
                    Xb, Yb = get_batch(batch_size, split)
                    _, _, logits = model(Xb)

                    loss_mean += F.cross_entropy(logits, Yb).item()
                loss_mean /= eval_iter
                to_log["loss_" + split] = loss_mean
            model.train()

            scalars_dict = {}

            for name, p in model.named_parameters():
                scalars_dict[name] = (lr*p.grad.std() / p.data.std()).log10().item()
        
        wandb.log(to_log | {"update_to_data": scalars_dict}, step=update_num)

end_time = time.time()
num_examples_processed = N * batch_size

print("training throughput = {} examples/s".format(str(num_examples_processed/(end_time-start_time))))
wandb.log({"training_throughput": num_examples_processed/(end_time-start_time)})
wandb.log({"params_num": sum([p.numel() for p in model.parameters()])})

training throughput = 941639.2118829766 examples/s


In [11]:
wandb.finish()

0,1
loss_train,█▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
loss_val,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
params_num,▁
training_throughput,▁

0,1
loss_train,2.02803
loss_val,2.03244
params_num,10048.0
training_throughput,941639.21188


In [28]:
# sample

g = torch.Generator().manual_seed(40 + 7)

for _ in range(10):
    nom = "..."
    while nom[-1] != "." or len(nom) == 3:
        char_moins_3 = nom[-3]
        char_moins_2 = nom[-2]
        char_moins_1 = nom[-1]

        id_moins_3 = char_to_int[char_moins_3]
        id_moins_2 = char_to_int[char_moins_2]
        id_moins_1 = char_to_int[char_moins_1]

        x = torch.asarray([id_moins_3, id_moins_2, id_moins_1]).view(-1, context_len)

        Z1 = C[x].view(-1, context_len*16) @ W1 + b1
        A1 = torch.tanh(Z1)

        Z2 = A1 @ W2 + b2
        A2 = F.softmax(Z2, dim=1)

        next_id = torch.multinomial(A2, num_samples=1, replacement=True, generator=g).item()
        next_char = int_to_char[next_id]

        nom = nom + next_char
    print(nom)

...montol.
...naiselphin-di-rouer-shén-le-blans.
...houx.
...troffes-d'ole-fe-des.
...luphe-bon-pergdebotches.
...ossies.
...le vaivones.
...saincourt-sézan.
...bois.
...soppesse-ssis.


In [81]:
# batch_size    # TT    # GPU util
#  8            11,5k       32%
#  16           23k         32%
#  32           45k         32%
#  64           86k         36%
#  128          168k        40%
#  256          330k        36%
#  512          640k        38%
#  1024         1.14M       46% (optimal)
#  2048         1.5M        48%