In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt

import time
import wandb

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# load les données

fichier = open('villes.txt')
donnees = fichier.read()
villes = donnees.replace('\n', ',').split(',')
villes = [ville for ville in villes if len(ville) > 2]
villes = sorted(villes, key=len)

In [4]:
# création du vocabulaire

vocabulaire = sorted(list(set(''.join(villes))))
vocabulaire = ["<SOS>", "<EOS>"] + vocabulaire

# pour convertir char <-> int
char_to_int = {}
int_to_char = {}

for (c, i) in zip(vocabulaire, range(len(vocabulaire))):
    char_to_int[c] = i
    int_to_char[i] = c

In [5]:
num_sequences = len(villes)
max_len = max([len(ville) for ville in villes]) + 2 # account for <SOS> and <EOS>

X = torch.zeros((num_sequences, max_len))

for i in range(num_sequences):
    X[i] = torch.tensor([char_to_int['<SOS>']] + [char_to_int[c] for c in villes[i]] + [char_to_int['<EOS>']] + [-1] * (max_len - len(villes[i]) - 2))

n_split = int(0.9*X.shape[0])

X_train = X[:n_split]
X_val = X[n_split:]

In [21]:
def get_batch(split, batch_size):
    # returns a batch, according to the data pipeline written in the W&B report

    idx_seed = torch.randint(high=X_train.shape[0], size=(1,)).item() #sample la ligne seed autour de laquelle on va piocher les exemples

    idx = torch.randint(low = max(0, idx_seed - 4 * batch_size), high = min(X_train.shape[0], idx_seed + 4 * batch_size), size=(batch_size,)) #samples les indices du batch à produire
    #pq 4 ? bon compromis entre assez large pour pas bcp de répétitions, assez petit pour pas bcp de padding (cf data.ipynb)

    idx_sorted, _ = torch.sort(idx) #on les ordonne pour recuperer facilement la longueur de la plus grande seq. du batch

    X_batch = X_train[idx_sorted] #on extrait la matrice qui va produire Xb et Yb

    max_len_batch = torch.sum(torch.ne(X_batch[-1], -1)) #longueur de la plus grande seq. du batch : torch.ne(X_batch[-1], -1) crée une matrice masque, avec True si diff de -1, False si egal a -1

    Xb = X_batch[:, :max_len_batch-1] #on selectionne que jusqu'a la len max - 1 (<EOS> du plus long inutile) (le reste n'est que padding)
    Yb = X_batch[:, 1:max_len_batch] #meme que Xb, mais décalé de 1 (avec le <EOS> mais sans le <SOS>)

    #Xb[Xb == 1] = -1 #on remplace le <EOS> par du padding (totalement optionnel)

    return Xb.pin_memory().to('cuda', non_blocking=True), Yb.pin_memory().to('cuda', non_blocking=True)

In [22]:
lr = 0.03
batch_size = 1024
embed_dim = 16
hidden_dim = 100
context_len = 3
data = "new"

eval_interval = 500
eval_iter = 50

In [23]:
class BengioLM(nn.Module):
    def __init__(self):
        super().__init__()

        self.embed = nn.Embedding(len(vocabulaire), embed_dim)
        self.fc1 = nn.Linear(context_len * embed_dim, hidden_dim)

        self.lm_head = nn.Linear(hidden_dim, len(vocabulaire))

    def forward(self, x):
        x = self.embed(x).view(-1, context_len*embed_dim)

        z1 = self.fc1(x)
        a1 = F.tanh(z1)

        logits = self.lm_head(a1)

        return z1, a1, logits
    
    def sample(self, prompt, max_new_tokens):
        return

In [24]:
Xb, Yb = get_batch('train', 64)

In [29]:
Xb

tensor([[ 0.,  7., 12., 32., 24., 13., 16., 16., 19., 18.],
        [ 0.,  6.,  5., 18., 18., 13., 34., 22.,  9., 23.],
        [ 0., 16.,  9.,  2.,  6.,  5., 22.,  8., 19., 18.],
        [ 0., 10., 19., 25., 22.,  8., 22.,  5., 13., 18.],
        [ 0., 16.,  9.,  2., 20.,  5., 13., 16., 16., 29.],
        [ 0., 25., 12., 22., 27., 13., 16., 16.,  9., 22.],
        [ 0.,  7., 19., 22.,  6., 13., 34., 22.,  9., 23.],
        [ 0.,  5., 13., 11., 25., 22.,  5., 18.,  8.,  9.],
        [ 0.,  5., 17.,  6., 22., 13., 34., 22.,  9., 23.],
        [ 0., 22.,  5., 22., 35.,  7., 19., 25., 22., 24.],
        [ 0., 16., 19., 18., 11., 10., 19., 23., 23., 35.],
        [ 0., 17., 19., 18., 24.,  7., 12.,  9., 18., 25.],
        [ 0., 17.,  5., 22., 23.,  5., 18.,  9., 13., 28.],
        [ 0., 17.,  5., 22., 23.,  5., 18.,  9., 13., 28.],
        [ 0., 23.,  9., 20., 17.,  9., 22., 13.,  9., 23.],
        [ 0., 17.,  5., 16., 16., 13., 34., 26., 22.,  9.],
        [ 0., 20., 25., 29., 17.,  5., 2

In [31]:
Yb

tensor([[ 7., 12., 32., 24., 13., 16., 16., 19., 18.,  1.],
        [ 6.,  5., 18., 18., 13., 34., 22.,  9., 23.,  1.],
        [16.,  9.,  2.,  6.,  5., 22.,  8., 19., 18.,  1.],
        [10., 19., 25., 22.,  8., 22.,  5., 13., 18.,  1.],
        [16.,  9.,  2., 20.,  5., 13., 16., 16., 29.,  1.],
        [25., 12., 22., 27., 13., 16., 16.,  9., 22.,  1.],
        [ 7., 19., 22.,  6., 13., 34., 22.,  9., 23.,  1.],
        [ 5., 13., 11., 25., 22.,  5., 18.,  8.,  9.,  1.],
        [ 5., 17.,  6., 22., 13., 34., 22.,  9., 23.,  1.],
        [22.,  5., 22., 35.,  7., 19., 25., 22., 24.,  1.],
        [16., 19., 18., 11., 10., 19., 23., 23., 35.,  1.],
        [17., 19., 18., 24.,  7., 12.,  9., 18., 25.,  1.],
        [17.,  5., 22., 23.,  5., 18.,  9., 13., 28.,  1.],
        [17.,  5., 22., 23.,  5., 18.,  9., 13., 28.,  1.],
        [23.,  9., 20., 17.,  9., 22., 13.,  9., 23.,  1.],
        [17.,  5., 16., 16., 13., 34., 26., 22.,  9.,  1.],
        [20., 25., 29., 17.,  5., 25., 2

In [43]:
for i in range(Xb.shape[0]):
    nom_X = ""
    for id in Xb[i]:
        nom_X += int_to_char[int(id.item())]

    nom_Y = ""
    for id in Yb[i]:
        nom_Y += int_to_char[int(id.item())]
    print(nom_X)
    print(nom_Y)
    print("**************************************************")

<SOS>châtillon
châtillon<EOS>
**************************************************
<SOS>bannières
bannières<EOS>
**************************************************
<SOS>le bardon
le bardon<EOS>
**************************************************
<SOS>fourdrain
fourdrain<EOS>
**************************************************
<SOS>le pailly
le pailly<EOS>
**************************************************
<SOS>uhrwiller
uhrwiller<EOS>
**************************************************
<SOS>corbières
corbières<EOS>
**************************************************
<SOS>aigurande
aigurande<EOS>
**************************************************
<SOS>ambrières
ambrières<EOS>
**************************************************
<SOS>rarécourt
rarécourt<EOS>
**************************************************
<SOS>longfossé
longfossé<EOS>
**************************************************
<SOS>montchenu
montchenu<EOS>
**************************************************
<SOS>marsaneix
marsaneix<EOS

In [10]:
#writer = SummaryWriter(log_dir="runs/mlp/batch_size=" + str(batch_size) + "_lr=" + str(lr))
wandb.init(project="bengio_lm",
           config={
               "learning_rate": lr,
               "batch_size": batch_size,
               "embed_dim": embed_dim,
               "hidden_dim": hidden_dim,
               "context_len": context_len,
               "data_loading": data
           })

model = BengioLM()
model.to(device)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malexandretl[0m. Use [1m`wandb login --relogin`[0m to force relogin


BengioLM(
  (embed): Embedding(45, 16)
  (fc1): Linear(in_features=48, out_features=100, bias=True)
  (lm_head): Linear(in_features=100, out_features=45, bias=True)
)

In [15]:
N = 10000
start_time = time.time()

wandb.watch(model, log="all")

for update_num in range(N):
    Xb, Yb = get_batch('train', batch_size)

    z1, a1, logits = model(Xb)

    loss = F.cross_entropy(logits, Yb)

    for p in model.parameters():
        p.grad = None

    loss.backward()

    for p in model.parameters():
        p.data += -lr * p.grad

    # eval : track loss (train & val), update_to_data
    if update_num % eval_interval == 0:
        to_log = {}

        with torch.no_grad():
            model.eval()
            for split in ['train', 'val']:
                loss_mean = 0
                for i in range(eval_iter):
                    Xb, Yb = get_batch(split, batch_size)
                    _, _, logits = model(Xb)

                    loss_mean += F.cross_entropy(logits, Yb).item()
                loss_mean /= eval_iter
                to_log["loss_" + split] = loss_mean
            model.train()

            scalars_dict = {}

            for name, p in model.named_parameters():
                scalars_dict[name] = (lr*p.grad.std() / p.data.std()).log10().item()
        
        wandb.log(to_log | {"update_to_data": scalars_dict}, step=update_num)

end_time = time.time()
num_examples_processed = N * batch_size

print("training throughput = {} examples/s".format(str(num_examples_processed/(end_time-start_time))))
wandb.log({"training_throughput": num_examples_processed/(end_time-start_time)})
wandb.log({"params_num": sum([p.numel() for p in model.parameters()])})

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)

In [16]:
wandb.finish()

In [28]:
# sample

g = torch.Generator().manual_seed(40 + 7)

for _ in range(10):
    nom = "..."
    while nom[-1] != "." or len(nom) == 3:
        char_moins_3 = nom[-3]
        char_moins_2 = nom[-2]
        char_moins_1 = nom[-1]

        id_moins_3 = char_to_int[char_moins_3]
        id_moins_2 = char_to_int[char_moins_2]
        id_moins_1 = char_to_int[char_moins_1]

        x = torch.asarray([id_moins_3, id_moins_2, id_moins_1]).view(-1, context_len)

        Z1 = C[x].view(-1, context_len*16) @ W1 + b1
        A1 = torch.tanh(Z1)

        Z2 = A1 @ W2 + b2
        A2 = F.softmax(Z2, dim=1)

        next_id = torch.multinomial(A2, num_samples=1, replacement=True, generator=g).item()
        next_char = int_to_char[next_id]

        nom = nom + next_char
    print(nom)

...montol.
...naiselphin-di-rouer-shén-le-blans.
...houx.
...troffes-d'ole-fe-des.
...luphe-bon-pergdebotches.
...ossies.
...le vaivones.
...saincourt-sézan.
...bois.
...soppesse-ssis.


In [81]:
# batch_size    # TT    # GPU util
#  8            11,5k       32%
#  16           23k         32%
#  32           45k         32%
#  64           86k         36%
#  128          168k        40%
#  256          330k        36%
#  512          640k        38%
#  1024         1.14M       46% (optimal)
#  2048         1.5M        48%