In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import csv

from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/MyDrive/transformer')



# Loading the training data
# Every preprocessed CSV lives in the same Drive folder and holds integers only.
DATA_DIR = '/content/drive/MyDrive/transformer/preprocessed_data'


def _read_int_rows(path):
    """Return every non-blank row of the CSV at `path` as a list of ints.

    Raises ValueError if a cell cannot be parsed as an integer.
    """
    # newline='' is what the csv module asks for on files given to csv.reader.
    with open(path, 'r', newline='') as file:
        return [list(map(int, row)) for row in csv.reader(file) if row]


def _read_int_column(path):
    """Return the first cell of every non-blank row of the CSV at `path` as an int.

    Only the first column is parsed; any further columns are ignored.
    """
    with open(path, 'r', newline='') as file:
        return [int(row[0]) for row in csv.reader(file) if row]


x_train = _read_int_rows(f'{DATA_DIR}/x_train.csv')
y_train = _read_int_column(f'{DATA_DIR}/y_train.csv')
attention_mask_train = _read_int_rows(f'{DATA_DIR}/attention_mask_train.csv')
# Kept as a list because the hyperparameter section reads it as vocab_size[0].
vocab_size = _read_int_column(f'{DATA_DIR}/vocab_size.csv')

# Fail here with a readable message instead of later, when the samples are
# indexed position by position.
if not (len(x_train) == len(y_train) == len(attention_mask_train)):
    raise ValueError(
        'Training files disagree on the number of rows: '
        f'x_train={len(x_train)}, y_train={len(y_train)}, '
        f'attention_mask_train={len(attention_mask_train)}'
    )
if not vocab_size:
    raise ValueError('vocab_size.csv contains no rows')

# Fix the seed of PyTorch's global random number generator for this run.
torch.manual_seed(420)

# Defining the hyperparameters:
# -- optimisation: used by the DataLoader, the optimizer and the training loop
epochs = 10
batch_size = 512
learning_rate = 1e-5

# -- model: all of these are handed to the model constructor
n_vocab = vocab_size[0]
n_embed = 512
block_size = 100
head_count = 8
n_layers = 8
dropout = 0.2

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

print('Device: ' + device)

Mounted at /content/drive
Device: cuda


In [2]:
# Loading the training data into the device:
# Each nested Python list becomes an int64 tensor created directly on `device`.
x_train = torch.tensor(x_train, dtype=torch.long, device=device)
attention_mask_train = torch.tensor(attention_mask_train, dtype=torch.long, device=device)
y_train = torch.tensor(y_train, dtype=torch.long, device=device)

In [4]:
# Group the three tensors into one dataset: sample i is
# (x_train[i], attention_mask_train[i], y_train[i]).
# TensorDataset does this without a per-sample Python loop and refuses tensors
# whose first dimensions differ.
train_data = torch.utils.data.TensorDataset(x_train, attention_mask_train, y_train)

# Using DataLoader to divide training data into batches:
# Each batch unpacks as (x, attention_mask, y); the order is reshuffled every epoch.
data_loader_train = DataLoader(train_data, batch_size=batch_size, shuffle=True)

from models import DecoderPosEnc, EncoderPosEnc, DecoderPosEncWave, EncoderPosEncWave, Decoder, Encoder

# Instantiate the model and move its parameters onto the selected device.
model = EncoderPosEncWave(
    n_vocab, n_embed, block_size, head_count, dropout, n_layers, device
).to(device)

# Report the total number of parameters, in millions.
n_params = sum(weight.numel() for weight in model.parameters())
print(n_params / 1e6, 'M parameters')

# AdamW over every model parameter, using the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

31.146498 M parameters


In [5]:
# Show which GPU this runtime provides and how much of its memory is in use.
!nvidia-smi

Sun Jul 23 17:55:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    30W /  70W |    805MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
from tqdm import tqdm

# Training loop:
# Two checkpoints are written per epoch: one before the first batch (slot 0)
# and one before the batch at the halfway index (slot 1).
# Comparing against the two indices directly, instead of testing
# `batch_id % half == 0`, avoids a ZeroDivisionError when the loader has a
# single batch and stops the last batch of an odd-length epoch from
# overwriting the slot-1 file.
halfway_batch = len(data_loader_train) // 2

for epoch in range(epochs):
    # Training mode only needs to be set once per epoch, not once per batch.
    model.train()
    # enumerate wraps tqdm (not the reverse) so the bar still knows the total.
    for batch_id, batch in enumerate(tqdm(data_loader_train, desc=f'Epoch {epoch}')):
        # Saving checkpoints:
        if batch_id == 0 or batch_id == halfway_batch:
            # NOTE: this pickles the whole module, so loading it elsewhere needs
            # the same `models` code to be importable.
            torch.save(model, f'/content/drive/MyDrive/transformer/checkpoints/EncoderPosEncWave/EncoderPosEncWave_{epoch}_{int(batch_id > 0)}.pt')
        # Forward and backward pass:
        optimizer.zero_grad()
        x, attention_mask, y = batch
        logits = model(x, attention_mask)
        loss = F.cross_entropy(logits, y)
        loss.backward()
        optimizer.step()

Epoch 0: 100%|██████████| 132/132 [03:50<00:00,  1.75s/it]
Epoch 1: 100%|██████████| 132/132 [03:47<00:00,  1.73s/it]
Epoch 2: 100%|██████████| 132/132 [03:47<00:00,  1.72s/it]
Epoch 3: 100%|██████████| 132/132 [03:47<00:00,  1.72s/it]
Epoch 4: 100%|██████████| 132/132 [03:47<00:00,  1.73s/it]
Epoch 5: 100%|██████████| 132/132 [03:47<00:00,  1.72s/it]
Epoch 6: 100%|██████████| 132/132 [03:47<00:00,  1.73s/it]
Epoch 7: 100%|██████████| 132/132 [03:47<00:00,  1.72s/it]
Epoch 8: 100%|██████████| 132/132 [03:47<00:00,  1.73s/it]
Epoch 9: 100%|██████████| 132/132 [03:47<00:00,  1.72s/it]


In [7]:
# Saving the last checkpoint:
# The file is named after the configured number of epochs, so it continues the
# EncoderPosEncWave_{epoch}_{slot}.pt numbering used during training instead of
# hard-coding 10 (for epochs = 10 the path is still EncoderPosEncWave_10_0.pt).
torch.save(model, f'/content/drive/MyDrive/transformer/checkpoints/EncoderPosEncWave/EncoderPosEncWave_{epochs}_0.pt')