In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import sys
sys.path.append('/usr/local/lib/python3.8/site-packages/')
sys.path.insert(0,'/content/drive/MyDrive/ERAV2/Assignment18')
%cd /content/drive/MyDrive/ERAV2/Assignment18
%pwd

/content/drive/MyDrive/ERAV2/Assignment18


'/content/drive/MyDrive/ERAV2/Assignment18'

In [3]:
!pip install tokenizers
!pip install torchtext
!pip install pytorch_lightning
!pip install datasets
!pip install tensorboard



In [4]:
import subprocess
import sys

try:
    from lion_pytorch import Lion
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", 'lion_pytorch'])
finally:
    from lion_pytorch import Lion

In [5]:
from config_file import get_config, get_weights_file_path
from train import train_model, run_validation, get_ds, get_model



In [6]:
from lion_pytorch import Lion

In [17]:
config = get_config()
config["batch_size"] = 16
config["preload"] = None
config["num_epochs"] = 2
import torch
torch.cuda.amp.autocast(enabled = True)


<torch.cuda.amp.autocast_mode.autocast at 0x7b493ed6c9d0>

In [8]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

MPS device not found.


In [31]:
import warnings
from tqdm import tqdm
import os
from pathlib import Path
import torchmetrics
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
#Define the device
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu" )
#device = torch.device("mps")
print('using device', device)
#make sure weights folders exist
Path(config['model_folder']).mkdir(parents=True, exist_ok=True)
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
#Tensorboard
writer = SummaryWriter(config['experiment_name'])
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

#optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
#LR for lion is 1/3 of Adam
optimizer = Lion(model.parameters(), lr=1e-4/10, weight_decay=1e-2)


using device cuda
Max length of the source sentence : 309
Max length of the source target : 274


In [34]:
MAX_LR = 10**-3/10
STEPS_PER_EPOCH = len(train_dataloader)
EPOCHS = 25

In [35]:
#Scheduler
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=MAX_LR, steps_per_epoch=STEPS_PER_EPOCH
                                                , epochs=EPOCHS, pct_start=int(0.3*EPOCHS)/EPOCHS if EPOCHS!=1 else 0.5
                                                , div_factor = 100, three_phase=False, final_div_factor=100, anneal_strategy='linear'
                                                )

In [36]:
#If the user specified a model to preload before training , load it ,
initial_epoch = 0
global_step = 0
if config['preload']:
    model_filename = get_weights_file_path(config, config['preload'])
    print(f"Model preload file {model_filename}")
    state = torch.load(model_filename)
    model.load_state_dict(state['model_state_dict'])
    initial_epoch = state['epoch'] + 1
    optimizer.load_state_dict(state['optimizer_state_dict'])
    global_step = state['global_step']
    print('preloaded')

scaler = torch.cuda.amp.GradScaler()
lr = [0.0]
for epoch in range(initial_epoch, EPOCHS):
    torch.cuda.empty_cache()
    model.train()
    batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
    for batch in batch_iterator:
        optimizer.zero_grad(set_to_none=True)
        encoder_input = batch['encoder_input'].to(device) #(b, seq_len)
        decoder_input = batch['decoder_input'].to(device) #(B, seq_len)
        encoder_mask = batch['encoder_mask'].to(device) #(B, 1, 1, seq_len)
        decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

        # Run the tensors through encoder, decoder and projection layer
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            encoder_output = model.encode(encoder_input, encoder_mask) #(B, seq_len, d_model)
            decoder_output  = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) #(B, seq_len, d_model)
            proj_output = model.project(decoder_output) #(B, seq_len , vocab_size)
            #compare the output with the label
            label = batch['label'].to(device) #(B, seq_len)
            # Compute the loss using simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
        batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}", "lr": f"{lr[-1]}"})
        #log the loss
        writer.add_scalar('train loss', loss.item(), global_step)
        writer.flush()
        #backpropagate the loss
        #loss.backward()
        scaler.scale(loss).backward()
        #update the weights
        #optimizer.step()
        scale = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        skip_r_sched = (scale > scaler.get_scale())
        if not skip_r_sched:
            scheduler.step()
        lr.append(scheduler.get_last_lr())
        global_step +=1
    #run validation at the end of every epoch
    #run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg),global_step)
    #save the model at the end of every epoch
    model_filename = get_weights_file_path(config, f"{epoch:02d}")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'global_step': global_step
    }, model_filename)


Processing Epoch 00: 100%|██████████| 1819/1819 [05:11<00:00,  5.85it/s, loss=6.660, lr=[1.5136192271442036e-05]]
Processing Epoch 01: 100%|██████████| 1819/1819 [05:11<00:00,  5.84it/s, loss=5.758, lr=[2.9280160226201695e-05]]
Processing Epoch 02: 100%|██████████| 1819/1819 [05:12<00:00,  5.82it/s, loss=5.278, lr=[4.342412818096135e-05]]
Processing Epoch 03: 100%|██████████| 1819/1819 [05:11<00:00,  5.84it/s, loss=4.845, lr=[5.755254476908576e-05]]
Processing Epoch 04: 100%|██████████| 1819/1819 [05:13<00:00,  5.80it/s, loss=4.953, lr=[7.169651272384542e-05]]
Processing Epoch 05: 100%|██████████| 1819/1819 [05:11<00:00,  5.83it/s, loss=5.008, lr=[8.583270499528746e-05]]
Processing Epoch 06: 100%|██████████| 1819/1819 [05:10<00:00,  5.87it/s, loss=4.685, lr=[9.997667295004712e-05]]
Processing Epoch 07: 100%|██████████| 1819/1819 [05:12<00:00,  5.82it/s, loss=3.941, lr=[9.445721550302365e-05]]
Processing Epoch 08: 100%|██████████| 1819/1819 [05:08<00:00,  5.89it/s, loss=3.964, lr=[8.890

In [30]:
#If the user specified a model to preload before training , load it ,
initial_epoch = 17
EPOCHS = 20
global_step = 0
#if config['preload']:
model_filename = 'weights/tmodel_17.pt'#get_weights_file_path(config, config['preload'])
print(f"Model preload file {model_filename}")
state = torch.load(model_filename)
model.load_state_dict(state['model_state_dict'])
initial_epoch = state['epoch'] + 1
optimizer.load_state_dict(state['optimizer_state_dict'])
global_step = state['global_step']
print('preloaded')

scaler = torch.cuda.amp.GradScaler()
lr = [0.0]
for epoch in range(initial_epoch, EPOCHS):
    torch.cuda.empty_cache()
    model.train()
    batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
    for batch in batch_iterator:
        optimizer.zero_grad(set_to_none=True)
        encoder_input = batch['encoder_input'].to(device) #(b, seq_len)
        decoder_input = batch['decoder_input'].to(device) #(B, seq_len)
        encoder_mask = batch['encoder_mask'].to(device) #(B, 1, 1, seq_len)
        decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

        # Run the tensors through encoder, decoder and projection layer
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            encoder_output = model.encode(encoder_input, encoder_mask) #(B, seq_len, d_model)
            decoder_output  = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) #(B, seq_len, d_model)
            proj_output = model.project(decoder_output) #(B, seq_len , vocab_size)
            #compare the output with the label
            label = batch['label'].to(device) #(B, seq_len)
            # Compute the loss using simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
        batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}", "lr": f"{lr[-1]}"})
        #log the loss
        writer.add_scalar('train loss', loss.item(), global_step)
        writer.flush()
        #backpropagate the loss
        #loss.backward()
        scaler.scale(loss).backward()
        #update the weights
        #optimizer.step()
        scale = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        skip_r_sched = (scale > scaler.get_scale())
        if not skip_r_sched:
            scheduler.step()
        lr.append(scheduler.get_last_lr())
        global_step +=1
    #run validation at the end of every epoch
    #run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg),global_step)
    #save the model at the end of every epoch
    model_filename = get_weights_file_path(config, f"{epoch:02d}")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'global_step': global_step
    }, model_filename)


Model preload file weights/tmodel_17.pt
preloaded


Processing Epoch 18:   0%|          | 0/1819 [00:00<?, ?it/s, loss=1.896, lr=0.0]


ValueError: Tried to step 32747 times. The specified number of total steps is 32742

In [21]:
model_filename

'weights/tmodel_17.pt'

In [28]:
skip_r_sched

False

In [23]:
epoch

17