#  <font color='#FFE15D'><b>Language Modeling</b></font>

# 🔴 **Environment Setup**

In [1]:
!pip install -q torch==2.3.0 torchtext==0.18.0 torchmetrics portalocker>=2.0.0

In [2]:
!pip install -q wandb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.4/311.4 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# 🔴 **Import Libs**

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torchtext
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, GloVe

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split

from torch import optim
from torch.nn import functional as F

import tqdm
import torchmetrics as tm
import wandb

import os
from collections import Counter

import ipywidgets as widgets
from IPython.display import display

In [4]:
!python --version

Python 3.10.12


In [5]:
for lib in [np, torch, torchtext, tqdm]:
  print(lib.__name__, '-->', lib.__version__)

numpy --> 1.26.4
torch --> 2.3.0+cu121
torchtext --> 0.18.0+cpu
tqdm --> 4.66.5


# 🔴 **Utils**

In [6]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [7]:
def num_trainable_params(model):
  nums = sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6
  return nums

In [8]:
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
      torch.cuda.manual_seed(seed)

# 🔴 **Arguments**

In [9]:
seed = 8

batch_size = 80
seq_len = 70

embedding_dim = 300

num_layers = 3
hidden_dim = 1150
dropoute = 0.1
dropouti = 0.65
dropouth = 0.3
dropouto = 0.4
weight_drop = 0.5

lr = 30
wd = 1.2e-6
momentum = 0

clip = 0.25

wandb_enable = True

In [10]:
wandb_arg_name = input('Please input the WandB argument (run) name:')

Please input the WandB argument (run) name:weight_drop


In [11]:
wandb_arg_name

'weight_drop'

# 🔴 **Dataset**

## 🟠 Load the Dataset

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
! cp -r /content/drive/MyDrive/Deep_learning_projects/1_language_modeling/wikitext-2 /content/

## 🟠 Build vocabulary and save it

In [16]:
def load_dataset(path):
  with open(path) as f :
    dataset = f.read()
  return dataset

def tokenize_dataset(dataset):
  tokens = []
  for line in dataset.split('\n'):
    tokens.extend(tokenizer(line.strip()))
  return tokens

In [17]:
tokenizer = get_tokenizer("basic_english")

In [18]:
train_dataset = load_dataset("/content/wikitext-2/wiki.train.tokens")
valid_dataset = load_dataset("/content/wikitext-2/wiki.valid.tokens")
test_dataset = load_dataset("/content/wikitext-2/wiki.test.tokens")

train_tokens = tokenize_dataset(train_dataset)
valid_tokens = tokenize_dataset(valid_dataset)
test_tokens = tokenize_dataset(test_dataset)

all_tokens = train_tokens + valid_tokens + test_tokens
print("Train Tokens: " , len(train_tokens))
print("Valid Tokens: " , len(valid_tokens))
print("Test Tokens: " , len(test_tokens))
print("Total number of tokens:", len(all_tokens))

Train Tokens:  2049990
Valid Tokens:  214417
Test Tokens:  241859
Total number of tokens: 2506266


In [19]:
tokenized_datasets = [train_tokens]

special_tokens = ['<unk>']

vocab = build_vocab_from_iterator(tokenized_datasets, specials=special_tokens)

vocab.set_default_index(vocab['<unk>'])

vocab_size = len(vocab)

torch.save(vocab, "./Wikitext_train_vocab.pt")
print("Vocabulary size:", vocab_size)

Vocabulary size: 28782


## 🟠 Transform the data

In [20]:
def data_process(raw_text_iter, seq_len):
  data = torch.cat([torch.LongTensor(vocab(tokenize_dataset(line))) for line in raw_text_iter.split("\n")])

  M = len(data) // seq_len

  r = len(data) % seq_len
  data = torch.cat((data, torch.LongTensor([0]))) if r==0 else data

  inputs = data[:M*seq_len]
  inputs = inputs.reshape(-1, seq_len)

  targets = data[1:M*seq_len+1]
  targets = targets.reshape(-1, seq_len)

  return inputs, targets

In [22]:
X_train ,y_train =data_process(train_dataset, seq_len)
X_valid ,y_valid =data_process(valid_dataset, seq_len)
X_test ,y_test =data_process(test_dataset, seq_len)

X_train.shape ,y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

(torch.Size([29285, 70]),
 torch.Size([29285, 70]),
 torch.Size([3063, 70]),
 torch.Size([3063, 70]),
 torch.Size([3455, 70]),
 torch.Size([3455, 70]))

## 🟠 Custom dataset

🔰 Write a custom dataset class for LanguageModelDataset.

In [23]:
class LanguageModelDataset(Dataset):

  def __init__(self, inputs, targets):
    self.inputs = inputs
    self.targets = targets

  def __len__(self):
    return self.inputs.shape[0]

  def __getitem__(self, idx):
    return self.inputs[idx], self.targets[idx]

In [24]:
train_set = LanguageModelDataset(X_train, y_train)
valid_set = LanguageModelDataset(X_valid, y_valid)
test_set = LanguageModelDataset(X_test, y_test)

## 🟠 Define a dataloader if needed

🔰 Write dataloaders for the training, validation, and test sets.

In [25]:
set_seed(seed)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [26]:
x_batch, y_batch = next(iter(train_loader))
x_batch.shape, y_batch.shape, x_batch

(torch.Size([80, 70]),
 torch.Size([80, 70]),
 tensor([[ 1985,    13,     1,  ...,  1985,    13,     1],
         [  104,     2,    57,  ..., 16138,  2285,    92],
         [    2,    22,   100,  ...,   116,    22,     2],
         ...,
         [   22,     0,   173,  ...,    37, 12908,     6],
         [    6,    43,  8400,  ...,    93,     3,     1],
         [25828,    65,    46,  ...,     3,   179,  1108]]))

In [27]:
set_seed(seed)

for inputs, targets in train_loader:
  print(inputs[0, 0], targets[0, 0])
  break

tensor(1985) tensor(13)


# 🔴 **Model**

In [28]:
class WeightDrop(torch.nn.Module):

  def __init__(self, module, weights, dropout=0):
    super(WeightDrop, self).__init__()
    self.module = module
    self.weights = weights
    self.dropout = dropout
    self._setup()

  def widget_demagnetizer_y2k_edition(*args, **kwargs):
    return

  def _setup(self):
    if issubclass(type(self.module), torch.nn.RNNBase):
      self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition

      for name_w in self.weights:
        print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
        w = getattr(self.module, name_w)
        del self.module._parameters[name_w]
        self.module.register_parameter(name_w + '_raw', nn.Parameter(w.data))

  def _setweights(self):
    for name_w in self.weights:
      raw_w = getattr(self.module, name_w + '_raw')
      w = None
      # w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
      mask = torch.nn.functional.dropout(torch.ones_like(raw_w), p=self.dropout, training=True) * (1 - self.dropout)
      setattr(self.module, name_w, raw_w * mask)

  def forward(self, *args):
    self._setweights()
    return self.module.forward(*args)

In [29]:
def embedded_dropout(embed, words, dropout=0.1, scale=None):
  if dropout:
    mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(
        embed.weight) / (1 - dropout)
    masked_embed_weight = mask * embed.weight
  else:
    masked_embed_weight = embed.weight
  if scale:
    masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight

  padding_idx = embed.padding_idx
  if padding_idx is None:
    padding_idx = -1

  embedding = torch.nn.functional.embedding(words, masked_embed_weight,
                                            padding_idx, embed.max_norm, embed.norm_type,
                                            embed.scale_grad_by_freq, embed.sparse)
  return embedding

In [30]:
class LockedDropout(nn.Module):
  def __init__(self):
    super(LockedDropout, self).__init__()

  def forward(self, x, dropout):
    if not self.training or not dropout:
      return x
    m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout)
    mask = m.requires_grad_(False) / (1 - dropout)
    mask = mask.expand_as(x)
    return mask * x

🔰 AWD-LSTM Language Model

In [31]:
class LanguageModel(nn.Module):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers,
               dropoute=0.2, dropouti=0.2, dropouth=0.2, dropouto=0.2,
               weight_drop=0.2):
    super().__init__()
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
    self.embedding_dim = embedding_dim

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.embedding.weight.data.uniform_(-0.1, 0.1)

    self.lstms = []
    self.lstms.append(nn.LSTM(embedding_dim, hidden_dim, num_layers=1, dropout=0, batch_first=False))
    self.lstms.append(nn.LSTM(hidden_dim, hidden_dim, num_layers=1, dropout=0, batch_first=False))
    self.lstms.append(nn.LSTM(hidden_dim, embedding_dim, num_layers=1, dropout=0, batch_first=False))
    if weight_drop > 0:
      self.lstms = [WeightDrop(lstm, ['weight_hh_l0'], dropout=weight_drop) for lstm in self.lstms]
    self.lstms = nn.ModuleList(self.lstms)

    self.fc = nn.Linear(embedding_dim, vocab_size)

    self.fc.weight = self.embedding.weight

    self.lockdrop = LockedDropout()
    self.dropoute = dropoute
    self.dropouti = dropouti
    self.dropouth = dropouth
    self.dropouto = dropouto
    # print(dropoute, dropouti, dropouth, dropouto)

  def forward(self, src):
    embedding = embedded_dropout(self.embedding, src, dropout=self.dropoute if self.training else 0)
    embedding = self.lockdrop(embedding, self.dropouti)

    new_hiddens = []
    for l, lstm in enumerate(self.lstms):
      embedding, _ = lstm(embedding)
      if l != self.num_layers-1:
        embedding = self.lockdrop(embedding, self.dropouth)

    embedding = self.lockdrop(embedding, self.dropouto)

    prediction = self.fc(embedding)
    return prediction

# 🔴 **Config**

In [33]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [34]:
loss_fn = nn.CrossEntropyLoss()

metric = tm.text.Perplexity().to(device)

In [36]:
key_file = '/content/key.txt'

if os.path.exists(key_file):
    with open(key_file) as f:
        key = f.readline().strip()
    wandb.login(key=key)
else:
    print("Key file does not exist. Please create the key file with your wandb API key.")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# 🔴 **Train ➰**

In [37]:
def train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch=None):
  model.train()
  loss_train = AverageMeter()
  metric.reset()

  with tqdm.tqdm(train_loader, unit='batch') as tepoch:
    for inputs, targets in tepoch:
      if epoch:
        tepoch.set_description(f'Epoch {epoch}')

      inputs = inputs.t().to(device)
      targets = targets.t().to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

      loss.backward()

      nn.utils.clip_grad.clip_grad_norm_(model.parameters(), max_norm=clip)

      optimizer.step()
      optimizer.zero_grad()

      loss_train.update(loss.item(), n=len(targets))
      metric.update(outputs, targets)

      tepoch.set_postfix(loss=loss_train.avg, metric=metric.compute().item())

  return model, loss_train.avg, metric.compute().item()

# 🔴 **Evaluation**

🔰 This is the template for evaluation function, change it if needed.

In [38]:
def evaluate(model, test_loader, loss_fn, metric):
  model.eval()
  loss_eval = AverageMeter()
  metric.reset()

  with torch.inference_mode():
    for inputs, targets in test_loader:
      inputs = inputs.t().to(device)
      targets = targets.t().to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())
      loss_eval.update(loss.item(), n=len(targets))

      metric(outputs, targets)

  return loss_eval.avg, metric.compute().item()

# 🔴 **Training Process 〽️**

## 🟠 Finding Hyper-parameters

### 🟡 **Step 1:** Calculate the loss for an untrained model using a few batches.


In [None]:
model = LanguageModel(len(vocab), embedding_dim=300,
                      hidden_dim=512, num_layers=2,
                      dropout_embd=0.5, dropout_rnn=0.2).to(device)

inputs, targets = next(iter(train_loader))
inputs = inputs.to(device)
targets = targets.to(device)

with torch.no_grad():
  outputs = model(inputs)
  loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

print(loss)

In [None]:
outputs.reshape(-1, outputs.shape[-1]).shape, targets.flatten().shape

In [None]:
torch.cuda.empty_cache()

### 🟡 **Step 2:** Try to train and overfit the model on a small subset of the dataset.

In [None]:
model = LanguageModel(len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropout_embd=dropout_embd, dropout_rnn=dropout_rnn).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.9, momentum=0.9)

In [None]:
mini_train_size = 1000
_, mini_train_dataset = random_split(train_set, (len(train_set)-mini_train_size, mini_train_size))
mini_train_loader = DataLoader(mini_train_dataset, 20)

In [None]:
num_epochs = 100
for epoch in range(num_epochs):
  model, _, _ = train_one_epoch(model, mini_train_loader, loss_fn, optimizer, metric, epoch)

### 🟡 **Step 3:** Train the model for a limited number of epochs, experimenting with various learning rates.

In [None]:
num_epochs = 1

for lr in [20, 15, 10, 7.5, 5, 2.5]:
  print(f'LR={lr}')

  model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop, pretrained=True).to(device)
  # model = torch.load('model.pt')

  optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=momentum)

  for epoch in range(num_epochs):
    model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch)

  print()

LR=20
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
100%|██████████████████████████████████████████████████| 367/367 [01:31<00:00,  4.02batch/s, loss=8.88, metric=7.16e+3]



LR=15
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:26<00:00,  4.24batch/s, loss=6.75, metric=857]



LR=10
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:20<00:00,  4.56batch/s, loss=6.75, metric=858]



LR=7.5
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:28<00:00,  4.16batch/s, loss=6.79, metric=894]



LR=5
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:28<00:00,  4.16batch/s, loss=6.84, metric=939]



LR=2.5
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


  2%|▉                                                   | 7/367 [00:01<01:40,  3.59batch/s, loss=9.56, metric=1.42e+4]


KeyboardInterrupt: ignored

### 🟡 Step 4: Create a small grid using the weight decay and the best learning rate.





In [None]:
num_epochs = 1

for lr in [7, 8, 14, 13, 12, 11, 10, 9]:
  for wd in [1.2e-6]:
    print(f'LR={lr}, WD={wd}')

    model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop, pretrained=True).to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)

    for epoch in range(num_epochs):
      model, _, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch)

    print()

LR=7, WD=1.2e-06
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:28<00:00,  4.15batch/s, loss=6.68, metric=795]



LR=8, WD=1.2e-06
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


100%|██████████████████████████████████████████████████████| 367/367 [01:29<00:00,  4.11batch/s, loss=6.61, metric=745]



LR=14, WD=1.2e-06
.
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


  3%|█▌                                                 | 11/367 [00:02<01:32,  3.83batch/s, loss=11.3, metric=8.28e+4]


KeyboardInterrupt: ignored

### 🟡 Step 5: Train model for longer epochs using the best model from step 4.





In [None]:
model = LanguageModel(len(vocab), embedding_dim=300,
                      hidden_dim=512, num_layers=2,
                      dropout_embd=0.5, dropout_rnn=0.2).to(device)

In [None]:
model = torch.load('/content/model-ppl_133.pt')

In [None]:
lr = 3
wd = 1e-6
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9)

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
num_epochs = 30

for epoch in range(1, num_epochs+1):
  # WandB
  run = wandb.init(
        project="language-modeling-lstms",
        config={
            "learning_rate": lr,
            "epochs": num_epochs,
        })

  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                    train_loader,
                                                    loss_fn,
                                                    optimizer,
                                                    metric,
                                                    epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                      valid_loader,
                                      loss_fn,
                                      metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  epoch_counter += 1

## 🟠 Main Loop

In [39]:
torch.cuda.empty_cache()

In [40]:
set_seed(seed)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

In [58]:
len(vocab)

28782

In [42]:
set_seed(seed)

model = LanguageModel(vocab_size=len(vocab), embedding_dim=embedding_dim,
                      hidden_dim=hidden_dim, num_layers=num_layers,
                      dropoute=dropoute, dropouti=dropouti,
                      dropouth=dropouth, dropouto=dropouto,
                      weight_drop=weight_drop).to(device)
model

Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0


LanguageModel(
  (embedding): Embedding(28782, 300)
  (lstms): ModuleList(
    (0): WeightDrop(
      (module): LSTM(300, 1150)
    )
    (1): WeightDrop(
      (module): LSTM(1150, 1150)
    )
    (2): WeightDrop(
      (module): LSTM(1150, 300)
    )
  )
  (fc): Linear(in_features=300, out_features=28782, bias=True)
  (lockdrop): LockedDropout()
)

In [None]:
# model = torch.load('model.pt')

🔰 Define optimizer and Set learning rate and weight decay.

In [52]:
set_seed(seed)

lr = 7.5
wd = 1.2e-6
# momentum = 0.9

optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=momentum)
# optimizer = optim.SGD([{'params': model.embedding.parameters(), 'lr': 0.1*lr},
#                        {'params': model.lstms.parameters(), 'lr': lr}],
#                       weight_decay=wd, momentum=momentum)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 7.5
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 1.2e-06
)

In [55]:
if wandb_enable:
  wandb.init(
      project='LM-AWD-LSTM',
      name=wandb_arg_name,
      config={
          'lr': lr,
          'momentum': momentum,
          'batch_size': batch_size,
          'seq_len': seq_len,
          'hidden_dim': hidden_dim,
          'embedding_dim': embedding_dim,
          'num_layers': num_layers,
          'dropout_embed': dropoute,
          'dropout_in_lstm': dropouti,
          'dropout_h_lstm': dropouth,
          'dropout_out_lstm': dropouto,
          'clip': clip,
      }
  )

🔰 Write code to train the model for `num_epochs` epoches.

In [45]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [56]:
set_seed(seed)
num_epochs = 5

for epoch in range(1, num_epochs+1):
  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                    train_loader,
                                                    loss_fn,
                                                    optimizer,
                                                    metric,
                                                    epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                      valid_loader,
                                      loss_fn,
                                      metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  if wandb_enable:
    wandb.log({"metric_train": metric_train, "loss_train": loss_train,
                "metric_valid": metric_valid, "loss_valid": loss_valid})

  epoch_counter += 1

Epoch 1: 100%|██████████| 367/367 [02:13<00:00,  2.75batch/s, loss=4.16, metric=64.4]


Valid: Loss = 4.816, Metric = 124.1



Epoch 2:  11%|█         | 41/367 [00:15<02:01,  2.68batch/s, loss=4.34, metric=77]


KeyboardInterrupt: 

In [None]:
wandb.finish()

In [57]:
! cp /content/model.pt /content/drive/MyDrive/Deep_learning_projects/1_language_modeling/AWD

## 🟠 Plot

🔰 Plot learning curves

In [None]:
plt.figure(figsize=(8, 6))

plt.plot(range(epoch_counter), loss_train_hist, 'r-', label='Train')
plt.plot(range(epoch_counter), loss_valid_hist, 'b-', label='Validation')

plt.xlabel('Epoch')
plt.ylabel('loss')
plt.grid(True)
plt.legend()

# 🔴 **Test**

🔰 Test your model using data from the test set and images that are not present in the dataset.

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
loss_valid, metric_valid = evaluate(model, valid_loader, loss_fn, metric)
metric_valid

In [None]:
loss_test, metric_test = evaluate(model, test_loader, loss_fn, metric)
metric_test

# 🔴 **Generate**

🔰 Your mission is to write a `generate` function and use a desired sentence to evaluate the model

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
prompt = 'In a galaxy far, far away, there'

indices = vocab(tokenizer(prompt))
itos = vocab.get_itos()

max_seq_len = 35
for i in range(max_seq_len):
  src = torch.LongTensor(indices).to(device)

  with torch.no_grad():
    prediction = model(src)

  # Method 1
  # idx = torch.argmax(prediction[-1])
  # itos = vocab.get_itos()
  # itos[idx]

  # Method 2
  temperature = 0.5
  probs = torch.softmax(prediction[-1]/temperature, dim=0)

  idx = vocab['<ukn>']
  while idx == vocab['<ukn>']:
    idx = torch.multinomial(probs, num_samples=1).item()

  token = itos[idx]
  prompt += ' ' + token

  if idx == vocab['.']:
    break

  indices.append(idx)

print(prompt)

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, seed=None):
  if seed is not None:
    torch.manual_seed(seed)

  indices = vocab(tokenizer(prompt))
  itos = vocab.get_itos()

  for i in range(max_seq_len):
    src = torch.LongTensor(indices).to(device)

    with torch.no_grad():
      prediction = model(src)

    # Method 1
    # idx = torch.argmax(prediction[-1])
    # itos = vocab.get_itos()
    # itos[idx]

    # Method 2
    probs = torch.softmax(prediction[-1]/temperature, dim=0)

    idx = vocab['<ukn>']
    while idx == vocab['<ukn>']:
      idx = torch.multinomial(probs, num_samples=1).item()

    token = itos[idx]
    prompt += ' ' + token

    if idx == vocab['.']:
      return prompt

    indices.append(idx)

  return prompt

In [None]:
prompt = 'In a galaxy far, far away, there'
prompt = 'The sun was setting in the'
prompt = 'Once upon a time, there lived a young princess named'
prompt = 'What is the meaning '

generate(prompt, 35, 0.5, model, tokenizer, vocab)