
<br>
<font>
<div dir=ltr align=center>
<font color=0F5298 size=10>
    Deep Learning - HW4 <br>
<font color=2565AE size=5>
    Electrical Engineering Department <br>
    winter 2024<br>
<font color=3C99D size=5>
    Practical Assignment 2 <br>
<font color=696880 size=4>
    Armin Ghojehzadeh 

____

# 🔴 **Import Libs**

In [None]:
!pip uninstall torchtext -y
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install --upgrade torchdata
!pip install --upgrade torchtext --index-url https://download.pytorch.org/whl/cu118
!pip install torchmetrics

[0mLooking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.5.1%2Bcu118-cp310-cp310-linux_x86_64.whl (838.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m838.3/838.3 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.20.1%2Bcu118-cp310-cp310-linux_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.5.1%2Bcu118-cp310-cp310-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cud

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torchtext
import torchdata

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from torch import optim
from torch.nn import functional as F

import tqdm
import torchmetrics as tm

In [None]:
!python --version
print(torch.__version__)
print(torchtext.__version__)
print(torchdata.__version__)

In [None]:
# Print the version of each core library so a run can be reproduced later.
for lib in [np, torch, torchtext, tqdm]:
  print(lib.__name__, '-->', lib.__version__)

# 🔴 **Utils**

In [None]:
class AverageMeter(object):
    """Tracks the most recent value and the weighted running mean of a series."""

    def __init__(self):
        # All state is created by reset() so construction and reset stay in sync.
        self.reset()

    def reset(self):
        """Clear the last value, the running mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as having been observed `n` times and refresh the mean."""
        weighted_value = val * n
        self.val = val
        self.count += n
        self.sum += weighted_value
        self.avg = self.sum / self.count

In [None]:
def num_trainable_params(model):
  """Return the number of trainable parameters of `model`, in millions."""
  total = 0
  for param in model.parameters():
    # Frozen parameters (requires_grad=False) are not counted.
    if param.requires_grad:
      total += param.numel()
  return total / 1e6

# 🔴 **Dataset**

## 🟠 **Load the Dataset**

🔰 In this section, load the WikiText-2 dataset.

In [None]:
from datasets import load_dataset

# Load the raw WikiText-2 configuration of the `wikitext` dataset.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Keep a separate handle on each split.
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']
# Peek at one training record to see the record layout.
print(train_data[1]) 

## 🟠 **Build vocabulary and save it**

🔰 In this section we need to:

*   Define a tokenizer using `basic_english`
*   Tokenize the dataset and collect tokens
*   Build the vocabulary using `build_vocab_from_iterator`
*   Manually insert special tokens and set the default index


In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def yield_tokens(dataset_split):
    """Lazily produce one token list per example of `dataset_split`.

    Every example must expose its raw text under the "text" key; the text is
    split with the module-level `tokenizer`.
    """
    yield from (tokenizer(record["text"]) for record in dataset_split)

# Lazy stream of token lists over the training split only; it is consumed once
# by the vocabulary builder below.
train_tokens = yield_tokens(train_data)

# Register the four special tokens with the vocabulary, then make lookups of
# tokens that are missing from the vocabulary return the <unk> index.
vocab = build_vocab_from_iterator(train_tokens, specials=["<unk>", "<pad>", "<bos>", "<eos>"])
vocab.set_default_index(vocab["<unk>"])

print("Special Tokens and Their Indices:")
print(f"<unk>: {vocab['<unk>']}")
print(f"<pad>: {vocab['<pad>']}")
print(f"<bos>: {vocab['<bos>']}")
print(f"<eos>: {vocab['<eos>']}")


# Quick sanity check: tokenize one sentence and map every token to its index.
sample_text = "This is an example sentence."
tokenized_text = tokenizer(sample_text)
indexed_text = [vocab[token] for token in tokenized_text]

print("\nTokenized Text:", tokenized_text)
print("Indexed Text:", indexed_text)

## 🟠 EDA

### 🟡 Let's explore the WikiText2 dataset!

### 🟡 Calculate basic statistics such as the number of documents, total words, average document length, etc.

In [None]:
def calculate_statistics(dataset_split):
    """Summarise document count and token-length statistics for one split.

    Each document is tokenized with the module-level `tokenizer`; lengths are
    measured in tokens.

    Returns:
        dict with keys "num_documents", "total_words", "avg_doc_length",
        "min_doc_length" and "max_doc_length" (length fields are 0 for an
        empty split).
    """
    num_documents = len(dataset_split)
    doc_lengths = [len(tokenizer(record["text"])) for record in dataset_split]
    total_words = sum(doc_lengths)

    if num_documents > 0:
        avg_doc_length = total_words / num_documents
    else:
        avg_doc_length = 0

    shortest = min(doc_lengths) if doc_lengths else 0
    longest = max(doc_lengths) if doc_lengths else 0

    return {
        "num_documents": num_documents,
        "total_words": total_words,
        "avg_doc_length": avg_doc_length,
        "min_doc_length": shortest,
        "max_doc_length": longest,
    }

# Compute the summary for the training split and print one statistic per line.
train_stats = calculate_statistics(train_data)
print("Training Set Statistics:")
for stat, value in train_stats.items():
    print(f"{stat}: {value}")

### 🟡 Analyze the most common and least common words in the dataset.

In [None]:
from collections import Counter

def count_words(dataset_split):
    """Count how often every token occurs across all documents of a split.

    Documents are tokenized with the module-level `tokenizer`.

    Returns:
        collections.Counter mapping token -> number of occurrences.
    """
    return Counter(
        token
        for record in dataset_split
        for token in tokenizer(record["text"])
    )

# Token frequencies over the training split.
train_word_counts = count_words(train_data)
most_common_words = train_word_counts.most_common(10)
# "Least common" here means tokens that occur exactly once.
least_common_words = [word for word, count in train_word_counts.items() if count == 1]

print("Most Common Words:")
for word, count in most_common_words:
    print(f"{word}: {count}")

print("\nNumber of Least Common Words (occurring once):", len(least_common_words))
print("Example of Least Common Words:", least_common_words[:10])


### 🟡  Please proceed with further exploration of the dataset. What do you suggest?

In [None]:
# One entry per distinct token: how many times it occurs in the training split.
word_frequencies = np.array(list(train_word_counts.values()))

# Histogram of token frequencies; the count axis is logarithmic (log=True).
plt.figure(figsize=(10, 6))
plt.hist(word_frequencies, bins=50, log=True, color='blue', alpha=0.7)
plt.title("Word Frequency Distribution (Log Scale)")
plt.xlabel("Frequency")
plt.ylabel("Number of Words")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Rank/frequency view: frequencies sorted in descending order, rank starts at 1.
sorted_frequencies = np.sort(word_frequencies)[::-1]
ranks = np.arange(1, len(sorted_frequencies) + 1)

# Both axes are logarithmic in this plot.
plt.figure(figsize=(10, 6))
plt.loglog(ranks, sorted_frequencies, marker="o", linestyle="", markersize=3, color='red')
plt.title("Word Frequencies (Log-Log Plot)")
plt.xlabel("Rank")
plt.ylabel("Frequency")
plt.grid(which='both', linestyle='--', alpha=0.7)
plt.show()

## 🟠 Transform the data

🛑 Make sure to perform the transformations on train, validation and test datasets.

🔰 Reshape the dataset into an `N x B x L` or `M x L` format, where `N` represents the number of batches, `B` is the batch size, `L` is the length of a sample within each batch, and `M` is equal to `N x B`.

In [None]:
from torch.nn.utils.rnn import pad_sequence

def data_process(raw_text_iter, batch_size, seq_len, vocab, tokenizer):
    """Tokenize a text stream and cut it into fixed-length next-token samples.

    All lines are tokenized and concatenated into one long index sequence,
    which is then split into consecutive chunks of `seq_len` tokens; a trailing
    remainder shorter than `seq_len` is discarded.

    Args:
        raw_text_iter: iterable of raw text lines.
        batch_size: unused; kept so existing calls keep working.
        seq_len: number of consecutive tokens per chunk (must be >= 1).
        vocab: token -> index mapping that supports `in` and `[]`.
        tokenizer: callable turning a string into a list of tokens.

    Returns:
        (inputs, targets): two long tensors of shape (num_samples, seq_len - 1);
        `targets` is `inputs` shifted one position to the right in each chunk.

    Raises:
        ValueError: if `seq_len` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    # Out-of-vocabulary tokens are mapped to <unk> instead of being dropped:
    # dropping them silently rewrites the validation/test text. If the
    # vocabulary has no <unk> entry, such tokens are skipped.
    has_unk = "<unk>" in vocab
    unk_index = vocab["<unk>"] if has_unk else None

    indices = []
    for line in raw_text_iter:
        for token in tokenizer(line):
            if token in vocab:
                indices.append(vocab[token])
            elif has_unk:
                indices.append(unk_index)

    # Explicit dtype so that an empty corpus still yields an integer tensor.
    token_indices = torch.tensor(indices, dtype=torch.long)

    num_samples = len(indices) // seq_len
    data = token_indices[:num_samples * seq_len].view(num_samples, seq_len)

    inputs = data[:, :-1]
    targets = data[:, 1:]
    return inputs, targets

In [None]:
batch_size = 32
seq_len = 50

# One lazy stream of raw text lines per split.
train_raw_iter = (line["text"] for line in train_data)
val_raw_iter = (line["text"] for line in val_data)
test_raw_iter = (line["text"] for line in test_data)

# The same transformation is applied to the training, validation and test splits.
train_inputs, train_targets = data_process(train_raw_iter, batch_size, seq_len, vocab, tokenizer)
val_inputs, val_targets = data_process(val_raw_iter, batch_size, seq_len, vocab, tokenizer)
test_inputs, test_targets = data_process(test_raw_iter, batch_size, seq_len, vocab, tokenizer)

# data_process returns 2-D tensors: M samples, each seq_len - 1 tokens long
# (one token of every chunk is consumed by the input/target shift).
print("Inputs shape (M x (L-1)):", train_inputs.shape)
print("Targets shape (M x (L-1)):", train_targets.shape)


## 🟠 Custom dataset

🔰 Write a custom dataset class for LanguageModelDataset.

In [None]:
class LanguageModelDataset(Dataset):
  """Pairs every input sequence with its target sequence, row by row."""

  def __init__(self, inputs, targets):
    # Both containers are stored as given; row i of each forms sample i.
    self.targets = targets
    self.inputs = inputs

  def __len__(self):
    """Number of samples, i.e. the size of the first dimension of `inputs`."""
    num_rows = self.inputs.shape[0]
    return num_rows

  def __getitem__(self, idx):
    """Return the `(input, target)` pair stored at position `idx`."""
    sample_input = self.inputs[idx]
    sample_target = self.targets[idx]
    return sample_input, sample_target


In [None]:
# Wrap the (inputs, targets) tensors of every split in a Dataset.
train_dataset = LanguageModelDataset(train_inputs, train_targets)
val_dataset = LanguageModelDataset(val_inputs, val_targets)
test_dataset = LanguageModelDataset(test_inputs, test_targets)

# Sanity check: number of training samples and the first (input, target) pair.
print(f"Dataset length: {len(train_dataset)}")
print("Sample input-target pair:")
print(train_dataset[0])

## 🟠 Define a dataloader if needed

🔰 Write dataloaders for the training, validation, and test sets.

In [None]:
batch_size = 32
# Only the training loader shuffles; validation and test keep the stored order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# 🔴 **Model**

🔰 Use the following template to create a custom model.

Your model should consist of three parts:

*   an embedding layer
*   an LSTM layer
*   a fully connected layer

In [None]:
class LanguageModel(nn.Module):
  """Embedding -> multi-layer LSTM -> dropout -> linear projection to vocabulary logits."""

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate):
    super().__init__()

    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # batch_first=True: the LSTM consumes and produces (batch, seq, feature) tensors.
    lstm_config = dict(
        input_size=embedding_dim,
        hidden_size=hidden_dim,
        num_layers=num_layers,
        dropout=dropout_rate,
        batch_first=True,
    )
    self.lstm = nn.LSTM(**lstm_config)

    self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
    self.dropout = nn.Dropout(p=dropout_rate)

  def forward(self, src):
    """Map token indices `src` to one vector of vocabulary logits per position."""
    # The final (hidden, cell) state returned by the LSTM is not used.
    hidden_states, _final_state = self.lstm(self.embedding(src))
    return self.fc(self.dropout(hidden_states))

In [None]:
# Model hyper-parameters; the output layer has one logit per vocabulary entry.
vocab_size = len(vocab)
embedding_dim = 128
hidden_dim = 256
num_layers = 2
dropout_rate = 0.5

model = LanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate)
print(model)

# Smoke test: push one training batch through the freshly built model
# (the model has not been moved to another device at this point).
src, _ = next(iter(train_loader))
output = model(src)
print("Output shape:", output.shape)

# 🔴 **Config**

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

🔰 Define the optimizer, loss function, metrics and other necessary parameters in this section, and ensure the model is sent to the appropriate device.

In [None]:
# Optimizer, loss and metrics shared by the cells below.
# NOTE(review): the model itself is not moved to `device` in this cell; that
# happens in a later cell (`model = model.to(device)`).
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
accuracy = tm.Accuracy(task="multiclass", num_classes=vocab_size).to(device)
perplexity = tm.text.Perplexity().to(device)

print("Model Summary:")
print(model)
print(f"Optimizer: {optimizer}")
print(f"Loss Function: {criterion}")
print(f"Metric: {perplexity}")

# 🔴 **Train ➰**

🔰 This is the template for the training function; change it if needed.

In [None]:
scaler = torch.cuda.amp.GradScaler()

def train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch=None):
  """Run one optimisation pass over `train_loader`.

  Args:
    model: network mapping a batch of token indices to per-position logits.
    train_loader: iterable yielding `(inputs, targets)` batches.
    loss_fn: callable taking `(logits_2d, targets_1d)` and returning a scalar loss.
    optimizer: optimizer that owns the parameters of `model`.
    metric: metric object exposing `reset()`, `update(preds, target)` and `compute()`.
    epoch: optional epoch number shown in the progress bar.

  Returns:
    Tuple `(model, mean_loss, metric_value)` for the epoch.
  """
  model.train()
  loss_train = AverageMeter()
  metric.reset()

  with tqdm.tqdm(train_loader, unit='batch') as tepoch:
    # `is not None` (rather than truthiness) so that epoch 0 is labelled too;
    # the label does not change during the epoch, so it is set once.
    if epoch is not None:
      tepoch.set_description(f'Epoch {epoch}')

    for inputs, targets in tepoch:
      inputs = inputs.to(device)
      targets = targets.to(device)

      # Clear gradients before the backward pass so that anything left over
      # from code that ran earlier can never leak into this step.
      optimizer.zero_grad()

      outputs = model(inputs)

      # Flatten to (batch * seq, vocab) and (batch * seq,) for the loss;
      # reshape also copes with non-contiguous tensors.
      outputs_2d = outputs.reshape(-1, outputs.size(-1))
      targets_2d = targets.reshape(-1)

      loss = loss_fn(outputs_2d, targets_2d)
      loss.backward()
      optimizer.step()

      # Weight the batch loss by the number of sequences in the batch.
      loss_train.update(loss.item(), n=len(targets))

      # Metric bookkeeping needs no autograd graph.
      with torch.no_grad():
        logit = F.log_softmax(outputs, dim=-1)
        metric.update(logit, targets)

      tepoch.set_postfix(loss=loss_train.avg, metric=metric.compute().item())

  return model, loss_train.avg, metric.compute().item()

# 🔴 **Evaluation**

🔰 This is the template for the evaluation function; change it if needed.

In [None]:
def evaluate(model, test_loader, loss_fn, metric):
  """Score `model` on every batch of `test_loader` without updating it.

  Returns:
    Tuple `(mean_loss, metric_value)` accumulated over the whole loader.
  """
  model.eval()
  metric.reset()
  loss_eval = AverageMeter()

  with torch.inference_mode():
    for batch_inputs, batch_targets in test_loader:
      batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

      logits = model(batch_inputs)
      vocab_dim = logits.size(-1)

      # Flatten to (batch * seq, vocab) and (batch * seq,) for the loss.
      batch_loss = loss_fn(logits.view(-1, vocab_dim), batch_targets.view(-1))
      loss_eval.update(batch_loss.item(), n=len(batch_targets))

      metric.update(F.log_softmax(logits, dim=-1), batch_targets)

  return loss_eval.avg, metric.compute().item()

# 🔴 **Training Process 〽️**

## 🟠 Finding Hyper-parameters

### 🟡 **Step 1:** Calculate the loss for an untrained model using a few batches.


In [None]:
# Step 1: loss of the untrained model on a single training batch.
model = model.to(device)
model.eval()

inputs, targets = next(iter(train_loader))
inputs = inputs.to(device)
targets = targets.to(device)

with torch.no_grad():
  outputs = model(inputs)
  
  # Flatten to (batch * seq, vocab) and (batch * seq,) for the loss.
  outputs_2d = outputs.view(-1, outputs.size(-1))  
  targets_2d = targets.view(-1) 
    
  print(outputs.shape, targets.shape)
    
  loss = criterion(outputs_2d, targets_2d)

print(loss)

### 🟡 **Step 2:** Try to train and overfit the model on a small subset of the dataset.

In [None]:
model = model.to(device)

In [None]:
# First 256 training samples only, used for the overfitting sanity check.
train_subset = torch.utils.data.Subset(train_dataset, indices=range(256))
train_subset_loader = DataLoader(train_subset, batch_size=32, shuffle=True)

In [None]:
num_epochs = 30

# Step 2: train on the small subset loader; validation still uses the full
# validation loader.
for epoch in range(num_epochs):
  model, train_loss, train_perplexity = train_one_epoch(model, train_subset_loader, criterion, optimizer, perplexity, epoch)
  val_loss, val_perplexity = evaluate(model, val_loader, criterion, perplexity)

  print(f"Epoch {epoch} -- Train Loss: {train_loss:.4f} -- Train Perplexity: {train_perplexity:.4f} -- Val Loss: {val_loss:.4f} -- Val Perplexity: {val_perplexity:.4f}")

### 🟡 **Step 3:** Train the model for a limited number of epochs, experimenting with various learning rates.

In [None]:
num_epochs = 10

# Step 3: one short run per learning rate.
for lr in [1e-2, 1e-3, 1e-4]:
  print(f'LR={lr}')

  # Fresh model, optimizer, loss and metric for every learning rate so the
  # runs do not share state.
  model = LanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate).to(device)
  optimizer = optim.SGD(model.parameters(), lr=lr)
  criterion = nn.CrossEntropyLoss()
  perplexity = tm.text.Perplexity().to(device)

  for epoch in range(num_epochs):
    model, train_loss, train_perplexity = train_one_epoch(model, train_loader, criterion, optimizer, perplexity, epoch)
    val_loss, val_perplexity = evaluate(model, val_loader, criterion, perplexity)

    print(f"Epoch {epoch} -- Train Loss: {train_loss:.4f} -- Train Perplexity: {train_perplexity:.4f} -- Val Loss: {val_loss:.4f} -- Val Perplexity: {val_perplexity:.4f}")

  # Validation perplexity after the last epoch of this learning rate.
  print(f"Val Perplexity: {val_perplexity:.4f}")

### 🟡 Step 4: Create a small grid using the weight decay and the best learning rate.





In [None]:
num_epochs = 10

# Step 4: small grid over learning rate x weight decay, trained on the small
# subset loader and scored on the full validation loader.
for lr in [1e-2, 1e-3, 1e-4]:
  for wd in [1e-2, 1e-3, 1e-4]:
    print(f'LR={lr}, WD={wd}')

    # The batches are moved to `device` inside train_one_epoch/evaluate, so the
    # freshly built model has to live on the same device.
    model = LanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate).to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd)

    for epoch in range(num_epochs):
      model, train_loss, train_perplexity = train_one_epoch(model, train_subset_loader, criterion, optimizer, perplexity, epoch)
      # Score validation with the perplexity metric: the value is reported as
      # perplexity below, and the accuracy metric is not fed in the layout
      # that evaluate() produces.
      val_loss, val_perplexity = evaluate(model, val_loader, criterion, perplexity)

      print(f"Epoch {epoch} -- Train Loss: {train_loss:.4f} -- Train Perplexity: {train_perplexity:.4f} -- Val Loss: {val_loss:.4f} -- Val Perplexity: {val_perplexity:.4f}")

    # Final validation perplexity of this (lr, wd) combination.
    print(f"Val Perplexity: {val_perplexity:.4f}")

### 🟡 Step 5: Train model for longer epochs using the best model from step 4.





In [None]:
model = LanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate).to(device)

In [None]:
lr = 1e-3
wd = 1e-3
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9, nesterov=True)

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
num_epochs = 10

for epoch in range(num_epochs):
  # One optimisation pass over the full training loader.
  model, loss_train, metric_train = train_one_epoch(
      model=model,
      train_loader=train_loader,
      loss_fn=criterion,
      optimizer=optimizer,
      metric=perplexity,
      epoch=epoch,
  )

  # Score the updated weights on the validation loader.
  loss_valid, metric_valid = evaluate(
      model=model,
      test_loader=val_loader,
      loss_fn=criterion,
      metric=perplexity,
  )

  # Record the learning curves.
  loss_train_hist.append(loss_train)
  metric_train_hist.append(metric_train)
  loss_valid_hist.append(loss_valid)
  metric_valid_hist.append(metric_valid)

  # Checkpoint whenever the validation loss improves.
  if loss_valid < best_loss_valid:
    best_loss_valid = loss_valid
    torch.save(model, f'model.pt')
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  epoch_counter = epoch_counter + 1

## 🟠 Main Loop

🔰 Define model.

In [None]:
# 'model.pt' holds a fully pickled module (it is written with torch.save(model, ...)),
# which the weights-only loader rejects, so weights_only is disabled explicitly.
# Unpickling can execute arbitrary code: only load checkpoints created by this notebook.
# map_location lets a checkpoint saved on another device load onto `device`.
model = torch.load('model.pt', map_location=device, weights_only=False).to(device)

🔰 Define optimizer and Set learning rate and weight decay.

In [None]:
lr = 1e-3
wd = 1e-3
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9, nesterov=True)

🔰 Write code to train the model for `num_epochs` epochs.

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
num_epochs = 10

for epoch in range(num_epochs):
  # Training pass over the full training loader.
  model, loss_train, metric_train = train_one_epoch(
      model=model,
      train_loader=train_loader,
      loss_fn=criterion,
      optimizer=optimizer,
      metric=perplexity,
      epoch=epoch,
  )

  # Validation pass with the weights produced by this epoch.
  loss_valid, metric_valid = evaluate(
      model=model,
      test_loader=val_loader,
      loss_fn=criterion,
      metric=perplexity,
  )

  # Keep the per-epoch history for the learning curves.
  loss_train_hist.append(loss_train)
  metric_train_hist.append(metric_train)
  loss_valid_hist.append(loss_valid)
  metric_valid_hist.append(metric_valid)

  # Save a checkpoint each time the validation loss reaches a new minimum.
  if loss_valid < best_loss_valid:
    best_loss_valid = loss_valid
    torch.save(model, f'model.pt')
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  epoch_counter = epoch_counter + 1

## 🟠 Plot

🔰 Plot learning curves

In [None]:
# Training vs. validation loss, one point per completed epoch.
plt.figure(figsize=(8, 6))

plt.plot(range(epoch_counter), loss_train_hist, 'r-', label='Train')
plt.plot(range(epoch_counter), loss_valid_hist, 'b-', label='Validation')

plt.xlabel('Epoch')
plt.ylabel('loss')
plt.grid(True)
plt.legend()

# 🔴 **Test**

🔰 Test your model using data from the test set

In [None]:
# NOTE(review): `model` is whatever the preceding cells left bound. When the
# notebook is run top to bottom that is the model after the last training
# epoch, not necessarily the best checkpoint stored in model.pt. Confirm this
# is the intended model to test.
test_loss, test_metric = evaluate(model, test_loader, criterion, perplexity)
print(f'Test: Loss = {test_loss:.4}, Metric = {test_metric:.4}')

# 🔴 **Generate**

🔰 Your mission is to write a `generate` function and use a desired sentence to evaluate the model

In [None]:
model_path = 'model.pt'
# The checkpoint is a fully pickled module (written with torch.save(model, ...)),
# which the weights-only loader rejects, so weights_only is disabled explicitly.
# Unpickling can execute arbitrary code: only load checkpoints created by this notebook.
# map_location places the model on `device` even if it was saved from another device.
model = torch.load(model_path, map_location=device, weights_only=False)
model.eval()

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, seed=None):
    """Greedy decoding: extend `prompt` with the most probable token at every step.

    NOTE: a later cell defines `generate` again (with multinomial sampling);
    when the notebook is run top to bottom, that later definition replaces
    this one.

    Args:
        prompt: text to continue.
        max_seq_len: maximum number of tokens to append to the prompt.
        temperature: divisor applied to the logits before the softmax.
        model: network mapping a (1, seq) index tensor to (1, seq, vocab) logits.
        tokenizer: callable turning a string into a list of tokens.
        vocab: vocabulary supporting `in`, `[]` and `lookup_token(index)`.
        seed: optional seed passed to `torch.manual_seed`.

    Returns:
        The prompt tokens followed by the generated tokens, joined by spaces.
        Generation stops early once `<eos>` is produced.

    Raises:
        ValueError: if no prompt token is present in the vocabulary.
    """
    if seed is not None:
        torch.manual_seed(seed)

    model.eval()

    # Build all tensors on the device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    prompt_ids = [vocab[token] for token in tokenizer(prompt) if token in vocab]
    if not prompt_ids:
        raise ValueError("prompt contains no in-vocabulary tokens")

    token_tensor = torch.tensor(prompt_ids, dtype=torch.long, device=run_device).unsqueeze(0)
    eos_index = vocab['<eos>']

    with torch.no_grad():
        for _ in range(max_seq_len):
            # The whole sequence is fed again at every step; only the logits of
            # the last position are used.
            output = model(token_tensor)
            output_probs = F.softmax(output[:, -1, :] / temperature, dim=-1)
            _, top_indices = output_probs.topk(1)

            token_index = top_indices.item()
            next_token = torch.tensor([[token_index]], dtype=torch.long, device=run_device)
            token_tensor = torch.cat([token_tensor, next_token], dim=-1)

            if token_index == eos_index:
                break

    # The tokenizer is a plain callable without a detokenizer, so the tokens
    # are simply joined with single spaces.
    generated_ids = token_tensor.squeeze(0).tolist()
    generated_text = " ".join(vocab.lookup_token(idx) for idx in generated_ids)

    return generated_text

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, seed=None):
    """Sample a continuation of `prompt` from the model, one token at a time.

    Args:
        prompt: text to continue.
        max_seq_len: maximum number of tokens to append to the prompt.
        temperature: positive divisor applied to the logits before the softmax.
        model: network mapping a (1, seq) index tensor to (1, seq, vocab) logits.
        tokenizer: callable turning a string into a list of tokens.
        vocab: vocabulary supporting `[]` and `lookup_token(index)`.
        seed: optional seed passed to `torch.manual_seed` for repeatable sampling.

    Returns:
        The prompt tokens followed by the sampled tokens, joined by spaces.
        Sampling stops early once `<eos>` is drawn.

    Raises:
        ValueError: if `temperature` is not positive or the prompt yields no tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    if seed is not None:
        torch.manual_seed(seed)

    model.eval()

    tokens = tokenizer(prompt)
    if not tokens:
        raise ValueError("prompt produced no tokens")

    # Build the index tensor directly on the device the model lives on.
    run_device = next(model.parameters()).device
    generated = torch.tensor(
        [vocab[token] for token in tokens], dtype=torch.long, device=run_device
    ).unsqueeze(0)
    eos_index = vocab["<eos>"]

    with torch.no_grad():
        for _ in range(max_seq_len):
            # Logits of the last position only, rescaled by the temperature.
            logits = model(generated)[:, -1, :] / temperature
            probabilities = F.softmax(logits, dim=-1)

            # multinomial returns a (1, 1) index tensor on the same device, so
            # it can be appended to the sequence as is.
            next_token = torch.multinomial(probabilities, num_samples=1)
            generated = torch.cat([generated, next_token], dim=1)

            if next_token.item() == eos_index:
                break

    # squeeze(0) drops only the batch dimension, so a one-token sequence still
    # yields a list of indices.
    generated_text = " ".join(vocab.lookup_token(idx) for idx in generated.squeeze(0).tolist())
    return generated_text