# Exercise 7: Encoder

Copy the data loading, vectorization and `SelfAttention` implementation from the previous exercises.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from gensim.downloader import load as gensim_load
from datasets import load_dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


## C.1 Multi-Head Attention

In [2]:
# You can reuse the `SelfAttention` class from Exercise 6, but you might need to extend it to support masking (C.1.1.v).
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Projects the input to queries, keys and values of width ``d_k`` and
    returns ``softmax(Q K^T / sqrt(d_k)) V``.

    Args:
        d_model: Size of the last dimension of the input.
        d_k: Size of the query/key/value projections (and of the output).
        mask: Unused. Kept only so existing constructor calls keep working;
            masks are supplied per call to :meth:`forward`.
    """

    def __init__(self, d_model, d_k, mask=None):
        super().__init__()
        self.d_k = d_k
        self.WQ = nn.Linear(d_model, d_k)
        self.WK = nn.Linear(d_model, d_k)
        self.WV = nn.Linear(d_model, d_k)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(..., seq_len, d_model)``.
            mask: Optional tensor in which ``0`` marks positions that must not
                be attended to. Accepted layouts:

                * anything that broadcasts against the ``(..., seq_len, seq_len)``
                  score tensor, e.g. ``(seq_len, seq_len)`` or
                  ``(batch, 1, seq_len)``;
                * a key-padding mask with one entry per token, e.g.
                  ``(batch, seq_len)`` - the layout ``MultiHeadAttention``
                  accepts. It is expanded over the query axis.

                A 2-D mask whose shape equals ``(seq_len, seq_len)`` is always
                read as a full attention mask; when ``batch == seq_len`` pass a
                padding mask as ``(batch, 1, seq_len)`` to avoid the ambiguity.

        Returns:
            Tensor of shape ``(..., seq_len, d_k)``.
        """
        Q = self.WQ(x)
        K = self.WK(x)
        V = self.WV(x)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.d_k)

        if mask is not None:
            # A per-token padding mask has one dimension fewer than `scores`.
            # Insert the query axis so it broadcasts over every query row
            # instead of failing (or silently lining up with the wrong axis).
            is_per_token = (
                mask.dim() == scores.dim() - 1
                and mask.shape != scores.shape[-2:]
            )
            if is_per_token:
                mask = mask.unsqueeze(-2)
            scores = scores.masked_fill(mask == 0, float('-inf'))

        probabilities = F.softmax(scores, dim=-1)
        return torch.matmul(probabilities, V)

In [3]:
# C.1.1 Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    All heads are computed in parallel from three fused projections of width
    ``n_heads * d_k``; the concatenated head outputs are projected back to
    ``d_model`` by ``WO``.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        n_heads: Number of attention heads.
        d_k: Per-head width. Defaults to ``d_model // n_heads``. It does not
            have to satisfy ``n_heads * d_k == d_model``.
    """

    def __init__(self, d_model, n_heads, d_k=None):
        super().__init__()
        self.n_heads = n_heads
        self.d_k = d_k if d_k is not None else d_model // n_heads
        self.d_model = d_model

        inner_dim = n_heads * self.d_k
        self.WQ = nn.Linear(d_model, inner_dim)
        self.WK = nn.Linear(d_model, inner_dim)
        self.WV = nn.Linear(d_model, inner_dim)

        # The output projection consumes the concatenated heads, whose width is
        # n_heads * d_k - not necessarily d_model.
        self.WO = nn.Linear(inner_dim, d_model)

    def _split_heads(self, projected):
        """Reshape ``(batch, seq_len, n_heads * d_k)`` to ``(batch, n_heads, seq_len, d_k)``."""
        batch_size, seq_len, _ = projected.shape
        per_head = projected.view(batch_size, seq_len, self.n_heads, self.d_k)
        # Put the head axis before the sequence axis so matmul treats every
        # head as an independent batch entry.
        return per_head.transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply multi-head self-attention.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.
            mask: Optional tensor in which ``0`` marks positions that must not
                be attended to. Accepted shapes: ``(batch_size, seq_len)``
                key-padding mask, ``(batch_size, seq_len, seq_len)`` per-example
                attention mask, or any 4-D tensor that broadcasts against
                ``(batch_size, n_heads, seq_len, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.shape

        # Each of these has shape (batch_size, n_heads, seq_len, d_k).
        Q = self._split_heads(self.WQ(x))
        K = self._split_heads(self.WK(x))
        V = self._split_heads(self.WV(x))

        # (batch_size, n_heads, seq_len, seq_len): the score of every token
        # with respect to every other token of the same sequence, per head.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.d_k)

        if mask is not None:
            if mask.dim() == 2:
                # (batch, seq_len) -> (batch, 1, 1, seq_len)
                mask = mask[:, None, None, :]
            elif mask.dim() == 3:
                # (batch, seq_len, seq_len) -> (batch, 1, seq_len, seq_len)
                mask = mask[:, None, :, :]
            scores = scores.masked_fill(mask == 0, float('-inf'))

        probabilities = F.softmax(scores, dim=-1)
        outputs = torch.matmul(probabilities, V)  # (batch_size, n_heads, seq_len, d_k)

        # Back to (batch_size, seq_len, n_heads, d_k); contiguous() is required
        # before view() can merge the last two axes.
        outputs = outputs.transpose(1, 2).contiguous()
        concatenated_outputs = outputs.view(batch_size, seq_len, self.n_heads * self.d_k)

        return self.WO(concatenated_outputs)

In [4]:
# C.1.2 Add Residual Connection and LayerNorm
class MultiHeadBlock(nn.Module):
    """Attention sub-layer: multi-head attention, residual connection, then LayerNorm."""

    def __init__(self, d_model, n_heads, d_k=None):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, d_k)
        self.norm1 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Return ``LayerNorm(attention(x, mask) + x)``."""
        # Residual connection: add the block input to the attention output.
        residual_sum = self.attention(x, mask) + x
        # Normalise the summed output.
        return self.norm1(residual_sum)

## C.2 Feed-Forward Network

In [5]:
# C.2.1 Point-wise Feed-Forward Network
class FeedForward(nn.Module):
    """Point-wise feed-forward network: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model)."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.expand_layer = nn.Linear(d_model, d_ff)
        self.activation = nn.ReLU()
        self.reduce_layer = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply the non-linearity, and project back to ``d_model``."""
        hidden = self.activation(self.expand_layer(x))
        return self.reduce_layer(hidden)

In [6]:
# C.2 
class FeedForwardBlock(nn.Module):
    """Feed-forward sub-layer: FeedForward, residual connection, then LayerNorm."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.ff = FeedForward(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Return ``LayerNorm(ff(x) + x)``."""
        residual_sum = self.ff(x) + x
        return self.norm2(residual_sum)

## Task 3: Encoder Block and Sentiment Classifier

In [None]:
# Implement the Encoder Block, i.e., the combination of MultiHeadBlock and FeedForwardBlock
class EncoderBlock(nn.Module):
    """One encoder layer: a MultiHeadBlock followed by a FeedForwardBlock."""

    def __init__(self, d_model, n_heads, d_ff, d_k=None):
        super().__init__()
        self.multi_head_block = MultiHeadBlock(d_model=d_model, n_heads=n_heads, d_k=d_k)
        self.feed_forward_block = FeedForwardBlock(d_model=d_model, d_ff=d_ff)

    def forward(self, x, mask=None):
        """Run attention (with the optional mask), then the feed-forward sub-layer."""
        attended = self.multi_head_block(x, mask)
        return self.feed_forward_block(attended)

In [8]:
class SentimentClassifier(nn.Module):
    """Transformer-encoder text classifier.

    Token ids are embedded, summed with learned positional embeddings, passed
    through ``n_layers`` encoder blocks, and the representation at position 0
    is fed to a linear classification head.

    Args:
        vocab_size: Number of rows in the word-embedding table.
        d_model: Embedding / hidden width.
        n_heads: Attention heads per encoder block.
        d_ff: Hidden width of each feed-forward sub-layer.
        max_len: Number of learned positions, i.e. the longest sequence the
            model can process.
        d_k: Per-head width forwarded to the encoder blocks.
        num_classes: Number of output logits.
        n_layers: Number of stacked encoder blocks.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, max_len = 512, d_k=None, num_classes=2, n_layers=2):
        super().__init__()

        self.n_layers = n_layers
        self.d_model = d_model
        self.word_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_embedding = nn.Embedding(max_len, d_model)  # Positional embeddings for sequence order

        self.encoder_blocks = nn.ModuleList(
            [EncoderBlock(d_model = d_model, n_heads = n_heads, d_ff = d_ff, d_k = d_k) for _ in range(n_layers)]
        )
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x_ids, mask=None):
        """Compute class logits.

        Args:
            x_ids: Integer tensor of token ids, shape ``(batch_size, seq_len)``.
            mask: Optional mask handed unchanged to every encoder block.

        Returns:
            Tensor of shape ``(batch_size, num_classes)`` with unnormalised logits.

        Raises:
            IndexError: If ``seq_len`` exceeds the number of learned positions.
        """
        _, seq_len = x_ids.shape

        # Fail with an actionable message instead of the opaque out-of-range
        # error the embedding lookup would raise for unseen positions.
        n_positions = self.positional_embedding.num_embeddings
        if seq_len > n_positions:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_len {n_positions} "
                "of the positional embedding"
            )

        positions = torch.arange(0, seq_len, device=x_ids.device)
        # The (seq_len, d_model) positional term broadcasts over the batch axis.
        x = self.word_embedding(x_ids) + self.positional_embedding(positions)
        for block in self.encoder_blocks:
            x = block(x, mask)

        # Classify from the representation of the first token of each sequence.
        first_token = x[:, 0, :]
        return self.classifier(first_token)

## Task 4: Data Preparation (same as in Exercise 6)

In [9]:
# Load the pretrained 'glove-wiki-gigaword-100' vectors via gensim's downloader.
# In this notebook only `vectorize` reads `glove`; SentimentClassifier learns
# its own nn.Embedding table instead.
glove = gensim_load('glove-wiki-gigaword-100')

In [10]:
def load_imdb(n_samples=100, seed=42, split="train"):
    """Load a class-balanced, shuffled subset of the IMDB dataset.

    Args:
        n_samples: Total number of reviews requested. Half are drawn from each
            class, so an odd value yields ``n_samples - 1`` reviews.
        seed: Seed used for every shuffle (per-class sampling and the final
            mix), making the subset reproducible.
        split: Name of the dataset split to sample from.

    Returns:
        ``(texts, labels)`` - the ``"text"`` and ``"label"`` columns of the
        balanced subset, in matching order.
    """
    dataset = load_dataset("imdb", split=split)

    # How many samples per class
    n_per_class = n_samples // 2

    def sample_class(label):
        """Return `n_per_class` shuffled examples carrying the given label."""
        only_label = dataset.filter(lambda x: x["label"] == label)
        return only_label.shuffle(seed=seed).select(range(n_per_class))

    pos = sample_class(1)
    neg = sample_class(0)

    # Combine and shuffle so the two classes are interleaved.
    balanced = concatenate_datasets([pos, neg]).shuffle(seed=seed)

    return balanced["text"], balanced["label"]

# Draw a balanced subset of 10,000 reviews (5,000 per class) from the IMDB training split.
texts, labels = load_imdb(n_samples=10000)

In [11]:
from sklearn.model_selection import train_test_split

# Materialise both columns as plain Python lists before handing them to scikit-learn.
texts_list = list(texts)
labels_list = list(labels)

# Hold out 20% of the reviews for validation; stratifying on the labels keeps
# the class balance identical in both splits, and the fixed random_state makes
# the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts_list, 
    labels_list, 
    test_size=0.2, 
    random_state=42, 
    stratify=labels_list
)


In [12]:
def build_vocab(tokenized_texts):
    """Build a deterministic token -> id mapping.

    Ids 0-3 are reserved for ``[PAD]``, ``[UNK]``, ``[CLS]`` and ``[SEP]``.
    Every other distinct token receives the next free id in sorted order, so
    the same corpus always produces the same vocabulary regardless of hash
    randomisation between interpreter runs.

    Args:
        tokenized_texts: Iterable of token sequences (tokens must be mutually
            comparable, e.g. all strings).

    Returns:
        dict mapping each token to a unique integer id; ids are contiguous
        from 0 to ``len(vocab) - 1``.
    """
    vocab = {
        '[PAD]': 0,
        '[UNK]': 1,
        '[CLS]': 2,
        '[SEP]': 3,
    }

    unique_tokens = {token for text in tokenized_texts for token in text}

    # Skip tokens that collide with a reserved special token so its id is
    # never overwritten, and sort to make the assignment reproducible.
    for token in sorted(unique_tokens.difference(vocab)):
        vocab[token] = len(vocab)

    return vocab

In [13]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tokenizer data used by `word_tokenize`. Both resource names are
# requested - presumably because different NLTK releases look for one or the
# other (TODO confirm against the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize(text):
    """Split `text` with `word_tokenize` and return the tokens lower-cased."""
    lowered = []
    for word in word_tokenize(text):
        lowered.append(word.lower())
    return lowered

[nltk_data] Downloading package punkt to /home/twoface/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/twoface/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
def add_special_tokens(tokens, max_len=128):
    """Frame a token list with ``[CLS]``/``[SEP]`` and fit it to ``max_len``.

    The content tokens are truncated *before* the closing ``[SEP]`` is added,
    so a sequence that is too long still ends with ``[SEP]`` rather than with
    an arbitrary word. Shorter sequences are right-padded with ``[PAD]``.

    Args:
        tokens: Sequence of token strings; it is not modified.
        max_len: Exact length of the returned list. Values <= 0 yield ``[]``.

    Returns:
        A new list of exactly ``max_len`` tokens. With ``max_len == 1`` only
        ``[CLS]`` fits.
    """
    if max_len <= 0:
        return []

    # Reserve two slots for [CLS] and [SEP].
    content_budget = max(max_len - 2, 0)
    framed = (["[CLS]"] + list(tokens[:content_budget]) + ["[SEP]"])[:max_len]
    framed.extend(["[PAD]"] * (max_len - len(framed)))
    return framed

In [15]:
def vectorize(tokens, max_len=128):
    """Look up GloVe vectors for the first ``max_len`` tokens.

    Tokens missing from the module-level ``glove`` model, and positions beyond
    the end of ``tokens``, are represented by all-zero rows.

    Args:
        tokens: Sliceable sequence of token strings.
        max_len: Number of rows in the result.

    Returns:
        ``numpy.ndarray`` of shape ``(max_len, glove.vector_size)`` and dtype
        ``float32``. The dtype is fixed so it no longer depends on whether an
        out-of-vocabulary token happened to be present.
    """
    embed_dim = glove.vector_size
    # Pre-allocating zeros covers both unknown tokens and padding rows.
    matrix = np.zeros((max_len, embed_dim), dtype=np.float32)
    for row, token in enumerate(tokens[:max_len]):
        if token in glove:
            matrix[row] = glove[token]
    return matrix

In [16]:
def create_attention_mask(tokens):
    """Return a 0/1 list flagging real tokens (1) versus ``[PAD]`` (0).

    `tokens` is expected to be the token list *after* special tokens and
    padding have been added.
    """
    mask = []
    for token in tokens:
        mask.append(0 if token == "[PAD]" else 1)
    return mask

In [17]:
def preprocess_text(text, vocab, max_len=128):
    """Turn one raw text into ``(token_ids, attention_mask)`` NumPy arrays.

    The text is tokenized, framed/padded by `add_special_tokens`, and each
    token is mapped through `vocab`; tokens absent from `vocab` get the id of
    ``[UNK]``.
    """
    padded_tokens = add_special_tokens(tokenize(text), max_len)
    attention = create_attention_mask(padded_tokens)

    token_ids = []
    for token in padded_tokens:
        token_ids.append(vocab.get(token, vocab['[UNK]']))

    return np.array(token_ids), np.array(attention)

def preprocess(texts, vocab, max_len=128):
    """Convert an iterable of raw texts into id and mask matrices.

    Each text is handled by `preprocess_text`, so single-text and batch
    preprocessing can never drift apart.

    Args:
        texts: Iterable of raw strings.
        vocab: Token -> id mapping containing at least ``'[UNK]'``.
        max_len: Length every sequence is truncated/padded to.

    Returns:
        ``(token_ids, attention_masks)`` - two NumPy arrays with one row per
        text and ``max_len`` columns. For empty input both are integer arrays
        of shape ``(0, max_len)`` instead of 1-D empty arrays.
    """
    all_token_ids = []
    all_attention_masks = []

    for text in texts:
        ids, mask = preprocess_text(text, vocab, max_len)
        all_token_ids.append(ids)
        all_attention_masks.append(mask)

    if not all_token_ids:
        # np.array([]) would be 1-D float; keep the documented 2-D integer layout.
        empty_shape = (0, max(max_len, 0))
        return np.zeros(empty_shape, dtype=np.int64), np.zeros(empty_shape, dtype=np.int64)

    return np.array(all_token_ids), np.array(all_attention_masks)

In [18]:
# Create X (token ids), y (labels) and attention masks for both splits.
max_len = 128
# The vocabulary is built from the training texts only, so validation tokens
# never seen during training are mapped to [UNK] by `preprocess`.
vocab = build_vocab([tokenize(text) for text in train_texts])
train_token_ids, train_masks = preprocess(train_texts, vocab, max_len)
X_train = torch.tensor(train_token_ids, dtype=torch.long)
y_train = torch.tensor(train_labels, dtype=torch.long)
masks_train = torch.tensor(train_masks, dtype=torch.long)

# Validation data goes through the same pipeline with the training vocabulary.
val_token_ids, val_masks = preprocess(val_texts, vocab, max_len)
X_val = torch.tensor(val_token_ids, dtype=torch.long)
y_val = torch.tensor(val_labels, dtype=torch.long)
masks_val = torch.tensor(val_masks, dtype=torch.long)

## Task 5: Training

In [19]:
# Choose your model parameters
# Remember the rules of thumb:
# - d_ff = 4*d_model
# - d_k = d_model // n_heads 

vocab_size = len(vocab)  # one embedding row per vocabulary entry, special tokens included
d_model = 128
n_heads = 8
d_ff = 4 * d_model
d_k = d_model // n_heads

In [20]:
# Define the model, loss and optimizer, then move the model to the GPU when one is available.
# max_len is left at the classifier's default of 512 positions, which covers
# the 128-token sequences produced by the preprocessing above.
model = SentimentClassifier(vocab_size=vocab_size, d_model=d_model, n_heads=n_heads, d_ff=d_ff, d_k=d_k, n_layers=3)
criterion = nn.CrossEntropyLoss()  # expects raw logits and integer class labels
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# As the last expression of the cell, this also echoes the model summary.
model.to(device)

SentimentClassifier(
  (word_embedding): Embedding(62177, 128)
  (positional_embedding): Embedding(512, 128)
  (encoder_blocks): ModuleList(
    (0-2): 3 x EncoderBlock(
      (multi_head_block): MultiHeadBlock(
        (attention): MultiHeadAttention(
          (WQ): Linear(in_features=128, out_features=128, bias=True)
          (WK): Linear(in_features=128, out_features=128, bias=True)
          (WV): Linear(in_features=128, out_features=128, bias=True)
          (WO): Linear(in_features=128, out_features=128, bias=True)
        )
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (feed_forward_block): FeedForwardBlock(
        (ff): FeedForward(
          (expand_layer): Linear(in_features=128, out_features=512, bias=True)
          (activation): ReLU()
          (reduce_layer): Linear(in_features=512, out_features=128, bias=True)
        )
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (classifier): Linea

In [21]:
# Print the total number of trainable parameters of the model.
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params:,}")

# Reference sizes used purely for a rough sense of scale.
bert_base_params = 110_000_000  # BERT base model has ~110M parameters
# NOTE(review): the GPT-4 figure is an unofficial estimate, not a published number - treat as illustrative.
gpt_4_params = 1_700_000_000_000  # GPT-4 has ~1.7T parameters
print(f"Model parameters relative to BERT base: {100 * total_params / bert_base_params:.6f}%")
print(f"Model parameters relative to GPT-4: {100 * total_params / gpt_4_params:.6f}%")

Total trainable parameters: 8,619,266
Model parameters relative to BERT base: 7.835696%
Model parameters relative to GPT-4: 0.000507%


In [22]:
# Training configuration and data loaders (the training loop itself is in the next cell).
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
NUM_EPOCHS = 5
BATCH_SIZE = 32
# Each dataset item is a (token_ids, attention_mask, label) triple.
train_dataset = TensorDataset(X_train, masks_train, y_train)
val_dataset = TensorDataset(X_val, masks_val, y_val)
# Only the training data is reshuffled; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
# NOTE(review): redundant - the training loop calls model.train() at the start
# of every epoch; here it mainly echoes the model summary as cell output.
model.train()

SentimentClassifier(
  (word_embedding): Embedding(62177, 128)
  (positional_embedding): Embedding(512, 128)
  (encoder_blocks): ModuleList(
    (0-2): 3 x EncoderBlock(
      (multi_head_block): MultiHeadBlock(
        (attention): MultiHeadAttention(
          (WQ): Linear(in_features=128, out_features=128, bias=True)
          (WK): Linear(in_features=128, out_features=128, bias=True)
          (WV): Linear(in_features=128, out_features=128, bias=True)
          (WO): Linear(in_features=128, out_features=128, bias=True)
        )
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (feed_forward_block): FeedForwardBlock(
        (ff): FeedForward(
          (expand_layer): Linear(in_features=128, out_features=512, bias=True)
          (activation): ReLU()
          (reduce_layer): Linear(in_features=512, out_features=128, bias=True)
        )
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (classifier): Linea

In [23]:
from sklearn.metrics import accuracy_score, f1_score

# Train for NUM_EPOCHS epochs; after each epoch evaluate on the validation set
# and print the mean per-batch losses plus accuracy and weighted F1.
for epoch in range(NUM_EPOCHS):
    # Training phase
    model.train()
    total_loss = 0.0  # running sum of per-batch training losses
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{NUM_EPOCHS}"):
        batch_x = batch[0].to(device)  # Token IDs
        batch_masks = batch[1].to(device)  # Attention masks
        batch_y = batch[2].to(device)  # Labels
        
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        
        outputs = model(batch_x, mask=batch_masks)  # class logits
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Mean of the per-batch losses (each batch weighs the same, even a smaller last one).
    avg_train_loss = total_loss / len(train_loader)
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    all_predictions = []
    all_labels = []
    
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{NUM_EPOCHS}"):
            batch_x = batch[0].to(device)
            batch_masks = batch[1].to(device)
            batch_y = batch[2].to(device)
            
            outputs = model(batch_x, mask=batch_masks)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            
            # Get predictions: the class with the highest logit
            predictions = torch.argmax(outputs, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())
    
    avg_val_loss = val_loss / len(val_loader)
    
    # Calculate metrics over the whole validation set
    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}:")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f}")
    print(f"  Val Accuracy: {accuracy:.4f}")
    print(f"  Val F1-Score: {f1:.4f}")
    print("-" * 50)
    

Training Epoch 1/5: 100%|██████████| 250/250 [00:05<00:00, 46.18it/s]
Training Epoch 1/5: 100%|██████████| 250/250 [00:05<00:00, 46.18it/s]
Validation Epoch 1/5: 100%|██████████| 63/63 [00:00<00:00, 184.98it/s]
Validation Epoch 1/5: 100%|██████████| 63/63 [00:00<00:00, 184.98it/s]


Epoch 1/5:
  Train Loss: 0.6901
  Val Loss: 0.6726
  Val Accuracy: 0.5685
  Val F1-Score: 0.5057
--------------------------------------------------


Training Epoch 2/5: 100%|██████████| 250/250 [00:04<00:00, 50.56it/s]
Training Epoch 2/5: 100%|██████████| 250/250 [00:04<00:00, 50.56it/s]
Validation Epoch 2/5: 100%|██████████| 63/63 [00:00<00:00, 190.10it/s]
Validation Epoch 2/5: 100%|██████████| 63/63 [00:00<00:00, 190.10it/s]


Epoch 2/5:
  Train Loss: 0.6363
  Val Loss: 0.6186
  Val Accuracy: 0.6455
  Val F1-Score: 0.6286
--------------------------------------------------


Training Epoch 3/5: 100%|██████████| 250/250 [00:04<00:00, 54.22it/s]
Training Epoch 3/5: 100%|██████████| 250/250 [00:04<00:00, 54.22it/s]
Validation Epoch 3/5: 100%|██████████| 63/63 [00:00<00:00, 202.05it/s]
Validation Epoch 3/5: 100%|██████████| 63/63 [00:00<00:00, 202.05it/s]


Epoch 3/5:
  Train Loss: 0.5733
  Val Loss: 0.5616
  Val Accuracy: 0.7030
  Val F1-Score: 0.7019
--------------------------------------------------


Training Epoch 4/5: 100%|██████████| 250/250 [00:04<00:00, 54.37it/s]
Training Epoch 4/5: 100%|██████████| 250/250 [00:04<00:00, 54.37it/s]
Validation Epoch 4/5: 100%|██████████| 63/63 [00:00<00:00, 197.15it/s]



Epoch 4/5:
  Train Loss: 0.5100
  Val Loss: 0.5589
  Val Accuracy: 0.7120
  Val F1-Score: 0.7097
--------------------------------------------------


Training Epoch 5/5: 100%|██████████| 250/250 [00:04<00:00, 53.78it/s]
Training Epoch 5/5: 100%|██████████| 250/250 [00:04<00:00, 53.78it/s]
Validation Epoch 5/5: 100%|██████████| 63/63 [00:00<00:00, 190.00it/s]

Epoch 5/5:
  Train Loss: 0.4433
  Val Loss: 0.5859
  Val Accuracy: 0.7085
  Val F1-Score: 0.7048
--------------------------------------------------





## Task 6: Inference

In [24]:
def predict_review_sentiment(text, model = model):
    """Classify one review as ``"Positive"`` or ``"Negative"``.

    Args:
        text: Raw review text.
        model: Classifier to use. The default is bound when this function is
            *defined*, so re-run this cell after re-creating the global
            ``model`` (or pass the new model explicitly).

    Returns:
        ``(prediction, probability)`` - the predicted label and the softmax
        probability assigned to it.

    Side effects:
        Puts ``model`` into evaluation mode and leaves it there.
    """
    model.eval()

    # Send the inputs to wherever the supplied model lives rather than to the
    # notebook-global `device`, so an explicitly passed model on another
    # device still works. Fall back to `device` for parameter-less models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    # max_len must match the sequence length used during training.
    token_ids, attention_mask = preprocess_text(text, vocab, max_len=128)
    # unsqueeze(0) adds the batch dimension the model expects.
    token_ids = torch.as_tensor(token_ids, dtype=torch.long).unsqueeze(0).to(target_device)
    attention_mask = torch.as_tensor(attention_mask, dtype=torch.long).unsqueeze(0).to(target_device)

    with torch.no_grad():
        outputs = model(token_ids, mask=attention_mask)
        probabilities = F.softmax(outputs, dim=1)

    predicted_class = torch.argmax(probabilities, dim=1).item()
    prediction = "Positive" if predicted_class == 1 else "Negative"
    probability = probabilities[0][predicted_class].item()
    return prediction, probability

In [25]:
predict_review_sentiment("I love this movie! It's fantastic and uplifting.")  # Should return "Positive"

('Positive', 0.9527169466018677)

In [26]:
predict_review_sentiment("This movie was terrible. I hated it.")  # Should return "Negative"

('Negative', 0.9466676115989685)

In [27]:
predict_review_sentiment("I didn't like the ending, but I love the music.") # Mixed sentiment

('Positive', 0.8370497822761536)

In [28]:
# First clause of the mixed-sentiment example, on its own.
predict_review_sentiment("I didn't like the ending.")

('Negative', 0.8841147422790527)

In [29]:
# Second clause of the mixed-sentiment example without the leading "but"
# (casing is irrelevant: `tokenize` lower-cases every token).
predict_review_sentiment("i love the music.")

('Positive', 0.7118408679962158)

In [30]:
# Same clause with the leading "but" kept.
predict_review_sentiment("but i love the music.")

('Positive', 0.934095561504364)

In [31]:
# Two-word fragment of the clause above.
predict_review_sentiment("i love")

('Positive', 0.849819540977478)

In [38]:
# Hedged, lukewarm wording.
# NOTE(review): this cell's execution count (38) is out of sequence with its
# neighbours - re-run the notebook top to bottom before sharing.
predict_review_sentiment("well... I guess it was okay")

('Positive', 0.5293876528739929)

In [33]:
# Strongly negative wording.
predict_review_sentiment("movie was terrible, very bad")

('Negative', 0.9960319399833679)