In [None]:
!pip install torch
!pip install transformers
!pip install datasets
!pip install tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from transformers import AutoTokenizer
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
import pandas as pd
from transformers import AutoModelForSequenceClassification

In [None]:
# Run-wide settings shared by every experiment in this notebook.
config = dict(
    seed=42,              # Your seed number, you can pick your lucky number. :)
    batch_size=16,        # batch size used by both DataLoaders
    learning_rate=1e-5,   # learning rate handed to AdamW
    nhead=4,              # attention heads of the hand-written models
    d_model=256,          # embedding width of the hand-written models
    dim_feedforward=256,  # hidden width of the feed-forward sub-layers
    _n_data=64,           # number of examples kept from each split
)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed every random number generator used below.
torch.manual_seed(config['seed'])
np.random.seed(config['seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(config['seed'])

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Dataset
|text | label|
|----|----|
|I rented I AM CURIOUS-YELLOW from my video sto...|      0|
|"I Am Curious: Yellow" is a risible and preten... |     0|


- [Kaggle](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)
- [Paper](https://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)

In [None]:
# Fetch the "imdb" dataset through Hugging Face `datasets`; the printed structure in the
# next cell shows a DatasetDict with `train`, `test` and `unsupervised` splits.
dataset = load_dataset("imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Overview of the available splits and their columns.
print(dataset)

# Copy the two labelled splits into pandas DataFrames for easier inspection.
train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

# First rows of each DataFrame.
print("Training DataFrame structure:", train_df.head(), sep="\n")
print("Test DataFrame structure:", test_df.head(), sep="\n")

# Number of examples in each labelled split.
print(
    f"Training dataset size: {len(dataset['train'])}",
    f"Test dataset size: {len(dataset['test'])}",
    sep="\n",
)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
Training DataFrame structure:
                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
Test DataFrame structure:
                                                text  label
0  I love sci-fi and am willing to put up with a ...      0
1  Worth the entertainment value of a rental, esp...      0
2  its a totally average film with a few semi-alr...      0
3  STAR RATING: ***** Saturday Night **** Friday ...  

## Tokenizer
A tokenizer is a tool or algorithm used in natural language processing (NLP) to break text down into smaller units called tokens. These tokens can be words, subwords, or characters, depending on the tokenization approach. The choice of tokenizer **depends on the specific NLP task** and the nature of the language being processed.

Example:
Text: “How are you”

- Character Tokenization -> [“H”, “o”, “w”, “ ”, “a”, “r”, “e”, “ ”, “y”, “o”, “u”]
- Word Tokenization -> [“How”, “are”, “you”]
- Sentence Tokenization -> [“How are you”]
- Subword Tokenization -> [“How”, “ a”, “re”, “ y”, “ou”]

Please refer to [BPE](https://platform.openai.com/tokenizer).

In [None]:
# Load the tokenizer that corresponds to the checkpoint used for training.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every text is truncated and padded to the tokenizer's maximum length, so all
    encoded sequences come back with the same length.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Tokenize every split of the dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


def _shuffled_subset(split_name):
    """Return the first `config['_n_data']` examples of a split after a seeded shuffle."""
    shuffled = tokenized_datasets[split_name].shuffle(seed=config['seed'])
    return shuffled.select(range(config['_n_data']))


# Small fixed-size subsets keep the experiments below fast.
small_train_dataset = _shuffled_subset("train")
small_eval_dataset = _shuffled_subset("test")

print(f"Vocabulary size: {len(tokenizer)}")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Vocabulary size: 30522


In [None]:
# def collate_fn(batch):
#     # This function processes a batch of data, stacking tensors and performing padding if necessary.
#     input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
#     attention_mask = torch.tensor([item["attention_mask"] for item in batch], dtype=torch.long)
#     labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)

#     return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
from torch.nn.utils.rnn import pad_sequence
import torch

def my_collate(batch):
    """Turn a list of tokenized examples into one dictionary of batched tensors.

    Each example must provide "input_ids", "attention_mask" and "label". Token ids
    and masks are converted to long tensors and zero-padded on the right to the
    longest sequence in the batch; labels become a 1-D long tensor.

    Returns a dict with the keys "input_ids", "attention_mask" and "labels".
    """
    def _stack_padded(key):
        # One long tensor per example, padded with 0 to a common length.
        rows = [torch.tensor(example[key], dtype=torch.long) for example in batch]
        return pad_sequence(rows, batch_first=True, padding_value=0)

    return {
        "input_ids": _stack_padded("input_ids"),
        "attention_mask": _stack_padded("attention_mask"),
        "labels": torch.tensor([example["label"] for example in batch], dtype=torch.long),
    }


# Batch both subsets with the custom collate function.
train_dataloader = DataLoader(
    dataset=small_train_dataset,
    batch_size=config['batch_size'],
    collate_fn=my_collate,
)
eval_dataloader = DataLoader(
    dataset=small_eval_dataset,
    batch_size=config['batch_size'],
    collate_fn=my_collate,
)

# Pull a single batch to check the tensor shapes.
first_batch = next(iter(train_dataloader))

# Report the shape of every tensor in that batch.
print("(batch_size, sequence_length) vector")
print(f"input_ids shape: {first_batch['input_ids'].shape}")
print(f"attention_mask shape: {first_batch['attention_mask'].shape}")
print(f"labels shape: {first_batch['labels'].shape}")

(batch_size, sequence_length) vector
input_ids shape: torch.Size([16, 512])
attention_mask shape: torch.Size([16, 512])
labels shape: torch.Size([16])


In [None]:
# Show the DatasetDict after tokenization; compared with the raw dataset each split
# now also lists the tokenizer columns (input_ids, token_type_ids, attention_mask).
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [None]:
# Temporarily switch the dataset to pandas output so a slice comes back as a DataFrame.
tokenized_datasets.set_format(type="pandas")
try:
    # Slice only the rows that are shown instead of materialising all 25,000
    # tokenized examples in memory just to display the first five.
    df = tokenized_datasets['train'][:5]
    display(df.head())
finally:
    # Always restore the default format, even if the preview fails, so later cells
    # keep receiving plain Python objects.
    tokenized_datasets.reset_format()

Unnamed: 0,text,label,input_ids,token_type_ids,attention_mask
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,"[101, 1045, 12524, 1045, 2572, 8025, 1011, 375...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"""I Am Curious: Yellow"" is a risible and preten...",0,"[101, 1000, 1045, 2572, 8025, 1024, 3756, 1000...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,If only to avoid making this type of film in t...,0,"[101, 2065, 2069, 2000, 4468, 2437, 2023, 2828...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,This film was probably inspired by Godard's Ma...,0,"[101, 2023, 2143, 2001, 2763, 4427, 2011, 2643...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"Oh, brother...after hearing about this ridicul...",0,"[101, 2821, 1010, 2567, 1012, 1012, 1012, 2044...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


| Special Token | Token ID (BERT) | Description                                  |
|---------------|-----------------|----------------------------------------------|
| `[PAD]`       | 0               | Padding token, used to pad sequences to the same length. |
| `[UNK]`       | 100             | Unknown token, used for words not in the vocabulary. |
| `[CLS]`       | 101             | Classification token, used at the start of a sentence for classification tasks. |
| `[SEP]`       | 102             | Separator token, used to separate two sequences or mark the end of a sequence. |
| `[MASK]`      | 103             | Mask token, used in masked language modeling to represent a missing word. |


In [None]:
# Look at the first training example as a plain dict (the output starts with its raw 'text').
next(iter(tokenized_datasets["train"]))

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

### Embedding
Example of tokenizing a sentence using the pre-trained BERT model 'bert-base-uncased.'
Please refer to [CSDN](https://blog.csdn.net/zhaohongfei_358/article/details/122809709) for more details.


In [None]:
# Load the tokenizer and a BERT sequence-classification model so that the model's
# input-embedding table can be inspected. Note: this rebinds `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
embedding_layer = model.get_input_embeddings()

# Row of the embedding matrix that belongs to the '[PAD]' token.
pad_token_id = tokenizer.pad_token_id  # this is typically 0 for BERT
pad_embedding = embedding_layer.weight[pad_token_id]

# Show the size of the vector, then the vector itself.
print(pad_embedding.shape)
print(f"Embedding for [PAD] token (token ID {pad_token_id}):", pad_embedding, sep="\n")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
Embedding for [PAD] token (token ID 0):
tensor([-1.0183e-02, -6.1549e-02, -2.6497e-02, -4.2061e-02,  1.1672e-03,
        -2.8272e-02, -4.4500e-02, -2.2465e-02, -4.6553e-03, -8.2129e-02,
        -5.0238e-03, -4.6508e-02, -4.9514e-02,  2.1517e-02, -1.6588e-02,
        -3.7279e-02, -7.2888e-02, -4.6671e-02,  1.9787e-03, -5.5847e-02,
        -2.8919e-02, -2.2304e-02, -4.4846e-03, -1.5506e-02, -1.0986e-01,
        -2.6746e-02,  8.3565e-03, -5.3755e-02,  8.1516e-03, -2.5817e-02,
        -2.8301e-02, -2.6342e-03, -1.7270e-02, -1.7444e-02, -5.0403e-02,
        -5.4036e-02, -3.3925e-02, -1.9397e-02, -6.2235e-02, -1.9178e-03,
        -3.0086e-02, -3.1459e-02, -5.0693e-02, -1.8174e-02,  6.8573e-03,
        -8.9839e-03, -1.1808e-02, -3.2866e-02, -3.8003e-03, -2.7472e-02,
        -3.3144e-02, -1.6076e-02, -5.8682e-02,  1.0107e-01, -2.9100e-02,
        -2.4062e-02, -1.5432e-02,  5.2106e-03, -2.3103e-03,  4.4728e-03,
        -1.1664e-02, -1.4309e-02,  1.0915e-01, -4.0001e-02, -2.907

## Training Loop

In [None]:
criterion = nn.CrossEntropyLoss()


def _extract_logits(output):
    """Return the logits tensor from a model's forward output.

    Mapping-style outputs (the pre-trained transformer model) must carry the tensor
    under the "logits" key; any other output is taken to be the logits tensor
    itself (the user-defined models).

    Raises:
        KeyError: if a mapping-style output has no "logits" entry.
    """
    if isinstance(output, dict):
        logits = output.get("logits", None)
        if logits is None:
            # Fail here with a clear message instead of passing None to the loss.
            raise KeyError("model output has no 'logits' entry")
        return logits
    return output


def _run_epoch(model, dataloader, criterion, optimizer, training, desc):
    """Shared loop behind train() and evaluate(): one full pass over `dataloader`.

    Loss and accuracy are accumulated per sample, so a smaller final batch does not
    get the same weight as a full batch. The loss weighting assumes `criterion`
    returns the mean over the batch (the default of nn.CrossEntropyLoss).
    """
    model.train(training)  # model.train(False) is the same as model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(training):
        for batch in tqdm(dataloader, desc=desc, leave=True):
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            if training:
                optimizer.zero_grad()

            logits = _extract_logits(model(input_ids, attention_mask=attention_mask))
            loss = criterion(logits, labels)

            if training:
                loss.backward()
                optimizer.step()

            batch_size = len(labels)
            _, predicted = torch.max(logits, 1)
            total_loss += loss.item() * batch_size
            total_correct += (predicted == labels).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        # Nothing was processed: report "no value" rather than dividing by zero.
        return float("nan"), float("nan")
    return total_loss / total_samples, total_correct / total_samples


def train(model, train_dataloader, optimizer, criterion):
    """Train `model` for one epoch.

    Args:
        model: module called as `model(input_ids, attention_mask=...)`.
        train_dataloader: iterable of batches with the keys "input_ids",
            "attention_mask" and "labels".
        optimizer: optimizer stepped once per batch.
        criterion: loss called as `criterion(logits, labels)`.

    Returns:
        (mean loss per sample, accuracy) over the epoch as Python floats.
    """
    return _run_epoch(model, train_dataloader, criterion, optimizer, training=True, desc="Training")


def evaluate(model, eval_dataloader, criterion):
    """Evaluate `model` without updating it.

    Args:
        model: module called as `model(input_ids, attention_mask=...)`.
        eval_dataloader: iterable of batches with the keys "input_ids",
            "attention_mask" and "labels".
        criterion: loss called as `criterion(logits, labels)`.

    Returns:
        (mean loss per sample, accuracy) over the data as Python floats.
    """
    return _run_epoch(model, eval_dataloader, criterion, optimizer=None, training=False, desc="Evaluating")

## Model

### Model_A: Encoder-Decoder Model
Please refer to [知乎](https://zhuanlan.zhihu.com/p/338817680) or [github](https://github.com/hyunwoongko/transformer) for more details. In this [知乎](https://zhuanlan.zhihu.com/p/338817680), there is a detailed step-by-step diagram specifically explaining the autoregressive process of the decoder.
- The decoder's `src` is the output from the encoder (the source sequence), while the `tgt` is the target sequence that the decoder generates step by step.
- It keeps generating tokens autoregressively until it encounters the `[EOS]` token, using previously generated tokens as input for the next step.
- Masking is applied to ensure the model only attends to past tokens, preventing it from looking ahead at future tokens during this process.

In [None]:
import torch
import torch.nn as nn

class EncoderDecoderClassifier(nn.Module):
    """
    Encoder-Decoder seq2label model: No auto regression.

    The token embeddings go through a 2-layer Transformer encoder, a 2-layer
    Transformer decoder attends to the encoder output, the decoder states are
    mean-pooled over the non-padding positions and a small MLP produces the logits.
    No positional encoding is added to the embeddings.

    vocab_size: total number of unique tokens (words, subwords, or characters); depends on the tokenizer.
    d_model: dimensionality of the token embeddings (each token is represented as a d_model-dimensional vector).
    nhead: number of attention heads in the multi-head attention layers.
    dim_feedforward: hidden size of the feed-forward network inside each Transformer layer.
    dropout: dropout probability used inside the Transformer layers.
    num_labels: number of output classes.
    """
    def __init__(self,
                 vocab_size,
                 d_model=256,
                 nhead=4,
                 dim_feedforward=256,
                 dropout=0.1,
                 num_labels=2) -> None:
        super().__init__()

        # embedding layer (shared by encoder and decoder inputs)
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Transformer encoder
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        # Nested tensors are disabled so the encoder output always keeps the padded
        # (batch_size, seq_length, d_model) shape the decoder masks refer to.
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2, enable_nested_tensor=False)

        # Transformer decoder
        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=2)

        # final classifier
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, num_labels),
        )

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None):
        """Return classification logits of shape (batch_size, num_labels).

        input_ids: (batch_size, seq_length) token ids.
        attention_mask: optional (batch_size, seq_length) mask, non-zero for real
            tokens and 0 for padding. When omitted, every position is used.
        decoder_input_ids: optional (batch_size, tgt_length) ids for the decoder;
            defaults to `input_ids`.
        """
        # True marks padding positions, the convention of the *_key_padding_mask arguments.
        padding_mask = None if attention_mask is None else attention_mask == 0

        # All layers are built with batch_first=True, so tensors stay in
        # (batch_size, seq_length, d_model) layout. Permuting to sequence-first
        # would make attention run across the samples of a batch.
        embedded = self.embedding(input_ids)

        # 1. Transformer encoder
        encoded = self.encoder(embedded, src_key_padding_mask=padding_mask)

        # 2. decoder input embedding
        if decoder_input_ids is None:
            # artificial target sequence: reuse the source tokens (and their padding mask)
            decoder_input_ids = input_ids
            tgt_padding_mask = padding_mask
        else:
            # no padding information is available for caller-supplied decoder ids
            tgt_padding_mask = None
        decoder_embedded = self.embedding(decoder_input_ids)

        # 3. Transformer decoder (tgt: decoder_embedded, memory: encoded)
        decoded = self.decoder(
            decoder_embedded,
            encoded,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=padding_mask,
        )  # (batch_size, tgt_length, d_model)

        # 4. mean-pool over the sequence, skipping padded positions
        if tgt_padding_mask is None:
            pooled = decoded.mean(dim=1)
        else:
            kept = decoded.masked_fill(tgt_padding_mask.unsqueeze(-1), 0.0)
            n_tokens = (~tgt_padding_mask).sum(dim=1, keepdim=True).clamp(min=1)
            pooled = kept.sum(dim=1) / n_tokens.to(kept.dtype)

        # 5. predict
        return self.pred_layer(pooled)


### Model_C: Decoder Model

In [None]:
# Define the Single Head Attention mechanism for the Transformer
class SingleHeadAttention(nn.Module):
    """One causal (lower-triangular) scaled dot-product self-attention head.

    d_model: width of the incoming embeddings.
    attention_dim: width of the key, query and value projections (the output width).
    """
    def __init__(self, d_model: int, attention_dim: int):
        super().__init__()
        # d_model = embedded_dim
        # Linear layers to generate key, query, and value vectors for attention
        self.key_gen = nn.Linear(d_model, attention_dim, bias=False)
        self.query_gen = nn.Linear(d_model, attention_dim, bias=False)
        self.value_gen = nn.Linear(d_model, attention_dim, bias=False)

    def forward(self, embedded: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        embedded: (batch_size, context_length, d_model)
        attention_mask: (batch_size, context_length); accepted for interface
            compatibility but not applied here. Only the causal mask is used.
            NOTE(review): with right-padded inputs the causal mask already keeps
            real tokens from attending to the padding behind them - confirm the
            padding side if the tokenizer changes.

        Returns a tensor of shape (batch_size, context_length, attention_dim).
        """
        # Generate key, query, and value vectors for self-attention
        k = self.key_gen(embedded)
        q = self.query_gen(embedded)
        v = self.value_gen(embedded)

        # Calculate attention scores (Q * K.T) and scale by sqrt(attention_dim)
        # (batch_size, context_length, context_length)
        scores = torch.matmul(q, k.transpose(-1, -2)) / (k.shape[-1] ** 0.5)

        # Mask out future tokens (for causal attention in autoregressive models).
        # The mask size and device come from the input itself, so the layer does not
        # depend on notebook-level globals and works for any sequence length.
        context_length = embedded.shape[-2]
        lower_tri = torch.tril(
            torch.ones(context_length, context_length, dtype=torch.bool, device=scores.device)
        )
        scores = scores.masked_fill(~lower_tri, float('-inf'))

        # Calculate the attention weights by applying softmax
        attention_weights = torch.softmax(scores, dim=-1)

        # Apply attention weights to the value vectors (Weighted sum of V)
        # (batch_size, context_length, attention_dim)
        return torch.matmul(attention_weights, v)


# Define Multi-Head Self Attention mechanism for Transformer
class MultiHeadedSelfAttention(nn.Module):
    """Runs `nhead` independent attention heads and concatenates their outputs.

    d_model: width of the incoming embeddings; also the width of the concatenated output.
    nhead: number of heads; each head works with d_model // nhead dimensions.

    Raises:
        ValueError: if `nhead` is not positive or does not divide `d_model`. Otherwise
            the concatenated output would silently have a width other than d_model.
    """
    def __init__(self, d_model: int, nhead: int):
        super().__init__()
        if nhead <= 0 or d_model % nhead != 0:
            raise ValueError(f"d_model ({d_model}) must be a positive multiple of nhead ({nhead})")
        # Create multiple single head attention layers
        self.att_heads = nn.ModuleList([SingleHeadAttention(d_model, d_model // nhead) for _ in range(nhead)])

    def forward(self, embedded: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        embedded: (batch_size, context_length, d_model)
        attention_mask: (batch_size, context_length)

        Returns the head outputs concatenated along the last dimension.
        """
        # Invert the mask before handing it to the heads: padding positions (0) become 1.0
        attention_mask = (attention_mask == 0).float()  # Mark padding positions as 1 for masking

        # Apply each attention head and concatenate the results
        head_outputs = [head(embedded, attention_mask) for head in self.att_heads]
        return torch.cat(head_outputs, dim=-1)


# Position-wise feed-forward network (FFN) used inside each Transformer block
class VanillaNeuralNetwork(nn.Module):
    """Two linear layers with a ReLU in between, followed by dropout.

    d_model: width of the input and of the output.
    dim_feedforward: width of the hidden layer.
    dropout: dropout probability applied to the output.
    """
    def __init__(self, d_model: int, dim_feedforward=256, dropout=0.1):
        super().__init__()
        # d_model -> dim_feedforward
        self.up_projection = nn.Linear(in_features=d_model, out_features=dim_feedforward)
        self.relu = nn.ReLU()
        # dim_feedforward -> d_model
        self.down_projection = nn.Linear(in_features=dim_feedforward, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply projection -> ReLU -> projection -> dropout to `x`."""
        hidden = self.up_projection(x)
        activated = self.relu(hidden)
        projected = self.down_projection(activated)
        return self.dropout(projected)


# One pre-norm Transformer block: self-attention and feed-forward, each with a residual
class TransformerBlock(nn.Module):
    """Layer-normalised self-attention and feed-forward sub-layers with residual connections.

    d_model: width of the token representations.
    nhead: number of attention heads.
    dim_feedforward: hidden width of the feed-forward network.
    dropout: dropout probability used by the feed-forward network.
    """
    def __init__(self, d_model: int, nhead: int, dim_feedforward=256, dropout=0.1):
        super().__init__()
        # Sub-layers
        self.attention = MultiHeadedSelfAttention(d_model, nhead)
        self.linear_network = VanillaNeuralNetwork(d_model, dim_feedforward, dropout)
        # Normalisation applied to the input of each sub-layer
        self.first_norm = nn.LayerNorm(d_model)
        self.second_norm = nn.LayerNorm(d_model)

    def forward(self, embedded: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return `embedded` after the attention and feed-forward residual updates."""
        # Residual update from self-attention on the normalised input.
        attended = self.attention(self.first_norm(embedded), attention_mask)
        after_attention = embedded + attended

        # Residual update from the feed-forward network on the normalised result.
        transformed = self.linear_network(self.second_norm(after_attention))
        return after_attention + transformed


# Define the main GUPT model (GPT model without pre-trained)
class GUPT(nn.Module):
    """
    Generative un-pre-trained transformer (decoder) used as a sequence classifier.

    Word and learned position embeddings are summed, passed through `num_blocks`
    Transformer blocks, normalised, projected, mean-pooled over the non-padding
    positions and mapped to `num_labels` logits.

    vocab_size: total number of unique tokens (words, subwords, or characters); depends on the tokenizer.
    context_length: number of position embeddings, i.e. the longest supported input sequence.
    d_model: dimensionality of the token embeddings (each token is represented as a d_model-dimensional vector).
    nhead: number of attention heads in each Transformer block.
    dim_feedforward: hidden width of the feed-forward network in each block.
    dropout: dropout probability passed to each block.
    num_blocks: number of stacked Transformer blocks.
    num_labels: number of output classes.
    """
    def __init__(self,
                 vocab_size: int,
                 context_length: int,
                 d_model=256,
                 nhead=4,
                 dim_feedforward=256,
                 dropout=0.1,
                 num_blocks=6,
                 num_labels=2) -> None:
        super().__init__()
        # Word embedding layer
        self.word_embeddings = nn.Embedding(vocab_size, d_model)
        # Position embedding layer
        self.position_embeddings = nn.Embedding(context_length, d_model)

        # Set up the Transformer blocks
        self.transformer_blocks = nn.ModuleList([TransformerBlock(d_model, nhead, dim_feedforward, dropout) for _ in range(num_blocks)])

        # Final layer normalization and a d_model -> d_model projection
        self.final_norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, d_model)

        # Prediction layer (final classification layer)
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, num_labels),
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return classification logits of shape (batch_size, num_labels).

        input_ids: (batch_size, context_length) token ids.
        attention_mask: (batch_size, context_length), non-zero for real tokens and 0 for padding.

        Raises:
            ValueError: if the input is longer than the number of position embeddings.
        """
        context_length = input_ids.shape[1]
        max_positions = self.position_embeddings.num_embeddings
        if context_length > max_positions:
            # Fail with a clear message instead of an out-of-range embedding lookup.
            raise ValueError(
                f"input length {context_length} exceeds the model's context_length {max_positions}"
            )

        # Calculate word and position embeddings
        embedded = self.word_embeddings(input_ids)
        positions = torch.arange(context_length, device=input_ids.device)
        embedded = embedded + self.position_embeddings(positions)

        # Pass through all the Transformer blocks
        for block in self.transformer_blocks:
            embedded = block(embedded, attention_mask)

        # Apply final LayerNorm and the linear projection
        raw_output = self.classifier(self.final_norm(embedded))

        # Mean-pool over the real tokens only, so that [PAD] positions do not dilute
        # the sequence representation; clamp guards against an all-padding row.
        keep = (attention_mask != 0).unsqueeze(-1).to(raw_output.dtype)
        pooled = (raw_output * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)

        return self.pred_layer(pooled)


---
## Experiment 1

In [None]:
# Experiment 1: the encoder-decoder classifier, trained from randomly initialised weights.
model = EncoderDecoderClassifier(
    vocab_size=len(tokenizer),
    d_model=config['d_model'],
    nhead=config['nhead'],
    dim_feedforward=config['dim_feedforward'],
    num_labels=2,
)
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=config['learning_rate'])

In [None]:
# Baseline: score the model on the evaluation subset before any training step.
print("======= before training =======")
eval_loss, eval_acc = evaluate(model, eval_dataloader, criterion)
print(f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_acc * 100:.2f}%")

# Three epochs on the training subset, re-evaluating after each one.
print("======= after training =======")
for epoch in range(3):
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    eval_loss, eval_acc = evaluate(model, eval_dataloader, criterion)

    print(
        f"Epoch {epoch + 1}",
        f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc * 100:.2f}%",
        f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_acc * 100:.2f}%",
        sep="\n",
    )



Evaluating: 100%|██████████| 4/4 [00:00<00:00,  4.73it/s]


Eval Loss: 0.7048, Eval Accuracy: 46.88%


Training: 100%|██████████| 4/4 [00:00<00:00,  5.15it/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 27.48it/s]


Epoch 1
Train Loss: 0.7048, Train Accuracy: 46.88%
Eval Loss: 0.7007, Eval Accuracy: 46.88%


Training: 100%|██████████| 4/4 [00:00<00:00, 13.40it/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 26.76it/s]


Epoch 2
Train Loss: 0.6964, Train Accuracy: 46.88%
Eval Loss: 0.6959, Eval Accuracy: 46.88%


Training: 100%|██████████| 4/4 [00:00<00:00, 13.09it/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 27.70it/s]

Epoch 3
Train Loss: 0.6915, Train Accuracy: 42.19%
Eval Loss: 0.6931, Eval Accuracy: 56.25%





---
## Experiment 2

In [None]:
# Experiment 2: start from the pre-trained BERT checkpoint with a 2-label classification head.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=config['learning_rate'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Baseline: score the model on the evaluation subset before fine-tuning.
print("======= before training =======")
eval_loss, eval_acc = evaluate(model, eval_dataloader, criterion)
print(f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_acc * 100:.2f}%")

# Three epochs on the training subset, re-evaluating after each one.
print("======= after training =======")
for epoch in range(3):
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    eval_loss, eval_acc = evaluate(model, eval_dataloader, criterion)

    print(
        f"Epoch {epoch + 1}",
        f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc * 100:.2f}%",
        f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_acc * 100:.2f}%",
        sep="\n",
    )



Evaluating: 100%|██████████| 4/4 [00:01<00:00,  2.32it/s]


Eval Loss: 0.7423, Eval Accuracy: 53.12%


Training: 100%|██████████| 4/4 [00:05<00:00,  1.36s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  2.34it/s]


Epoch 1
Train Loss: 0.7334, Train Accuracy: 53.12%
Eval Loss: 0.6959, Eval Accuracy: 54.69%


Training: 100%|██████████| 4/4 [00:05<00:00,  1.37s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  2.31it/s]


Epoch 2
Train Loss: 0.6454, Train Accuracy: 65.62%
Eval Loss: 0.6826, Eval Accuracy: 59.38%


Training: 100%|██████████| 4/4 [00:05<00:00,  1.40s/it]
Evaluating: 100%|██████████| 4/4 [00:01<00:00,  2.20it/s]

Epoch 3
Train Loss: 0.6339, Train Accuracy: 67.19%
Eval Loss: 0.6702, Eval Accuracy: 64.06%





---
## Experiment 3

In [None]:
# Experiment 3: the decoder-only GUPT model.
context_length = tokenizer.model_max_length  # longest sequence the tokenizer produces
num_blocks = 6  # Transformer layer

model = GUPT(
    vocab_size=len(tokenizer),
    context_length=context_length,
    d_model=config['d_model'],
    nhead=config['nhead'],
    dim_feedforward=config['dim_feedforward'],
    num_blocks=num_blocks,
    num_labels=2,
)
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=config['learning_rate'])

In [None]:
# Cross-entropy loss, as in the previous experiments.
criterion = nn.CrossEntropyLoss()

# Baseline: score the model on the evaluation subset before any training step.
print("======= before training =======")
eval_loss, eval_acc = evaluate(model, eval_dataloader, criterion)
print(f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_acc * 100:.2f}%")

# Three epochs on the training subset, re-evaluating after each one.
print("======= after training =======")
for epoch in range(3):
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    eval_loss, eval_acc = evaluate(model, eval_dataloader, criterion)

    print(
        f"Epoch {epoch + 1}",
        f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc * 100:.2f}%",
        f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_acc * 100:.2f}%",
        sep="\n",
    )



Evaluating: 100%|██████████| 4/4 [00:00<00:00, 10.98it/s]


Eval Loss: 0.6981, Eval Accuracy: 46.88%


Training: 100%|██████████| 4/4 [00:00<00:00,  6.24it/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 12.72it/s]


Epoch 1
Train Loss: 0.7028, Train Accuracy: 46.88%
Eval Loss: 0.6985, Eval Accuracy: 46.88%


Training: 100%|██████████| 4/4 [00:00<00:00,  6.48it/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 12.90it/s]


Epoch 2
Train Loss: 0.6989, Train Accuracy: 46.88%
Eval Loss: 0.6982, Eval Accuracy: 46.88%


Training: 100%|██████████| 4/4 [00:00<00:00,  6.78it/s]
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 12.86it/s]

Epoch 3
Train Loss: 0.6958, Train Accuracy: 46.88%
Eval Loss: 0.6978, Eval Accuracy: 46.88%





## Reference
1. https://ithelp.ithome.com.tw/articles/10298638
2. https://ithelp.ithome.com.tw/articles/10301854
3. https://github.com/kapadias/medium-articles/blob/master/natural-language-processing/transformers-series/sentiment_analysis_bert.ipynb
4. https://medium.com/ching-i/transformer-attention-is-all-you-need-c7967f38af14