### Import Required Classes

In [1]:
import torch
import torch.nn as nn
from Architectures.Basic_Sequence_classification import Sequence_Classification

### Prepare Model

In [2]:
class Config:
    def __init__(self,
                vocab_size,
                embed_dim=128,
                num_layers=10,
                num_heads=8,
                ff_dim=512,
                pre_normalization=True,
                max_position_embeddings=128,
                dropout_prob=0.1,
                num_labels=6,
                device = "cuda"):
        self.device = device
        self.vocab_size = vocab_size  # Tokenizer vocab size
        self.embed_dim = embed_dim  # Embedding & input to attention
        self.num_layers = num_layers  # Number of encoder layers
        self.num_heads = num_heads  # Number of heads in Multi-Head Attention
        self.ff_dim = ff_dim  # Feed Forward hidden dimension
        self.pre_normalization = pre_normalization  # LayerNorm before or after attention/FFN
        self.max_length = max_position_embeddings  # Max sequence length
        self.dropout_prob = dropout_prob  # Dropout probability
        self.num_classess = num_labels  # Output classes (for classification)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
model = Sequence_Classification(config=Config(vocab_size=30522)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
loss_fn = nn.CrossEntropyLoss()

In [5]:
model.load_state_dict(torch.load("best_model.pt"))
model.classification_head[3] = nn.Linear(in_features=128, out_features=2)

  model.load_state_dict(torch.load("best_model.pt"))


In [6]:
print(model)

Sequence_Classification(
  (position_embedding): SinusoidalEmbeddingLayer(
    (embedding): Embedding(30522, 128)
    (layer_norm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-9): 10 x TransformerEncoderLayer(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=128, out_features=128, bias=True)
          (key): Linear(in_features=128, out_features=128, bias=True)
          (value): Linear(in_features=128, out_features=128, bias=True)
          (output): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): FeedForward2(
          (fc1): Linear(in_features=128, out_features=512, bias=True)
          (fc2): Linear(in_features=512, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (gelu): GELU(approximate='none')
        )
        (norm1): LayerNorm((128,), eps=1e-05, elemen

### Prepare Dataset

In [None]:
from datasets import load_dataset

ds = load_dataset("hungnm/multilingual-amazon-review-sentiment-processed", token="")

In [8]:
ds_en = ds.filter(lambda example: example['language'] == 'en')

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
MAX_LEN = 128

In [10]:
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=MAX_LEN)

encoded_dataset = ds_en.map(encode)
ds_en

Map:   0%|          | 0/7792 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['stars', 'text', 'language', 'label', 'len', 'valid', 'input_ids', 'attention_mask'],
        num_rows: 216904
    })
    validation: Dataset({
        features: ['stars', 'text', 'language', 'label', 'len', 'valid', 'input_ids', 'attention_mask'],
        num_rows: 7792
    })
    test: Dataset({
        features: ['stars', 'text', 'language', 'label', 'len', 'valid', 'input_ids', 'attention_mask'],
        num_rows: 7811
    })
})

In [11]:
len(encoded_dataset['test']['attention_mask'][0])

128

In [22]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['stars', 'text', 'language', 'label', 'len', 'valid', 'input_ids', 'attention_mask', 'token_type_ids'],
        num_rows: 216904
    })
    validation: Dataset({
        features: ['stars', 'text', 'language', 'label', 'len', 'valid', 'input_ids', 'attention_mask', 'token_type_ids'],
        num_rows: 7792
    })
    test: Dataset({
        features: ['stars', 'text', 'language', 'label', 'len', 'valid', 'input_ids', 'attention_mask', 'token_type_ids'],
        num_rows: 7811
    })
})

In [12]:
from torch.utils.data import DataLoader, TensorDataset, Dataset

In [23]:
class Amazon_Dataset(Dataset):
    def __init__(self, dataset):
        self.data = dataset  # HuggingFace Dataset object

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]  # Access row as dict
        return {
            'input_ids': torch.tensor(item['input_ids']),
            'attention_mask': torch.tensor(item['attention_mask']),
            'labels': torch.tensor(item['label'])
        }

train_dataset = Amazon_Dataset(encoded_dataset['train'])
val_dataset = Amazon_Dataset(encoded_dataset['validation'])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32)

In [14]:
len(val_dataset)

7792

In [15]:
len(val_loader)

244

### Train and Validate

In [26]:
model = model.to(device)

In [24]:
from sklearn.metrics import accuracy_score, f1_score

In [29]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits if hasattr(outputs, 'logits') else outputs
        
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx} Loss: {loss.item():.4f}")
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")
    
    # Validation
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            logits = model(input_ids, attention_mask)
            predictions = torch.argmax(logits, dim=1)
            
            preds.extend(predictions.cpu().numpy())
            targets.extend(labels.cpu().numpy())
    
    acc = accuracy_score(targets, preds)
    f1 = f1_score(targets, preds, average='macro')
    print(f"Validation Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")


Batch 0 Loss: 0.3060
Batch 100 Loss: 0.3129
Batch 200 Loss: 0.3244
Batch 300 Loss: 0.4366
Batch 400 Loss: 0.3293
Batch 500 Loss: 0.1957
Batch 600 Loss: 0.5469
Batch 700 Loss: 0.2543
Batch 800 Loss: 0.4415
Batch 900 Loss: 0.2342
Batch 1000 Loss: 0.2883
Batch 1100 Loss: 0.3154
Batch 1200 Loss: 0.3352
Batch 1300 Loss: 0.3172
Batch 1400 Loss: 0.1305
Batch 1500 Loss: 0.2240
Batch 1600 Loss: 0.2258
Batch 1700 Loss: 0.2931
Batch 1800 Loss: 0.2104
Batch 1900 Loss: 0.4190
Batch 2000 Loss: 0.1664
Batch 2100 Loss: 0.4007
Batch 2200 Loss: 0.2938
Batch 2300 Loss: 0.1625
Batch 2400 Loss: 0.2281
Batch 2500 Loss: 0.2306
Batch 2600 Loss: 0.1883
Batch 2700 Loss: 0.3605
Batch 2800 Loss: 0.1346
Batch 2900 Loss: 0.1912
Batch 3000 Loss: 0.2180
Batch 3100 Loss: 0.2077
Batch 3200 Loss: 0.1682
Batch 3300 Loss: 0.2153
Batch 3400 Loss: 0.3466
Batch 3500 Loss: 0.2246
Batch 3600 Loss: 0.2531
Batch 3700 Loss: 0.2182
Batch 3800 Loss: 0.3488
Batch 3900 Loss: 0.4703
Batch 4000 Loss: 0.2481
Batch 4100 Loss: 0.2766
Batc

In [30]:
# Save entire model
torch.save(model.state_dict(), 'model_weights.pt')

In [31]:
import torch

# Example dummy input for BERT-like models (batch_size=1, sequence_length=128)
dummy_input_ids = torch.randint(0, 1000, (1, 128)).to(device)
dummy_attention_mask = torch.ones((1, 128)).to(device)

In [34]:
torch.onnx.export(
    model,  # Your model
    (dummy_input_ids, dummy_attention_mask),  # Tuple of inputs
    "sequence_classifier.onnx",  # Output file
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'sequence_length'},
        'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
        'logits': {0: 'batch_size'}
    },
    opset_version=13,
    export_params=True,
    do_constant_folding=True
)


  scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))


In [33]:
!pip install onnx

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting onnx
  Downloading onnx-1.18.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Downloading onnx-1.18.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m[31m2.4 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.18.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
