### Step 1: Prepare Dataset + DataLoader

In [12]:
import torch
from torch.utils.data import Dataset

class Twitter15Dataset(Dataset):
    """
    Thin map-style dataset over a list of Twitter15 graph samples.

    Every sample is expected to be a mapping that provides at least:
        - 'x': node feature matrix (documented as a Tensor of shape [seq_len, feature_dim])
        - 'y': class label (documented as an int in [0, 3])
    Any other key (e.g. 'edge_index') is ignored by this dataset.
    """

    def __init__(self, graph_data_list):
        """
        Args:
            graph_data_list (list): The graph samples; stored by reference, not copied.
        """
        self.graphs = graph_data_list

    def __len__(self):
        """Number of graph samples held by the dataset."""
        return len(self.graphs)

    def __getitem__(self, idx):
        """
        Fetch one sample.

        Args:
            idx: Index into the underlying sample list.

        Returns:
            tuple: (features, label) taken from the sample's 'x' and 'y' entries.
        """
        sample = self.graphs[idx]
        return sample['x'], sample['y']


def collate_fn(batch):
    """
    Collate function to handle batches of variable-length sequences.
    Pads each sequence in the batch (with zeros, at the end) to the length of
    the longest sequence and builds the matching validity mask.

    Args:
        batch (list of tuples): Each tuple is (x, y), where
                                - x: Tensor of shape (seq_len, feature_dim)
                                - y: label (int)

    Returns:
        padded_xs (Tensor): Padded input features (batch_size, max_seq_len, feature_dim)
        masks (Tensor): Boolean masks, True at valid (non-padded) positions (batch_size, max_seq_len)
        ys (Tensor): int64 tensor of labels (batch_size,)

    Raises:
        ValueError: If the batch is empty.
    """
    if len(batch) == 0:
        # zip(*[]) would otherwise fail with an opaque "not enough values to unpack".
        raise ValueError("collate_fn received an empty batch")

    xs, ys = zip(*batch)

    # Per-sample lengths drive both the padding and the mask.
    lengths = [x.shape[0] for x in xs]
    max_len = max(lengths)
    feature_dim = xs[0].shape[1]

    padded = []
    for x, seq_len in zip(xs, lengths):
        pad_len = max_len - seq_len
        if pad_len > 0:
            # new_zeros inherits dtype AND device from x, so the concat cannot
            # fail with a device mismatch when samples already live on an accelerator.
            x = torch.cat([x, x.new_zeros((pad_len, feature_dim))], dim=0)
        padded.append(x)

    padded_xs = torch.stack(padded)  # Shape: (batch_size, max_len, feature_dim)

    # Position p of sample i is valid iff p < lengths[i]; built directly as bool.
    positions = torch.arange(max_len, device=padded_xs.device).unsqueeze(0)
    lengths_t = torch.tensor(lengths, device=padded_xs.device).unsqueeze(1)
    masks = positions < lengths_t    # Shape: (batch_size, max_len)

    # Class indices must be int64 for gather / cross-entropy style losses.
    ys = torch.tensor(ys, dtype=torch.long)  # Shape: (batch_size,)

    return padded_xs, masks, ys


In [13]:
from sklearn.model_selection import train_test_split
import torch

# Load the cleaned graph data (a list of graph samples consumed by Twitter15Dataset).
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python objects,
# which can execute code -- only load this file if it comes from a trusted source.
graph_data_list = torch.load("../processed/twitter15_graph_data_clean.pt", weights_only=False)

# Split into Train / Validation / Test sets in a 7:1.5:1.5 ratio:
# first hold out 30% of the samples, then cut that hold-out in half.
# Both splits are seeded (random_state=42) so they are reproducible.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between splits.
train_graphs, temp_graphs = train_test_split(graph_data_list, test_size=0.3, random_state=42)
val_graphs, test_graphs = train_test_split(temp_graphs, test_size=0.5, random_state=42)

# Report the number of samples in each split as a sanity check
print(f"Train: {len(train_graphs)}, Val: {len(val_graphs)}, Test: {len(test_graphs)}")


Train: 1043, Val: 223, Test: 224


In [14]:
from torch.utils.data import DataLoader

# Number of graphs per mini-batch (shared by all three loaders)
batch_size = 16

# Wrap each split in a Dataset that yields (features, label) pairs
train_dataset = Twitter15Dataset(train_graphs)
val_dataset = Twitter15Dataset(val_graphs)
test_dataset = Twitter15Dataset(test_graphs)

# Build DataLoaders for each dataset.
# collate_fn pads the variable-length sequences of a batch to a common length and
# returns (padded_xs, masks, ys). Only the training loader shuffles; validation and
# test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print("DataLoaders created successfully!")


DataLoaders created successfully!


### Step 2: Transformer

In [18]:
import torch.nn as nn

class TransformerClassifier(nn.Module):
    def __init__(self, feature_dim, hidden_dim, num_classes, num_heads=8, num_layers=2, dropout=0.1):
        """
        Initializes the Transformer-based classifier model.

        Args:
            feature_dim (int): The dimension of input features.
            hidden_dim (int): The dimension of the feedforward network within the transformer encoder.
            num_classes (int): The number of output classes for classification.
            num_heads (int, optional): The number of attention heads in the transformer encoder. Default is 8.
            num_layers (int, optional): The number of layers in the transformer encoder. Default is 2.
            dropout (float, optional): The dropout rate for regularization. Default is 0.1.
        """
        super(TransformerClassifier, self).__init__()
        
        # Projection layer to map input features to a fixed size (768)
        self.input_projection = nn.Linear(feature_dim, 768)  # Maps input feature dimension to 768
        
        # Create a transformer encoder layer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=768,                 # Input dimension (should match the transformer model size)
            nhead=num_heads,             # Number of attention heads
            dim_feedforward=hidden_dim,  # Hidden layer size for the feedforward network
            dropout=dropout,             # Dropout rate for regularization
            batch_first=True             # The input tensor format will be (batch_size, seq_len, feature_dim)
        )
        
        # Transformer encoder which consists of multiple encoder layers
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers  # Number of encoder layers
        )
        
        # Fully connected layer for classification
        self.fc = nn.Linear(768, num_classes)  # Output layer mapping from transformer output to class scores
        self.dropout = nn.Dropout(dropout)     # Dropout layer for regularization

    def forward(self, x, mask):
        """
        Forward pass through the model.

        Args:
            x (tensor): Input feature tensor of shape (batch_size, seq_len, feature_dim).
            mask (tensor): Mask tensor of shape (batch_size, seq_len), where 1 indicates valid positions and 0 indicates padding.

        Returns:
            tuple: 
                - logits (tensor): The class scores, shape (batch_size, num_classes).
                - transformer_out (tensor): The output from the transformer encoder, shape (batch_size, seq_len, 768).
        """
        src_key_padding_mask = ~mask  # Create padding mask, inverted to match the expected format

        # Project the input features to the transformer input size (768)
        x = self.input_projection(x)  # Adds the projection layer to adjust feature dimensions

        # Pass the input through the transformer encoder
        transformer_out = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)
        
        # Pooling the transformer output by averaging over the sequence length
        pooled_output = transformer_out.mean(dim=1)  # Shape: (batch_size, 768)

        # Apply dropout for regularization
        output = self.dropout(pooled_output)
        
        # Final classification layer
        logits = self.fc(output)  # Shape: (batch_size, num_classes)

        return logits, transformer_out


### Step 3: Trainer

In [21]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score

def train_one_epoch(model, train_loader, optimizer, loss_fn, device):
    """
    Run a single optimisation pass over the training data.

    Args:
        model (nn.Module): Model being optimised; called as model(x, mask) and expected
            to return a (logits, extra) pair.
        train_loader (DataLoader): Yields (x, mask, y) batches.
        optimizer (Optimizer): Optimizer that updates the model parameters.
        loss_fn (callable): Maps (logits, y) to a scalar loss tensor.
        device (torch.device): Device the batches are moved to before the forward pass.

    Returns:
        tuple: (epoch_loss, epoch_acc, epoch_f1) where
            - epoch_loss (float): Sample-weighted mean loss over the dataset.
            - epoch_acc (float): Accuracy over all samples seen in the epoch.
            - epoch_f1 (float): Macro-averaged F1 over all samples seen in the epoch.
    """
    model.train()  # enable dropout and other train-time behaviour

    loss_sum = 0.0
    predictions, targets = [], []

    for features, valid_mask, labels in train_loader:
        features, valid_mask, labels = features.to(device), valid_mask.to(device), labels.to(device)

        # NaNs in the inputs would poison the whole batch; zero them out.
        features = torch.nan_to_num(features, nan=0.0)

        # Forward pass and loss
        logits, _ = model(features, valid_mask)
        batch_loss = loss_fn(logits, labels)

        # Backward pass; gradient norm is capped at 1.0 before the update
        optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the epoch mean
        loss_sum += batch_loss.item() * features.size(0)

        predictions += logits.argmax(dim=-1).detach().cpu().tolist()
        targets += labels.cpu().tolist()

    mean_loss = loss_sum / len(train_loader.dataset)
    accuracy = accuracy_score(targets, predictions)
    macro_f1 = f1_score(targets, predictions, average='macro')

    return mean_loss, accuracy, macro_f1

def evaluate_one_epoch(model, val_loader, loss_fn, device):
    """
    Evaluate the model for one epoch (no gradient tracking, no parameter updates).

    Args:
        model (nn.Module): The model to evaluate; called as model(x, mask) and expected
            to return a (logits, extra) pair.
        val_loader (DataLoader): DataLoader yielding (x, mask, y) batches.
        loss_fn (function): Loss function mapping (logits, y) to a scalar loss tensor.
        device (torch.device): Device to move the data to (e.g., 'cuda' or 'cpu').

    Returns:
        tuple: A tuple containing:
            - epoch_loss (float): The sample-weighted average loss for the epoch,
              computed on logits clamped to [-10, 10].
            - epoch_acc (float): The accuracy for the epoch.
            - epoch_f1 (float): The F1 score for the epoch (macro average).
    """
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for x, mask, y in val_loader:
            x = x.to(device)  # Move data to the specified device
            mask = mask.to(device)
            y = y.to(device)

            # Replace NaN values in x with 0
            x = torch.nan_to_num(x, nan=0.0)

            # Forward pass
            logits, _ = model(x, mask)

            # Predict from the RAW logits. Clamping is monotonic, so it never changes
            # the winner except when two or more logits fall outside the clamp range
            # on the same side -- then they tie after clamping and argmax would pick
            # the lowest index rather than the true maximum.
            preds = logits.argmax(dim=-1)

            # The clamp is kept for the loss only, to bound it against extreme logits.
            clamped_logits = torch.clamp(logits, min=-10, max=10)
            loss = loss_fn(clamped_logits, y)

            running_loss += loss.item() * x.size(0)  # Accumulate sample-weighted loss

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(y.cpu().tolist())

    # Calculate metrics
    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_acc = accuracy_score(all_labels, all_preds)  # Accuracy calculation
    epoch_f1 = f1_score(all_labels, all_preds, average='macro')  # F1 score calculation (macro average)

    return epoch_loss, epoch_acc, epoch_f1


class EarlyStopping:
    """
    Monitors a validation score (higher is better) and flags when training should
    stop because the score has not improved for `patience` consecutive calls.
    The model is checkpointed every time the score improves.

    Args:
        patience (int): The number of non-improving calls to tolerate before stopping.
        verbose (bool): Whether to print the counter each time there is no improvement.
        delta (float): Minimum increase over the best score to qualify as an improvement.
            A score equal to `best_score + delta` counts as an improvement, so with the
            default delta=0.0 a score that merely ties the best resets the counter and
            overwrites the checkpoint.

    Attributes:
        counter (int): Consecutive calls without improvement.
        best_score (float | None): Best score seen so far (None before the first call).
        best_f1 (float): Score associated with the most recently saved checkpoint.
        early_stop (bool): Set to True once patience is exhausted.
    """
    def __init__(self, patience=10, verbose=False, delta=0.0):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0  # Counter to track the number of epochs without improvement
        self.best_score = None  # Best validation score observed so far
        self.early_stop = False  # Flag to indicate early stopping
        self.best_f1 = -float('inf')  # Best F1 score observed so far

    def __call__(self, val_f1, model, save_path):
        """
        Checks if early stopping should be triggered based on validation F1 score.

        Args:
            val_f1 (float): The current validation F1 score.
            model (nn.Module): The model being trained.
            save_path (str): The path to save the model checkpoint.
        """
        score = val_f1

        if self.best_score is None:
            # First call: whatever we have is the best so far.
            self.best_score = score
            self.save_checkpoint(val_f1, model, save_path)
        elif score < self.best_score + self.delta:
            # No (sufficient) improvement.
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Improved: record it, checkpoint, and start counting from zero again.
            self.best_score = score
            self.save_checkpoint(val_f1, model, save_path)
            self.counter = 0

    def save_checkpoint(self, val_f1, model, save_path):
        """
        Saves the model's state_dict to `save_path` and records the associated score.

        The parent directory of `save_path` is created if it does not exist, so the
        first checkpoint does not fail after a full epoch of training just because
        the output folder is missing.

        Args:
            val_f1 (float): The current validation F1 score.
            model (nn.Module): The model being trained.
            save_path (str): The path to save the model checkpoint.
        """
        import os  # local import: the defining cell does not import os itself

        # Only path-like targets have a directory to create; file-like objects are
        # passed straight through to torch.save.
        if isinstance(save_path, (str, os.PathLike)):
            checkpoint_dir = os.path.dirname(save_path)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)

        torch.save(model.state_dict(), save_path)  # Save the model state dict
        self.best_f1 = val_f1  # Update the best F1 score


### Step 4: Runner

In [22]:
import os
import torch
import torch.nn.functional as F

# Check for CUDA availability and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the path where the best model will be saved, and make sure its folder
# exists so the first checkpoint write cannot fail on a missing directory.
save_path = os.path.abspath("../checkpoints/best_transformer_model.pt")
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Hyperparameters
# hidden_dim was previously declared as 128 but never used: the model below was built
# with a hard-coded 256. The declared value now matches what was actually trained and
# is passed through, so changing it here really changes the model.
hidden_dim = 256  # Hidden dimension for the transformer feedforward layer
num_classes = 4  # Number of output classes
learning_rate = 5e-5  # Learning rate for the AdamW optimizer
weight_decay = 1e-2  # Weight decay (decoupled, as implemented by AdamW)
max_epochs = 1000  # Maximum number of epochs
patience = 10  # Patience for early stopping

# TODO(review): no torch seed is set, so weight initialisation, dropout and batch
# shuffling differ between runs; call torch.manual_seed(...) here for reproducibility.

# Model initialization
feature_dim = 833  # Input feature dimension (must equal the last dimension of each sample's 'x')
model = TransformerClassifier(
    feature_dim=feature_dim,
    hidden_dim=hidden_dim,  # Increase to make the transformer feedforward layers larger
    num_classes=num_classes,
    num_heads=4,        # Number of attention heads
    num_layers=2,       # Number of transformer layers
    dropout=0.1         # Dropout rate
).to(device)

# Optimizer: AdamW with weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Loss function: cross-entropy with label smoothing
def smooth_cross_entropy(preds, targets, smoothing=0.1):
    """
    Calculate the mean cross-entropy loss with label smoothing.

    For each sample the loss is
        (1 - smoothing) * NLL(target) + smoothing * mean_over_classes(-log p),
    which is what `F.cross_entropy(..., label_smoothing=smoothing)` computes, so the
    built-in is used instead of a hand-rolled gather/mean implementation.

    Args:
        preds (tensor): Predicted logits (batch_size, num_classes)
        targets (tensor): Ground truth class indices (batch_size,), dtype int64
        smoothing (float): Smoothing factor for labels, in [0, 1]

    Returns:
        tensor: The mean smooth cross-entropy loss (scalar)
    """
    return F.cross_entropy(preds, targets, label_smoothing=smoothing)

# Use the label-smoothed cross-entropy for both training and validation
loss_fn = smooth_cross_entropy

# Early stopping monitors the validation F1 score passed to it each epoch and
# checkpoints the model to save_path whenever that score improves.
early_stopper = EarlyStopping(patience=patience, verbose=True)

# Training loop: at most max_epochs epochs, cut short by early stopping
for epoch in range(1, max_epochs + 1):
    # Train for one epoch
    train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader, optimizer, loss_fn, device)

    # Evaluate on validation set
    val_loss, val_acc, val_f1 = evaluate_one_epoch(model, val_loader, loss_fn, device)

    # Print training and validation metrics for the current epoch
    print(f"Epoch {epoch}:")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}")
    print(f"  Val   Loss: {val_loss:.4f} | Val   Acc: {val_acc:.4f} | Val   F1: {val_f1:.4f}")

    # Update the early-stopping state (and checkpoint if val_f1 improved)
    early_stopper(val_f1, model, save_path)

    # Stop once validation F1 has failed to improve for `patience` consecutive epochs
    if early_stopper.early_stop:
        print("Early stopping triggered!")
        break

# End of training. Note that `model` holds the weights of the LAST epoch here;
# the best-scoring weights are the ones stored at save_path.
print("Training completed.")


  output = torch._nested_tensor_from_mask(


Epoch 1:
  Train Loss: 1.3873 | Train Acc: 0.3557 | Train F1: 0.3545
  Val   Loss: 1.3492 | Val   Acc: 0.2960 | Val   F1: 0.1640
Epoch 2:
  Train Loss: 1.1785 | Train Acc: 0.5014 | Train F1: 0.4958
  Val   Loss: 1.3182 | Val   Acc: 0.3004 | Val   F1: 0.1620
EarlyStopping counter: 1 out of 10
Epoch 3:
  Train Loss: 1.0932 | Train Acc: 0.5781 | Train F1: 0.5721
  Val   Loss: 1.2857 | Val   Acc: 0.4933 | Val   F1: 0.4708
Epoch 4:
  Train Loss: 0.9971 | Train Acc: 0.6520 | Train F1: 0.6484
  Val   Loss: 1.2808 | Val   Acc: 0.4170 | Val   F1: 0.3521
EarlyStopping counter: 1 out of 10
Epoch 5:
  Train Loss: 0.9067 | Train Acc: 0.6855 | Train F1: 0.6833
  Val   Loss: 1.3070 | Val   Acc: 0.3363 | Val   F1: 0.2311
EarlyStopping counter: 2 out of 10
Epoch 6:
  Train Loss: 0.8266 | Train Acc: 0.7555 | Train F1: 0.7541
  Val   Loss: 1.2557 | Val   Acc: 0.4126 | Val   F1: 0.3503
EarlyStopping counter: 3 out of 10
Epoch 7:
  Train Loss: 0.7364 | Train Acc: 0.8102 | Train F1: 0.8087
  Val   Loss: 1.2

In [24]:
# Load the best checkpoint saved during training.
# map_location=device lets a checkpoint written on a GPU be restored on a CPU-only
# machine (and vice versa); weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state_dict contains.
model.load_state_dict(torch.load(save_path, map_location=device, weights_only=True))
model.to(device)  # Move model to the appropriate device (CPU or GPU)
model.eval()  # Set model to evaluation mode (disables dropout, etc.)

# Define the testing function
def test_model(model, test_loader, loss_fn, device):
    """
    Evaluate the model on the test set.

    The test pass is the same procedure as a validation pass (eval mode, no gradients,
    NaN inputs zeroed, loss on clamped logits, accuracy and macro F1), so this simply
    delegates to `evaluate_one_epoch` instead of duplicating its body.

    Args:
        model (nn.Module): The model to evaluate.
        test_loader (DataLoader): DataLoader yielding (x, mask, y) test batches.
        loss_fn (function): Loss function mapping (logits, y) to a scalar loss tensor.
        device (torch.device): Device to move the data to.

    Returns:
        tuple: (test_loss, test_acc, test_f1) -- sample-weighted mean loss, accuracy,
        and macro-averaged F1 over the test set.
    """
    return evaluate_one_epoch(model, test_loader, loss_fn, device)

# Evaluate the restored best checkpoint once on the held-out test split
test_loss, test_acc, test_f1 = test_model(model, test_loader, loss_fn, device)

# Print the final test results (loss, accuracy, macro F1), each to 4 decimal places
print("=== Final Test Results ===")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")


=== Final Test Results ===
Test Loss: 1.2331
Test Accuracy: 0.5223
Test F1 Score: 0.5218
